In [92]:
from collections import Counter
import numpy as np
import pandas as pd
import re
import glob
import gzip
import json
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

In [93]:
def load_data(datafile):
    """
    Read your data into a single pandas dataframe where
    - each row is an instance to be classified
    (this could be a tweet, user, or news article, depending on your project)
    - there is a column called `label` which stores the class label (e.g., the true
      category for this row)
    """
    bots = []
    humans = []
    folder = ['/bots', '/humans']
    name = '/*.json.gz'
    for f in folder:
        paths = glob.glob(datafile + f + name)
        for p in paths:
            with gzip.open(p, 'r') as file:
                for line in file:
                    if f == folder[0]:
                        bots.append(json.loads(line))
                    elif f == folder[1]:
                        humans.append(json.loads(line))
    df_bots = pd.DataFrame(bots)[['screen_name','tweets','listed_count']]
    df_bots['label'] = 'bot'
    df_humans = pd.DataFrame(humans)[['screen_name','tweets','listed_count']]
    df_humans['label'] = 'human'
    frames = [df_bots, df_humans]
    df = pd.concat(frames)
    users = bots + humans
    tweets = [u['tweets'] for u in users]
    text = [d['full_text'] for t in tweets for d in t] 

#     tweets_avg_len = []
    tweets_avg_mentions = []
    tweets_avg_urls = []
    factor = 100
    for u in users:
        tweets = u['tweets'] # a list of dicts
        texts = [t['full_text'] for t in tweets]
#         avg_len = sum(map(len, texts))/len(texts)
#         tweets_avg_len.append(int(avg_len))
        count_mention = 0
        count_url = 0
        for s in texts:
            if 'http' in s:
                count_url+=1
            if '@' in s:
                count_mention+=1
        tweets_avg_urls.append(100 * count_url / len(texts))
        tweets_avg_mentions.append(100 * count_mention / len(texts))
#     df['tweets_avg_len'] = tweets_avg_len
    df['tweets_avg_urls'] = tweets_avg_urls
    df['tweets_avg_mentions'] = tweets_avg_mentions
    return df
# df = load_data('~/Dropbox/elevate/harassment/training_data/data.csv.gz')
df = load_data('/Users/lcj/small')
df

Unnamed: 0,screen_name,tweets,listed_count,label,tweets_avg_urls,tweets_avg_mentions
0,carlos_eggbot,[{'created_at': 'Sat Jun 01 18:36:07 +0000 201...,0,bot,10.500000,0.000000
1,ecolo_ebooks,[{'created_at': 'Sat Jun 01 18:36:11 +0000 201...,2,bot,0.000000,0.000000
2,AllStarSMBot,[{'created_at': 'Sat Jun 01 18:36:28 +0000 201...,3,bot,0.000000,0.000000
3,saionji_en,[{'created_at': 'Sat Jun 01 18:36:52 +0000 201...,3,bot,0.000000,0.000000
4,KimClune,[{'created_at': 'Sat Jun 01 18:37:20 +0000 201...,329,bot,28.500000,2.500000
5,CatsDogsBOT,[{'created_at': 'Sat Jun 01 18:38:10 +0000 201...,3,bot,100.000000,0.000000
6,bluejovanka,[{'created_at': 'Sat Jun 01 18:38:14 +0000 201...,47,bot,37.688442,32.160804
7,anittavota4,[{'created_at': 'Sat Jun 01 18:39:19 +0000 201...,0,bot,0.000000,0.000000
8,justtraveluk,[{'created_at': 'Sat Jun 01 18:39:21 +0000 201...,11,bot,100.000000,0.000000
9,rhaudiencebot,[{'created_at': 'Sat Jun 01 18:40:08 +0000 201...,0,bot,0.000000,0.000000


In [94]:
# what is the distribution over class labels?
df.label.value_counts()
df.dtypes

screen_name             object
tweets                  object
listed_count             int64
label                   object
tweets_avg_urls        float64
tweets_avg_mentions    float64
dtype: object

In [95]:
def make_features(df):
    vec = DictVectorizer()
    feature_dicts = []
    labels_to_track = ['tweets_avg_urls', 'tweets_avg_mentions','listed_count']
    for i, row in df.iterrows():
        features = {}
        features['tweets_avg_urls'] = row['tweets_avg_urls']
        features['tweets_avg_mentions'] = row['tweets_avg_mentions']
        features['listed_count'] = row['listed_count']
        feature_dicts.append(features)
    X = vec.fit_transform(feature_dicts)
#     print(X)
    return X, vec

X, vec = make_features(df)

In [96]:
# what are dimensions of the feature matrix?
X.shape

(200, 3)

In [97]:
# what are the feature names?
# vocabulary_ is a dict from feature name to column index
vec.vocabulary_

{'tweets_avg_urls': 2, 'tweets_avg_mentions': 1, 'listed_count': 0}

In [98]:
# how often does each word occur?
for word, idx in vec.vocabulary_.items():
    print('%20s\t%d' % (word, X[:,idx].sum()))

     tweets_avg_urls	9149
 tweets_avg_mentions	7028
        listed_count	147695


In [99]:
# can also get a simple list of feature names:
vec.get_feature_names()

# e.g., first column is 'hate', second is 'love', etc.

['listed_count', 'tweets_avg_mentions', 'tweets_avg_urls']

In [100]:
# we'll first store the classes separately in a numpy array
y = np.array(df.label)
Counter(y)

Counter({'bot': 100, 'human': 100})

In [101]:
# to find the row indices with hostile label
np.where(y=='bot')[0]
# np.where(y=='human')[0]

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99])

In [102]:
# store the class names
class_names = set(df.label)

In [103]:
# how often does each word appear in each class?
for word, idx in vec.vocabulary_.items():
    for class_name in class_names:
        class_idx = np.where(y==class_name)[0]
        print('%20s\t%20s\t%d' % (word, class_name, X[class_idx, idx].sum()))

     tweets_avg_urls	               human	5491
     tweets_avg_urls	                 bot	3657
 tweets_avg_mentions	               human	6123
 tweets_avg_mentions	                 bot	905
        listed_count	               human	145344
        listed_count	                 bot	2351


In [119]:
a= [10,50,100,200]
b = [ 0.001, 0.0001, 0.00001]
for hidden_layer_sizes in a:
    for alpha in b:
        clf = MLPClassifier(hidden_layer_sizes = (50,) , activation='relu', solver='adam', alpha=0.0001)
        clf.fit(X, y)
        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        accuracies = []
        for train, test in kf.split(X):
            clf.fit(X[train], y[train])
            pred = clf.predict(X[test])
            accuracies.append(accuracy_score(y[test], pred))
            clf.coefs_
            print('accuracy over all cross-validation folds: %s' % str(accuracies))
            print('mean=%.2f std=%.2f' % (np.mean(accuracies), np.std(accuracies)))



accuracy over all cross-validation folds: [0.925]
mean=0.93 std=0.00




accuracy over all cross-validation folds: [0.925, 0.95]
mean=0.94 std=0.01




accuracy over all cross-validation folds: [0.925, 0.95, 0.825]
mean=0.90 std=0.05
accuracy over all cross-validation folds: [0.925, 0.95, 0.825, 0.775]
mean=0.87 std=0.07




accuracy over all cross-validation folds: [0.925, 0.95, 0.825, 0.775, 0.9]
mean=0.88 std=0.07




accuracy over all cross-validation folds: [0.85]
mean=0.85 std=0.00
accuracy over all cross-validation folds: [0.85, 0.9]
mean=0.88 std=0.03
accuracy over all cross-validation folds: [0.85, 0.9, 0.8]
mean=0.85 std=0.04
accuracy over all cross-validation folds: [0.85, 0.9, 0.8, 0.675]
mean=0.81 std=0.08




accuracy over all cross-validation folds: [0.85, 0.9, 0.8, 0.675, 0.95]
mean=0.83 std=0.09




accuracy over all cross-validation folds: [0.975]
mean=0.97 std=0.00
accuracy over all cross-validation folds: [0.975, 0.9]
mean=0.94 std=0.04




accuracy over all cross-validation folds: [0.975, 0.9, 0.85]
mean=0.91 std=0.05




accuracy over all cross-validation folds: [0.975, 0.9, 0.85, 1.0]
mean=0.93 std=0.06
accuracy over all cross-validation folds: [0.975, 0.9, 0.85, 1.0, 0.775]
mean=0.90 std=0.08




accuracy over all cross-validation folds: [0.975]
mean=0.97 std=0.00




accuracy over all cross-validation folds: [0.975, 0.95]
mean=0.96 std=0.01




accuracy over all cross-validation folds: [0.975, 0.95, 0.825]
mean=0.92 std=0.07
accuracy over all cross-validation folds: [0.975, 0.95, 0.825, 0.575]
mean=0.83 std=0.16




accuracy over all cross-validation folds: [0.975, 0.95, 0.825, 0.575, 0.95]
mean=0.86 std=0.15




accuracy over all cross-validation folds: [0.975]
mean=0.97 std=0.00
accuracy over all cross-validation folds: [0.975, 0.7]
mean=0.84 std=0.14




accuracy over all cross-validation folds: [0.975, 0.7, 0.775]
mean=0.82 std=0.12
accuracy over all cross-validation folds: [0.975, 0.7, 0.775, 0.775]
mean=0.81 std=0.10




accuracy over all cross-validation folds: [0.975, 0.7, 0.775, 0.775, 0.95]
mean=0.83 std=0.11




accuracy over all cross-validation folds: [0.975]
mean=0.97 std=0.00
accuracy over all cross-validation folds: [0.975, 0.925]
mean=0.95 std=0.02
accuracy over all cross-validation folds: [0.975, 0.925, 0.825]
mean=0.91 std=0.06




accuracy over all cross-validation folds: [0.975, 0.925, 0.825, 1.0]
mean=0.93 std=0.07




accuracy over all cross-validation folds: [0.975, 0.925, 0.825, 1.0, 0.925]
mean=0.93 std=0.06
accuracy over all cross-validation folds: [0.95]
mean=0.95 std=0.00




accuracy over all cross-validation folds: [0.95, 0.95]
mean=0.95 std=0.00




accuracy over all cross-validation folds: [0.95, 0.95, 0.85]
mean=0.92 std=0.05




accuracy over all cross-validation folds: [0.95, 0.95, 0.85, 1.0]
mean=0.94 std=0.05
accuracy over all cross-validation folds: [0.95, 0.95, 0.85, 1.0, 0.725]
mean=0.89 std=0.10
accuracy over all cross-validation folds: [0.875]
mean=0.88 std=0.00




accuracy over all cross-validation folds: [0.875, 0.95]
mean=0.91 std=0.04
accuracy over all cross-validation folds: [0.875, 0.95, 0.825]
mean=0.88 std=0.05
accuracy over all cross-validation folds: [0.875, 0.95, 0.825, 0.625]
mean=0.82 std=0.12




accuracy over all cross-validation folds: [0.875, 0.95, 0.825, 0.625, 0.9]
mean=0.83 std=0.11




accuracy over all cross-validation folds: [0.975]
mean=0.97 std=0.00
accuracy over all cross-validation folds: [0.975, 0.9]
mean=0.94 std=0.04




accuracy over all cross-validation folds: [0.975, 0.9, 0.775]
mean=0.88 std=0.08
accuracy over all cross-validation folds: [0.975, 0.9, 0.775, 0.975]
mean=0.91 std=0.08




accuracy over all cross-validation folds: [0.975, 0.9, 0.775, 0.975, 0.925]
mean=0.91 std=0.07




accuracy over all cross-validation folds: [0.925]
mean=0.93 std=0.00
accuracy over all cross-validation folds: [0.925, 0.925]
mean=0.93 std=0.00




accuracy over all cross-validation folds: [0.925, 0.925, 0.85]
mean=0.90 std=0.04
accuracy over all cross-validation folds: [0.925, 0.925, 0.85, 1.0]
mean=0.93 std=0.05
accuracy over all cross-validation folds: [0.925, 0.925, 0.85, 1.0, 0.7]
mean=0.88 std=0.10




accuracy over all cross-validation folds: [0.975]
mean=0.97 std=0.00




accuracy over all cross-validation folds: [0.975, 0.9]
mean=0.94 std=0.04
accuracy over all cross-validation folds: [0.975, 0.9, 0.575]
mean=0.82 std=0.17
accuracy over all cross-validation folds: [0.975, 0.9, 0.575, 1.0]
mean=0.86 std=0.17




accuracy over all cross-validation folds: [0.975, 0.9, 0.575, 1.0, 0.95]
mean=0.88 std=0.16
accuracy over all cross-validation folds: [0.675]
mean=0.68 std=0.00
accuracy over all cross-validation folds: [0.675, 0.85]
mean=0.76 std=0.09
accuracy over all cross-validation folds: [0.675, 0.85, 0.625]
mean=0.72 std=0.10
accuracy over all cross-validation folds: [0.675, 0.85, 0.625, 0.95]
mean=0.77 std=0.13
accuracy over all cross-validation folds: [0.675, 0.85, 0.625, 0.95, 0.8]
mean=0.78 std=0.12




In [120]:
c= [1, 3, 5]
d = [100, 200, 300]
for min_samples_leaf in c:
    for n_estimators in d:
        rand = RandomForestClassifier(n_estimators=n_estimators, min_samples_leaf= min_samples_leaf)
        rand.fit(X, y)
        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        accuracies = []
        for train, test in kf.split(X):
            rand.fit(X[train], y[train])
            pred = rand.predict(X[test])
            accuracies.append(accuracy_score(y[test], pred))
            print(rand.feature_importances_)
            print('accuracy over all cross-validation folds: %s' % str(accuracies))
            print('mean=%.2f std=%.2f' % (np.mean(accuracies), np.std(accuracies)))

[0.47117383 0.31309028 0.2157359 ]
accuracy over all cross-validation folds: [0.95]
mean=0.95 std=0.00
[0.46093527 0.34502816 0.19403657]
accuracy over all cross-validation folds: [0.95, 0.925]
mean=0.94 std=0.01
[0.45467624 0.34110692 0.20421684]
accuracy over all cross-validation folds: [0.95, 0.925, 0.825]
mean=0.90 std=0.05
[0.4478776  0.33363814 0.21848426]
accuracy over all cross-validation folds: [0.95, 0.925, 0.825, 0.975]
mean=0.92 std=0.06
[0.40953821 0.33740839 0.2530534 ]
accuracy over all cross-validation folds: [0.95, 0.925, 0.825, 0.975, 0.925]
mean=0.92 std=0.05
[0.42626602 0.32485661 0.24887737]
accuracy over all cross-validation folds: [0.95]
mean=0.95 std=0.00
[0.40642281 0.36330754 0.23026966]
accuracy over all cross-validation folds: [0.95, 0.9]
mean=0.93 std=0.02
[0.46532309 0.31380101 0.2208759 ]
accuracy over all cross-validation folds: [0.95, 0.9, 0.85]
mean=0.90 std=0.04
[0.40455096 0.35178289 0.24366615]
accuracy over all cross-validation folds: [0.95, 0.9, 0