In [1]:
from collections import Counter
import numpy as np
import pandas as pd
import re
import glob
import gzip
import json
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import csr_matrix, hstack # "horizontal stack"

In [2]:
def load_data(directory):
    bots = []
    humans = []
    folder = ['/bots', '/humans']
    name = '/*.json.gz'
    for f in folder:
        paths = glob.glob(directory + f + name)
        for p in paths:
            with gzip.open(p, 'r') as file:
                for line in file:
                    if f == folder[0]:
                        js = json.loads(line)
                        if 'tweets' in js:
                            bots.append(js)
                    elif f == folder[1]:
                        js = json.loads(line)
                        if 'tweets' in js:
                            humans.append(js)
    df_bots = pd.DataFrame(bots)[['screen_name', 'tweets', 'listed_count']]
    df_bots['label'] = 'bot'
    df_humans = pd.DataFrame(humans)[['screen_name', 'tweets', 'listed_count']]
    df_humans['label'] = 'human'
    frames = [df_bots, df_humans]
    df = pd.concat(frames)
    users = bots + humans
    # tweets_avg_mentions = []
    # tweets_avg_urls = []
    # factor = 100
    tweets_texts = []
    for u in users:
        tweets = u['tweets']  # a list of dicts
        texts = [t['full_text'] for t in tweets]
        tweets_texts.append(str(texts).strip('[]'))
    df['tweets_texts'] = tweets_texts
    return df
df = load_data('/Users/sheepman/Downloads/bots/small')
df

Unnamed: 0,screen_name,tweets,listed_count,label,tweets_texts
0,carlos_eggbot,[{'created_at': 'Sat Jun 01 18:36:07 +0000 201...,0,bot,"'You heard me! Shoot me.', 'Junpei, you...', '..."
1,ecolo_ebooks,[{'created_at': 'Sat Jun 01 18:36:11 +0000 201...,2,bot,"""i'm not straight but 20 bucks is 20 bu"", '""ec..."
2,AllStarSMBot,[{'created_at': 'Sat Jun 01 18:36:28 +0000 201...,3,bot,"""You'll never know if you don't go\nYou'll nev..."
3,saionji_en,[{'created_at': 'Sat Jun 01 18:36:52 +0000 201...,3,bot,"""why the fuck am i banana girl? i'll never die..."
4,KimClune,[{'created_at': 'Sat Jun 01 18:37:20 +0000 201...,329,bot,'Chewing rather than drinking breakfast is AWE...
5,CatsDogsBOT,[{'created_at': 'Sat Jun 01 18:38:10 +0000 201...,3,bot,"'[Discussion] If I say no, that should be it. ..."
6,bluejovanka,[{'created_at': 'Sat Jun 01 18:38:14 +0000 201...,47,bot,"""I'm staying in tonight watching someone with ..."
7,anittavota4,[{'created_at': 'Sat Jun 01 18:39:19 +0000 201...,0,bot,'RT BrettHillOwens2: #PremiosMTVMIAW #MTVBRMUS...
8,justtraveluk,[{'created_at': 'Sat Jun 01 18:39:21 +0000 201...,11,bot,'The Top 5 Airports in the World for Departure...
9,rhaudiencebot,[{'created_at': 'Sat Jun 01 18:40:08 +0000 201...,0,bot,"'GET BUTCH, BITCH!', 'HEY RIFF, WHAT DO YOU DO..."


In [3]:
# what is the distribution over class labels?
df.label.value_counts()
df.dtypes

screen_name     object
tweets          object
listed_count     int64
label           object
tweets_texts    object
dtype: object

In [4]:
def get_tweets_features(texts):
    count_mention = 0
    count_url = 0
    factor = 100
    features = {}
    for s in texts:
        if 'http' in s:
            count_url += 1
        if '@' in s:
            count_mention += 1
    if len(texts) == 0:
        features['tweets_avg_urls'] = 0
        features['tweets_avg_mentions'] = 0
    else:
        features['tweets_avg_urls'] = factor * count_url / len(texts)
        features['tweets_avg_mentions'] = factor * count_mention / len(texts)
    return features

In [5]:
def make_features(df):
    ## Add your code to create features.
    vec = DictVectorizer()
    feature_dicts = []
    for i, row in df.iterrows():
        tweets = row['tweets']
        texts = [t['full_text'] for t in tweets]
        features = get_tweets_features(texts)
        feature_dicts.append(features)
    X = vec.fit_transform(feature_dicts)
    return X, vec

In [6]:
X, dict_vec = make_features(df)
print(dict_vec.vocabulary_)

{'tweets_avg_urls': 1, 'tweets_avg_mentions': 0}


In [7]:
count_vec = CountVectorizer(min_df=0.03, max_df=0.8, ngram_range=(3, 3))
X_words = count_vec.fit_transform(df.tweets_texts)
# get_f(count_vec, X_words)
optimal_X_all = hstack([X, X_words]).tocsr()

In [8]:
# top terms?
def print_top_words(X_words, count_vec, n=10):
    features = count_vec.get_feature_names()
    word_counts = X_words.sum(axis=0).A1
    for i in np.argsort(word_counts)[::-1][:n]:
        print('%20s\t%d' % (features[i], word_counts[i]))

print_top_words(X_words, count_vec)

mtvbrclipebanana mtvbrhinobolarebola mtvbrfeatondadiferente	1400
mtvbrmusicalanitta mtvbrclipebanana mtvbrhinobolarebola	1400
mtvlastoriesanitta mtvlaviralanitta mtvlashiprip	1200
          01 2019 at	661
        june 01 2019	661
       thank you for	482
       here https co	277
       more https co	275
         at https co	270
         to https co	256


In [9]:
# we'll first store the classes separately in a numpy array
y = np.array(df.label)
Counter(y)
optimal_X_all.shape

(200, 1041)

In [10]:
# store the class names
class_names = set(df.label)
# how often does each word appear in each class?
for word, idx in dict_vec.vocabulary_.items():
    for class_name in class_names:
        class_idx = np.where(y==class_name)[0]
        print('%20s\t%20s\t%d' % (word, class_name, X[class_idx, idx].sum()))

     tweets_avg_urls	               human	5491
     tweets_avg_urls	                 bot	3657
 tweets_avg_mentions	               human	6123
 tweets_avg_mentions	                 bot	905


In [11]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

In [14]:
a= [10,50,100,200]
b = [ 0.001, 0.0001, 0.00001]
mean_accuracies = np.zeros((4, 3), dtype=np.float)
for row_idx, hidden_layer_sizes in enumerate(a):
    for col_idx, alpha in enumerate(b):
        print("hidden_layer_sizes=%d, alpha=%f"%(hidden_layer_sizes,alpha))
        clf = MLPClassifier(hidden_layer_sizes = hidden_layer_sizes , activation='relu', solver='adam', alpha=alpha, max_iter=500)
        clf.fit(optimal_X_all, y)
        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        accuracies = []
        for train, test in kf.split(optimal_X_all):
            clf.fit(optimal_X_all[train], y[train])
            pred = clf.predict(optimal_X_all[test])
            accuracies.append(accuracy_score(y[test], pred))
            clf.coefs_
        mean_accuracies[row_idx][col_idx] = str(round(np.mean(accuracies), 2))
#         print('accuracy over all cross-validation folds: %s' % str(accuracies))
        print('mean=%.2f std=%.2f\n' % (np.mean(accuracies), np.std(accuracies)))

for i in range(4):
    print("hidden_layer_size=%d\nalpha\t\taccuracy"%a[i])
    row_str = []
    for j in range(3):
        print('%f\t%f'%(b[j],mean_accuracies[i][j]))
    print()

hidden_layer_sizes=10, alpha=0.001000
mean=0.82 std=0.07

hidden_layer_sizes=10, alpha=0.000100
mean=0.81 std=0.03

hidden_layer_sizes=10, alpha=0.000010
mean=0.81 std=0.05

hidden_layer_sizes=50, alpha=0.001000
mean=0.84 std=0.05

hidden_layer_sizes=50, alpha=0.000100
mean=0.80 std=0.05

hidden_layer_sizes=50, alpha=0.000010
mean=0.83 std=0.06

hidden_layer_sizes=100, alpha=0.001000
mean=0.84 std=0.05

hidden_layer_sizes=100, alpha=0.000100
mean=0.86 std=0.05

hidden_layer_sizes=100, alpha=0.000010
mean=0.84 std=0.05

hidden_layer_sizes=200, alpha=0.001000
mean=0.88 std=0.06

hidden_layer_sizes=200, alpha=0.000100
mean=0.85 std=0.07

hidden_layer_sizes=200, alpha=0.000010
mean=0.84 std=0.05

hidden_layer_size=10
alpha		accuracy
0.001000	0.820000
0.000100	0.810000
0.000010	0.800000

hidden_layer_size=50
alpha		accuracy
0.001000	0.840000
0.000100	0.800000
0.000010	0.830000

hidden_layer_size=100
alpha		accuracy
0.001000	0.840000
0.000100	0.860000
0.000010	0.840000

hidden_layer_size=200

In [15]:
from sklearn.ensemble import RandomForestClassifier
c= [1, 3, 5]
d = [100, 200, 300]
mean_accuracies = np.zeros((3, 3), dtype=np.float)
for row_idx, min_samples_leaf in enumerate(c):
    for col_idx, n_estimators in enumerate(d):
        print("min_samples_leaf=%d, n_estimators=%d"%(min_samples_leaf,n_estimators))
        rand = RandomForestClassifier(n_estimators=n_estimators, min_samples_leaf= min_samples_leaf)
        rand.fit(optimal_X_all, y)
        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        accuracies = []
        for train, test in kf.split(optimal_X_all):
            rand.fit(optimal_X_all[train], y[train])
            pred = rand.predict(optimal_X_all[test])
            accuracies.append(accuracy_score(y[test], pred))
#         print(rand.feature_importances_)
        mean_accuracies[row_idx][col_idx] = str(round(np.mean(accuracies), 2))
#         print('accuracy over all cross-validation folds: %s' % str(accuracies))
        print('mean=%.2f std=%.2f\n' % (np.mean(accuracies), np.std(accuracies)))

for i in range(3):
    print("min_samples_leaf=%d\nn_estimators\taccuracy"%c[i])
    row_str = []
    for j in range(3):
        print('%f\t%f'%(d[j],mean_accuracies[i][j]))
    print()

min_samples_leaf=1, n_estimators=100
mean=0.88 std=0.04

min_samples_leaf=1, n_estimators=200
mean=0.88 std=0.05

min_samples_leaf=1, n_estimators=300
mean=0.89 std=0.05

min_samples_leaf=3, n_estimators=100
mean=0.88 std=0.03

min_samples_leaf=3, n_estimators=200
mean=0.89 std=0.04

min_samples_leaf=3, n_estimators=300
mean=0.88 std=0.02

min_samples_leaf=5, n_estimators=100
mean=0.87 std=0.03

min_samples_leaf=5, n_estimators=200
mean=0.88 std=0.03

min_samples_leaf=5, n_estimators=300
mean=0.88 std=0.05

min_samples_leaf=1
n_estimators	accuracy
100.000000	0.880000
200.000000	0.880000
300.000000	0.890000

min_samples_leaf=3
n_estimators	accuracy
100.000000	0.880000
200.000000	0.880000
300.000000	0.880000

min_samples_leaf=5
n_estimators	accuracy
100.000000	0.870000
200.000000	0.880000
300.000000	0.880000

