In [32]:
from collections import Counter
import numpy as np
import pandas as pd
import re
import glob
import gzip
import json
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import csr_matrix, hstack # "horizontal stack"

In [33]:
def load_data(directory):
    """Load bot and human accounts from gzipped JSON-lines files.

    Reads every ``*.json.gz`` file under ``directory + '/bots'`` and
    ``directory + '/humans'``. Each non-blank line is parsed as one JSON
    object; objects without a ``'tweets'`` key are skipped.

    Parameters
    ----------
    directory : str
        Root folder holding the ``bots`` and ``humans`` sub-folders.

    Returns
    -------
    pandas.DataFrame
        One row per kept account, bots first, with a unique 0..n-1 index and
        the columns ``screen_name``, ``tweets``, ``listed_count``, ``label``
        ('bot' or 'human') and ``tweets_texts``.

    Raises
    ------
    KeyError
        If a tweet dict has no ``'full_text'`` key.
    """
    columns = ['screen_name', 'tweets', 'listed_count']
    # (sub-folder, label) pairs; bots are read first so they come first in the frame.
    sources = [('/bots', 'bot'), ('/humans', 'human')]
    name = '/*.json.gz'

    users = []
    labels = []
    for folder, label in sources:
        # Sorted so the row order does not depend on filesystem listing order.
        for path in sorted(glob.glob(directory + folder + name)):
            with gzip.open(path, 'r') as file:
                for line in file:
                    # Tolerate blank lines (e.g. a trailing newline) instead of
                    # letting json.loads fail on them.
                    if not line.strip():
                        continue
                    js = json.loads(line)
                    if 'tweets' in js:
                        users.append(js)
                        labels.append(label)

    # Building one frame with explicit columns works even when a class folder
    # is empty, and gives every row a distinct index label.
    df = pd.DataFrame(users, columns=columns)
    df['label'] = labels

    # NOTE: tweets_texts is the repr of the list of tweet texts with the outer
    # brackets stripped, so quote characters and escape sequences are part of
    # the string. This format is kept unchanged because the text features
    # computed from this column depend on it.
    df['tweets_texts'] = [
        str([t['full_text'] for t in u['tweets']]).strip('[]')
        for u in users
    ]
    return df
# Root folder that load_data() searches for `bots/*.json.gz` and `humans/*.json.gz`.
# NOTE(review): absolute, machine-specific path -- point it at your own copy of the data.
DATA_DIR = '/Users/lcj/small'
df = load_data(DATA_DIR)
# Preview the first rows rather than rendering the whole frame.
df.head(10)

Unnamed: 0,screen_name,tweets,listed_count,label,tweets_texts
0,carlos_eggbot,[{'created_at': 'Sat Jun 01 18:36:07 +0000 201...,0,bot,"'You heard me! Shoot me.', 'Junpei, you...', '..."
1,ecolo_ebooks,[{'created_at': 'Sat Jun 01 18:36:11 +0000 201...,2,bot,"""i'm not straight but 20 bucks is 20 bu"", '""ec..."
2,AllStarSMBot,[{'created_at': 'Sat Jun 01 18:36:28 +0000 201...,3,bot,"""You'll never know if you don't go\nYou'll nev..."
3,saionji_en,[{'created_at': 'Sat Jun 01 18:36:52 +0000 201...,3,bot,"""why the fuck am i banana girl? i'll never die..."
4,KimClune,[{'created_at': 'Sat Jun 01 18:37:20 +0000 201...,329,bot,'Chewing rather than drinking breakfast is AWE...
5,CatsDogsBOT,[{'created_at': 'Sat Jun 01 18:38:10 +0000 201...,3,bot,"'[Discussion] If I say no, that should be it. ..."
6,bluejovanka,[{'created_at': 'Sat Jun 01 18:38:14 +0000 201...,47,bot,"""I'm staying in tonight watching someone with ..."
7,anittavota4,[{'created_at': 'Sat Jun 01 18:39:19 +0000 201...,0,bot,'RT BrettHillOwens2: #PremiosMTVMIAW #MTVBRMUS...
8,justtraveluk,[{'created_at': 'Sat Jun 01 18:39:21 +0000 201...,11,bot,'The Top 5 Airports in the World for Departure...
9,rhaudiencebot,[{'created_at': 'Sat Jun 01 18:40:08 +0000 201...,0,bot,"'GET BUTCH, BITCH!', 'HEY RIFF, WHAT DO YOU DO..."


In [34]:
# what is the distribution over class labels?
# Only the last expression of a cell is displayed, so the counts are printed
# explicitly; as a bare expression they would be computed and discarded.
print(df.label.value_counts())
df.dtypes

screen_name     object
tweets          object
listed_count     int64
label           object
tweets_texts    object
dtype: object

In [35]:
def get_tweets_features(texts):
    """Summarise how often an account's tweets contain links and mentions.

    Parameters
    ----------
    texts : sequence of str
        The text of each tweet.

    Returns
    -------
    dict
        ``'tweets_avg_urls'``: percentage (0-100) of tweets containing the
        substring ``'http'``; ``'tweets_avg_mentions'``: percentage of tweets
        containing ``'@'``. Both are 0 when ``texts`` is empty.
    """
    scale = 100
    total = len(texts)
    if total == 0:
        # No tweets: avoid dividing by zero and report zero for both features.
        return {'tweets_avg_urls': 0, 'tweets_avg_mentions': 0}

    with_url = sum(1 for text in texts if 'http' in text)
    with_mention = sum(1 for text in texts if '@' in text)
    return {
        'tweets_avg_urls': scale * with_url / total,
        'tweets_avg_mentions': scale * with_mention / total,
    }

In [36]:
def make_features(df, vec=None):
    """Build the hand-crafted per-account feature matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``'tweets'`` column whose cells are lists of tweet dicts,
        each carrying a ``'full_text'`` key.
    vec : DictVectorizer, optional
        An already fitted vectorizer. When given, it is reused through
        ``transform`` so the output has the same column layout as the data it
        was fitted on. When omitted, a new vectorizer is created and fitted.

    Returns
    -------
    (X, vec)
        ``X`` is the feature matrix produced by the vectorizer, one row per
        row of ``df``; ``vec`` is the vectorizer that produced it.
    """
    # Iterate the column directly; the row index is not needed.
    feature_dicts = [
        get_tweets_features([t['full_text'] for t in tweets])
        for tweets in df['tweets']
    ]
    if vec is None:
        vec = DictVectorizer()
        X = vec.fit_transform(feature_dicts)
    else:
        X = vec.transform(feature_dicts)
    return X, vec

In [37]:
# Build the hand-crafted feature matrix and keep the fitted DictVectorizer so
# column indices can be mapped back to feature names.
X, dict_vec = make_features(df)
# vocabulary_ maps each feature name to its column index in X.
print(dict_vec.vocabulary_)

{'tweets_avg_urls': 1, 'tweets_avg_mentions': 0}


In [38]:
# Word-trigram counts over each account's combined tweet text. min_df/max_df
# drop trigrams that occur in fewer than 3% or more than 80% of the rows.
# NOTE(review): the vocabulary is fitted on every row of df, including rows
# that are later held out for validation -- confirm this is intended.
count_vec = CountVectorizer(min_df=0.03, max_df=0.8, ngram_range=(3, 3))
X_words = count_vec.fit_transform(df.tweets_texts)
# Put the hand-crafted feature columns (X) to the left of the trigram counts.
optimal_X_all = hstack([X, X_words]).tocsr()

In [39]:
# top terms?
def print_top_words(X_words, count_vec, n=10):
    """Print the ``n`` vocabulary terms with the largest total count.

    Parameters
    ----------
    X_words : sparse matrix
        Document-term count matrix (rows are documents, columns are terms).
    count_vec : fitted vectorizer
        Must provide ``get_feature_names_out()`` or, on older scikit-learn
        releases, ``get_feature_names()``; the names must be in the column
        order of ``X_words``.
    n : int, default 10
        Number of terms to print, most frequent first.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only for older releases.
    if hasattr(count_vec, 'get_feature_names_out'):
        features = count_vec.get_feature_names_out()
    else:
        features = count_vec.get_feature_names()
    # Flatten the column sums whether .sum() returns an np.matrix or an ndarray.
    word_counts = np.asarray(X_words.sum(axis=0)).ravel()
    for i in np.argsort(word_counts)[::-1][:n]:
        print('%20s\t%d' % (features[i], word_counts[i]))

# Show the most frequent trigrams over all rows (n defaults to 10).
print_top_words(X_words, count_vec)

mtvbrclipebanana mtvbrhinobolarebola mtvbrfeatondadiferente	1400
mtvbrmusicalanitta mtvbrclipebanana mtvbrhinobolarebola	1400
mtvlastoriesanitta mtvlaviralanitta mtvlashiprip	1200
          01 2019 at	661
        june 01 2019	661
       thank you for	482
       here https co	277
       more https co	275
         at https co	270
         to https co	256


In [40]:
# we'll first store the classes separately in a numpy array
y = np.array(df.label)
# Print the class balance explicitly: only the last expression of a cell is
# displayed, so a bare `Counter(y)` here would be computed and thrown away.
print(Counter(y))
optimal_X_all.shape

(200, 1041)

In [41]:
# store the class names
class_names = set(df.label)
# Sum each hand-crafted feature over the rows of each class.
# The classes are visited in sorted order because iteration order of a set of
# strings is not stable between kernel sessions, which would reorder the output.
# NOTE: `y` must still hold the string labels when this cell runs.
for word, idx in dict_vec.vocabulary_.items():
    for class_name in sorted(class_names):
        class_idx = np.where(y==class_name)[0]
        print('%20s\t%20s\t%d' % (word, class_name, X[class_idx, idx].sum()))

     tweets_avg_urls	                 bot	3657
     tweets_avg_urls	               human	5491
 tweets_avg_mentions	                 bot	905
 tweets_avg_mentions	               human	6123


In [42]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

In [43]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.python.keras.layers import Dropout, Flatten

In [44]:
from tensorflow.python.keras.layers import Dropout, Flatten
# Not used by the dense model below; kept because other cells may reference it.
vocab_size = 10000
dropout_rate = .2
# Take the input width from the feature matrix instead of hard-coding it, so
# the model still builds when the vocabulary (and so the column count) changes.
n_features = optimal_X_all.shape[1]

model = keras.Sequential()
# NOTE(review): this first Dense layer has no activation (linear) -- confirm
# that is intended.
model.add(keras.layers.Dense(16, input_shape=(n_features,)))
model.add(keras.layers.Dense(16, activation='relu'))
model.add(Dropout(rate=dropout_rate))
# Single sigmoid unit: one probability per row for the binary label.
model.add(keras.layers.Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 16)                16672     
_________________________________________________________________
dense_7 (Dense)              (None, 16)                272       
_________________________________________________________________
dropout_2 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 17        
Total params: 16,961
Trainable params: 16,961
Non-trainable params: 0
_________________________________________________________________


In [45]:
# Adam optimizer with step size 0.001; binary cross-entropy loss, tracking accuracy.
# NOTE(review): newer Keras releases spell this argument `learning_rate`
# instead of `lr` -- check the installed TensorFlow version before renaming.
adam = keras.optimizers.Adam(lr=0.001)
model.compile(optimizer=adam,
                  loss='binary_crossentropy',
                  metrics=['accuracy'])


In [46]:
from sklearn.utils import shuffle

In [47]:
# Shuffle features and labels together so rows stay aligned. A fixed
# random_state makes the later train/validation split reproducible after a
# kernel restart.
optimal_X_all, y = shuffle(optimal_X_all, y, random_state=42)

In [48]:
# Encode the labels as 1 = human, 0 = everything else (bot).
# Guarded so the cell is safe to re-run: once `y` is numeric, comparing its
# entries with 'human' would turn every label into 0.
if np.asarray(y).dtype.kind in 'OUS':
    y = np.array([1 if i == 'human' else 0 for i in y])

In [49]:
from scipy.sparse import csr_matrix
from scipy import *
# csr_matrix((3, 4), dtype=np.int8).toarray()
#arr = sparse.lil_matrix(optimal_X_all).toarray()

# Hold out the first N_VAL rows for validation and train on the rest.
N_VAL = 20

x_val = optimal_X_all[:N_VAL]
partial_x_train = optimal_X_all[N_VAL:]

y_val = y[:N_VAL]
partial_y_train = y[N_VAL:]

# Features and labels must have been sliced consistently.
assert x_val.shape[0] == len(y_val)
assert partial_x_train.shape[0] == len(partial_y_train)

print(type(partial_x_train))
print(type(y_val))

<class 'scipy.sparse.csr.csr_matrix'>
<class 'numpy.ndarray'>


In [50]:
# Densify the sparse features before passing them to Keras. .toarray() gives a
# plain ndarray, whereas .todense() returns the legacy np.matrix type.
history = model.fit(partial_x_train.toarray(),
                    partial_y_train,
                    epochs=100,
                    batch_size=512,
                    validation_data=(x_val.toarray(), y_val),
                    verbose=1)


Train on 100 samples, validate on 100 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
