In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import *
import utils
pd.options.display.max_columns = 999
pd.options.display.max_colwidth = 800
import numpy as np
import re
from collections import defaultdict
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
import plotly.plotly as py
import plotly.graph_objs as go
import plotly
plotly.offline.init_notebook_mode(connected=True)
from keras.models import Sequential, optimizers
from keras.layers import Dense
from keras_tqdm import TQDMNotebookCallback
from keras.layers import Embedding
from keras.layers import LSTM
import importlib
import utils
import pickle

### Load data

In [2]:
# Read the raw tweet dump; the first row of the CSV holds the column names.
data = pd.read_csv(
    '../data/tweets/tweets.csv',
    header=0,
    encoding="ISO-8859-1",
    low_memory=False,
)
# Binarise the label: 'Traditional' -> 0, every other value -> 1.
is_traditional = data['BotType'] == 'Traditional'
data['BotType'] = (~is_traditional).astype('int64')

In [3]:
# Re-import utils so edits to the module are picked up without restarting the kernel.
importlib.reload(utils)
# Pass the text column through the project's tweet cleaner and store the result
# back in the same column.
raw_text = data.loc[:, 'text']
data.loc[:, 'text'] = utils.clean_tweets(raw_text)

### Mitigate class imbalance

In [4]:
# Undersample traditional bots (label 0): keep a random 20% of them and all
# rows of the other class (label 1).
# The seed is fixed so that the same subset is drawn on every run; without it
# each execution trains on different data and results cannot be reproduced.
# NOTE: this cell overwrites `data`, so re-running it without reloading the
# CSV undersamples class 0 again.
traditional_sample = data[data['BotType']==0].sample(frac=.2, random_state=321)
other_bots = data[data['BotType']==1]
data  = pd.concat([traditional_sample, other_bots],axis=0)

### Train-test split

In [7]:
# Display the first rows of `data` as a quick sanity check.
data.head()

Unnamed: 0,text,BotType
124909,"@steehjns vi e fiquei babando, morri kkkkkkk podia ter de verdade, impresso ): ia fazer sucesso!",0
101652,seguindo \u2014 - http://4ms.me/bKWf23,0
4907,"New post, ""Delta to Sell Piceance Properties"" - http://bit.ly/8Zg50o",0
6890,"Emerging-Market Shares Advance for 4th Day on Earnings, Greece - BusinessWeek http://tinyurl.com/2wn895j",0
38810,@amanda_jg *---* oii floor :D,0


In [8]:
# The hold-out split below is disabled, so every row of `data` is used for training.
# x_train, x_test, y_train, y_test = train_test_split(data.loc[:,'text'], data['BotType'], test_size=0.2, random_state=321)
x_train, y_train = data.loc[:, 'text'], data.loc[:, 'BotType']

### tf-idf conversion

In [9]:
# Unigram + bigram tf-idf features with English stop words removed.
# The token pattern is a raw string so `\w` is passed to the regex engine
# verbatim; in a normal string literal `\w` is an invalid escape sequence that
# newer Python versions warn about.
tfidf = TfidfVectorizer(stop_words='english', token_pattern=r'[\w]+', ngram_range=(1,2))
# NOTE: `x_train` is rebound from raw text to the tf-idf matrix here, so this
# cell cannot be re-run without first re-running the cell that defines x_train.
x_train = tfidf.fit_transform(x_train)
# x_test = tfidf.transform(x_test)

In [10]:
# Selects which model the classifier cell trains: 'xgb', 'mlp' or 'lstm'.
clf_choice = 'mlp'

### classifier

In [11]:
# Train the model selected by `clf_choice` on the tf-idf features.
# No hold-out set exists in this notebook (the train/test split is disabled),
# so every prediction/evaluation line that needs `x_test` is kept commented out.
if clf_choice == 'xgb':
    # xgboost classifier
    clf = XGBClassifier(n_jobs=4)
    clf.fit(x_train, y_train)
    # Previously this branch called clf.predict(x_test) and raised NameError,
    # because x_test is never defined.
#     y_pred = clf.predict(x_test)
elif clf_choice == 'mlp':
    # Simple multilayer perceptron
    # create model: one input unit per tf-idf feature, two hidden layers,
    # and a single sigmoid unit for the binary label
    model = Sequential()
    model.add(Dense(100, input_dim=x_train.shape[1], activation='relu'))
#     model.add(Dense(20, activation='relu'))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # Fit the model; verbose=0 because progress is reported by the tqdm callback
    model.fit(x_train, y_train, epochs=5, batch_size=20, verbose=0, callbacks=[TQDMNotebookCallback()])
    # predict the test set
#     y_pred = model.predict_classes(x_test)
elif clf_choice == 'lstm':
    # LSTM implementation
    # NOTE(review): x_train here is the tf-idf matrix, while an Embedding layer
    # is normally fed integer token indices -- confirm this branch is intended
    # to run on these features.
    model = Sequential()
    model.add(Embedding(input_dim=x_train.shape[1], output_dim=32))
    model.add(LSTM(16, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    opt_params = optimizers.RMSprop(lr=0.01)
    model.compile(loss='binary_crossentropy', optimizer=opt_params, metrics=['accuracy'])
    model.fit(x_train, y_train, epochs=2, batch_size=100, verbose=0, callbacks=[TQDMNotebookCallback()])
#     y_pred = model.predict_classes(x_test)
else:
    # Fail loudly instead of silently training nothing and breaking later cells.
    raise ValueError("Unknown clf_choice %r; expected 'xgb', 'mlp' or 'lstm'" % (clf_choice,))


# print(classification_report(y_test, y_pred))
# confusion_matrix(y_test, y_pred)

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget




ValueError: Error when checking : expected dense_1_input to have shape (198299,) but got array with shape (1,)

In [20]:
# Show the class of the trained `model` object.
type(model)

keras.models.Sequential

In [18]:
# Persist the fitted vectorizer so the same vocabulary can be reused later.
# The context manager closes (and flushes) the file handle even if pickling
# fails; the bare open() used before left it open.
with open('tfidf.dat', 'wb') as tfidf_file:
    pickle.dump(tfidf, tfidf_file)

TypeError: can't pickle _thread.lock objects

In [21]:
# Write the trained Keras model to 'mlp.h5' in the working directory.
# NOTE(review): the file name is hard-coded to 'mlp' regardless of clf_choice.
from keras.models import load_model
model.save('mlp.h5')

In [22]:
# Load the saved model back from disk under a separate name to check the round trip.
test = load_model('mlp.h5')

In [24]:
# Class predictions from the reloaded model on the training features.
# `Sequential.predict_classes` has been removed from newer Keras releases; for a
# single sigmoid output it is defined as thresholding the predicted probability
# at 0.5 and casting to int32, which is spelled out here and works on old and
# new versions alike.
(test.predict(x_train) > 0.5).astype('int32')

array([[0],
       [0],
       [0],
       ..., 
       [1],
       [1],
       [1]], dtype=int32)

### Feature importance

In [31]:
# Plot the most important tf-idf features of the xgboost model.
# Only the 'xgb' branch of the classifier cell defines `clf`.
if clf_choice == 'xgb':
    TOP_N = 100  # number of features plotted; x and y must both use this length
    importances = clf.feature_importances_
    # Feature indices ordered from most to least important.
    idx = np.argsort(importances)[::-1]
    top_idx = idx[:TOP_N]
    # Fetch the vocabulary once (the call rebuilds the full feature list each
    # time, so it must not sit inside the comprehension). Prefer the newer
    # accessor when the installed scikit-learn provides it.
    get_names = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
    feature_names = get_names()
    most_freq_words = [feature_names[i] for i in top_idx]
    # y is taken at the same indices as x; the previous version sliced 50
    # values against 100 words.
    trace = go.Scatter(x=most_freq_words, y=importances[top_idx])
    layout = dict(
        title='Feature Importance',
        xaxis=dict(title='Top {} Words'.format(TOP_N)),
        yaxis=dict(title='Feature Weights'),
    )
    fig = dict(data=[trace], layout=layout)
    plotly.offline.iplot(fig)

In [None]:
# clf.predict_proba(x_test)

### Code that does word2vec, did not work well...

In [None]:
# model = gensim.models.Word2Vec(x_train, workers=4, size=300)
# w2v = dict(zip(model.wv.index2word, model.wv.vectors))
# temp = MeanEmbeddingVectorizer(w2v)
# temp.fit(x_train)
# x_train = temp.transform(x_train)
# x_test = temp.transform(x_test)

### Oversampling technique, did not do as well as undersampling

In [None]:
# x_train, y_train = RandomOverSampler().fit_sample(x_train, y_train)