In [8]:
import pandas as pd
import sklearn
from scipy.stats import randint, uniform

tweet_features_df = pd.read_json('../dataset/tweet-vectorised.json', orient='index').drop(['inferred_text', 'user_name', 'possibly_sensitive', 'user_location', 'user_description', 'user_created_at'], axis=1)
tweet_features_df.head()

Unnamed: 0,is_quote_status,retweet_count,favorite_count,is-spam,user_followers_count,user_friends_count,user_listed_count,user_favourites_count,user_geo_enabled,user_verified,...,vec_40,vec_41,vec_42,vec_43,vec_44,vec_45,vec_46,vec_47,vec_48,vec_49
0,False,247,0,0,1452,1085,14,40794,True,False,...,0.466278,0.067953,-0.251913,0.106986,1.408329,0.461382,-0.314719,0.043074,0.457783,0.228383
1,False,2543,0,0,176,1297,0,7620,False,False,...,0.114854,0.012156,-0.051351,0.009402,0.352569,0.109051,-0.090185,0.026282,0.104716,0.06348
2,False,816,0,0,48,130,0,8648,False,False,...,0.500366,0.1091,-0.130674,0.052167,1.606052,0.420022,-0.39277,0.070301,0.437388,0.295437
3,False,2687,0,0,21,102,1,16040,False,False,...,0.160209,0.016627,-0.045942,0.034811,0.492251,0.14409,-0.115983,0.030251,0.159176,0.100163
4,False,0,0,0,155,401,3,8729,False,False,...,0.230221,0.037889,-0.066517,0.031774,0.692398,0.203983,-0.149367,0.017939,0.19504,0.117355


In [25]:
TRAIN_SPLIT = 0.8

sample = tweet_features_df.sample(frac=1.0, random_state=42)
predictors_train = sample.drop('is-spam', axis=1).iloc[:int(len(sample) * TRAIN_SPLIT)]
predictors_valid = sample.drop('is-spam', axis=1).iloc[int(len(sample) * TRAIN_SPLIT):]
target_train = sample['is-spam'].iloc[:int(len(sample) * TRAIN_SPLIT)]
target_valid = sample['is-spam'].iloc[int(len(sample) * TRAIN_SPLIT):]

In [11]:
predictors_valid

Unnamed: 0,is_quote_status,retweet_count,favorite_count,user_followers_count,user_friends_count,user_listed_count,user_favourites_count,user_geo_enabled,user_verified,user_statuses_count,...,vec_40,vec_41,vec_42,vec_43,vec_44,vec_45,vec_46,vec_47,vec_48,vec_49
2481,False,0,0,3754,1603,43,2367,False,False,38064,...,0.730406,-0.062034,-0.087188,-0.023475,1.615328,0.123264,-0.217048,-0.040272,0.518828,0.504778
308,False,0,1,99,100,1,3330,False,False,1087,...,0.420321,0.077871,-0.093500,0.065404,1.332428,0.373594,-0.311858,0.049198,0.361423,0.235732
2414,False,16,0,3275,4621,8,60518,True,False,30937,...,1.003360,-0.120500,-0.117903,-0.120431,2.032404,0.179862,-0.165535,-0.108204,0.800850,0.567319
2720,False,0,0,450,426,5,7685,True,False,2992,...,0.714012,0.190532,-0.211104,0.084997,2.471956,0.701526,-0.639765,0.107569,0.624439,0.451710
463,True,1,0,810,316,2,3041,False,False,1036,...,0.128209,0.024969,-0.056590,0.022643,0.412362,0.123325,-0.096041,0.026945,0.109482,0.069944
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3018,False,0,16,3530,416,5,7082,False,False,6492,...,0.207023,0.091418,-0.078592,0.002897,0.770543,0.224006,-0.275655,0.028913,0.134709,0.134510
470,False,3902,0,282,599,5,40702,False,False,51871,...,0.092584,0.012619,-0.031853,0.010847,0.283162,0.082167,-0.068727,0.013897,0.083990,0.050564
3971,False,0,0,6,51,0,404,False,False,1665,...,0.790218,0.070981,-0.191493,0.122514,2.351590,0.609034,-0.532824,0.111865,0.726020,0.435849
1396,False,0,0,126,272,1,7374,False,False,4293,...,0.451121,0.042677,-0.125943,0.046147,1.385850,0.352104,-0.328679,0.032158,0.417157,0.264050


In [26]:
parameters = {
    'n_estimators': [1, 2, 5, 10, 25, 50, 100, 250, 500],
    'criterion': ['gini', 'entropy'],
    'max_features': ['auto', 'sqrt', 'log2'],
    'bootstrap': [True, False],
    'class_weight': ['balanced', 'balanced_subsample']
}

from sklearn.ensemble import RandomForestClassifier

_model_rf = RandomForestClassifier(n_jobs=-1, random_state=42)
grid_search_rf = sklearn.model_selection.GridSearchCV(_model_rf, parameters)

grid_search_rf.fit(predictors_train, target_train)

GridSearchCV(estimator=RandomForestClassifier(n_jobs=-1, random_state=42),
             param_grid={'bootstrap': [True, False],
                         'class_weight': ['balanced', 'balanced_subsample'],
                         'criterion': ['gini', 'entropy'],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [1, 2, 5, 10, 25, 50, 100, 250, 500]})

In [27]:
grid_search_rf.best_params_

{'bootstrap': False,
 'class_weight': 'balanced',
 'criterion': 'gini',
 'max_features': 'auto',
 'n_estimators': 25}

In [28]:
grid_search_rf.predict(predictors_valid)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [29]:
sklearn.metrics.accuracy_score(target_valid, grid_search_rf.predict(predictors_valid))

0.9807692307692307

Conclusion: the above parameters are the best for our use case. As per what have been demonstrated in Orange, this is the highest model, with XGB performing on 0.975 at highest (of which the results were deleted due to BSOD sadly) and SVM performing on the range of 97.1, this is ideal for us.

In [32]:
# Loading the model.
model_rf = RandomForestClassifier(n_jobs=-1, random_state=42, bootstrap=False, class_weight='balanced', criterion='gini', max_features='auto', n_estimators=25)
model_rf.fit(predictors_train, target_train)

RandomForestClassifier(bootstrap=False, class_weight='balanced',
                       n_estimators=25, n_jobs=-1, random_state=42)

In [33]:
sklearn.metrics.accuracy_score(target_valid, model_rf.predict(predictors_valid))

0.9807692307692307

In [35]:
import pickle
with open('../model/model_rf.pickle', 'wb') as f:
    pickle.dump(model_rf, f)