In [1]:
import pandas as pd
import sklearn
from scipy.stats import randint, uniform

tweet_features_df = pd.read_json('../dataset/tweet-vectorised.json', orient='index').drop(['inferred_text', 'user_name', 'possibly_sensitive', 'user_location', 'user_description', 'user_created_at'], axis=1)
tweet_features_df.head()

Unnamed: 0,is_quote_status,retweet_count,favorite_count,is-spam,user_followers_count,user_friends_count,user_listed_count,user_favourites_count,user_geo_enabled,user_verified,...,vec_40,vec_41,vec_42,vec_43,vec_44,vec_45,vec_46,vec_47,vec_48,vec_49
0,False,247,0,0,1452,1085,14,40794,True,False,...,0.466278,0.067953,-0.251913,0.106986,1.408329,0.461382,-0.314719,0.043074,0.457783,0.228383
1,False,2543,0,0,176,1297,0,7620,False,False,...,0.114854,0.012156,-0.051351,0.009402,0.352569,0.109051,-0.090185,0.026282,0.104716,0.06348
2,False,816,0,0,48,130,0,8648,False,False,...,0.500366,0.1091,-0.130674,0.052167,1.606052,0.420022,-0.39277,0.070301,0.437388,0.295437
3,False,2687,0,0,21,102,1,16040,False,False,...,0.160209,0.016627,-0.045942,0.034811,0.492251,0.14409,-0.115983,0.030251,0.159176,0.100163
4,False,0,0,0,155,401,3,8729,False,False,...,0.230221,0.037889,-0.066517,0.031774,0.692398,0.203983,-0.149367,0.017939,0.19504,0.117355


In [2]:
TRAIN_SPLIT = 0.8

sample = tweet_features_df.sample(frac=1.0, random_state=42)
predictors_train = sample.drop('is-spam', axis=1).iloc[:int(len(sample) * TRAIN_SPLIT)]
predictors_valid = sample.drop('is-spam', axis=1).iloc[int(len(sample) * TRAIN_SPLIT):]
target_train = sample['is-spam'].iloc[:int(len(sample) * TRAIN_SPLIT)]
target_valid = sample['is-spam'].iloc[int(len(sample) * TRAIN_SPLIT):]

In [3]:
predictors_valid

Unnamed: 0,is_quote_status,retweet_count,favorite_count,user_followers_count,user_friends_count,user_listed_count,user_favourites_count,user_geo_enabled,user_verified,user_statuses_count,...,vec_40,vec_41,vec_42,vec_43,vec_44,vec_45,vec_46,vec_47,vec_48,vec_49
1549,False,0,0,692,296,2,20750,True,False,102823,...,0.543487,0.122533,-0.132868,0.071706,1.787908,0.484664,-0.457557,0.083385,0.475254,0.314095
1256,False,1,0,1236,1350,10,38536,True,False,32183,...,0.450330,0.136012,-0.075901,0.023585,1.603727,0.394888,-0.437601,0.124210,0.392712,0.302579
2094,False,0,0,239,354,4,4695,False,False,6018,...,0.526300,0.037698,-0.113256,0.038169,1.543225,0.322131,-0.305236,0.043952,0.456495,0.324178
1555,False,0,0,7,18,0,226,False,False,243,...,0.263634,0.044343,-0.065287,0.010375,0.840499,0.222799,-0.188154,0.044243,0.223314,0.169943
3621,False,18,0,150,1135,12,2555,True,False,155341,...,0.230466,0.057927,-0.065660,0.019738,0.720248,0.190735,-0.181772,0.010237,0.199209,0.121725
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3444,False,0,7,863,559,29,13193,True,False,8694,...,0.221837,0.072342,-0.072447,0.028501,0.786127,0.230351,-0.235719,0.037302,0.166300,0.128002
466,False,12,0,369,368,4,4837,False,False,1109,...,0.230444,0.045388,-0.068571,0.034979,0.735764,0.201392,-0.193078,0.033807,0.206101,0.144451
3092,False,0,1,345,237,0,54,False,False,554,...,0.356836,0.078983,-0.106835,0.040553,1.176486,0.337114,-0.320365,0.044157,0.298037,0.216625
3772,False,0,0,870,240,44,34842,False,False,21394,...,0.332828,-0.031062,-0.155249,0.128149,1.040161,0.231927,-0.175956,0.092493,0.457714,0.230146


In [26]:
parameters = {
    'n_estimators': [1, 2, 5, 10, 25, 50, 100, 250, 500],
    'criterion': ['gini', 'entropy'],
    'max_features': ['auto', 'sqrt', 'log2'],
    'bootstrap': [True, False],
    'class_weight': ['balanced', 'balanced_subsample']
}

from sklearn.ensemble import RandomForestClassifier

_model_rf = RandomForestClassifier(n_jobs=-1, random_state=42)
grid_search_rf = sklearn.model_selection.GridSearchCV(_model_rf, parameters)

grid_search_rf.fit(predictors_train, target_train)

GridSearchCV(estimator=RandomForestClassifier(n_jobs=-1, random_state=42),
             param_grid={'bootstrap': [True, False],
                         'class_weight': ['balanced', 'balanced_subsample'],
                         'criterion': ['gini', 'entropy'],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [1, 2, 5, 10, 25, 50, 100, 250, 500]})

In [27]:
grid_search_rf.best_params_

{'bootstrap': False,
 'class_weight': 'balanced',
 'criterion': 'gini',
 'max_features': 'auto',
 'n_estimators': 25}

In [28]:
grid_search_rf.predict(predictors_valid)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [29]:
sklearn.metrics.accuracy_score(target_valid, grid_search_rf.predict(predictors_valid))

0.9807692307692307

Conclusion: the above parameters are the best for our use case. As per what have been demonstrated in Orange, this is the highest model, with XGB performing on 0.975 at highest (of which the results were deleted due to BSOD sadly) and SVM performing on the range of 97.1, this is ideal for us.

In [4]:
# Loading the model.
from sklearn.ensemble import RandomForestClassifier
model_rf = RandomForestClassifier(n_jobs=-1, random_state=42, bootstrap=False, class_weight='balanced', criterion='gini', max_features='auto', n_estimators=25)
model_rf.fit(predictors_train, target_train)

RandomForestClassifier(bootstrap=False, class_weight='balanced',
                       n_estimators=25, n_jobs=-1, random_state=42)

In [5]:
sklearn.metrics.accuracy_score(target_valid, model_rf.predict(predictors_valid))

0.9807692307692307

In [6]:
import pickle
with open('../model/model_rf.pickle', 'wb') as f:
    pickle.dump(model_rf, f)

In [16]:
sample.iloc[:200].index

Int64Index([2481,  308, 2414, 2720,  463, 1185,   96, 2031, 1886,  843,
            ...
            1926, 2856, 3931, 1736, 1320, 1803, 2259, 1961, 1965, 3868],
           dtype='int64', length=200)

In [20]:
tweets_df.loc[sample.iloc[:200].index].to_excel('../dataset/tweet-sample.xlsx')