## Intro

In [40]:
import numpy as np # for multi-dimensional array operations
import pandas as pd # for reading data from .csv files
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.decomposition import PCA # for principle component analysis (dimensionality reduction)
from sklearn.model_selection import train_test_split # for splitting the dataset into training and testing sets
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, StratifiedKFold, RandomizedSearchCV # for getting the best hyper parameters
from sklearn.preprocessing import MinMaxScaler # for scaling of data before PCAfrom sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score #f1 score
from sklearn.decomposition import TruncatedSVD

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [3]:
train_set = pd.read_csv('../Training and Testing sets/train.csv') # import the training set
test_set = pd.read_csv('../Training and Testing sets/test.csv') # import the testing set

print(train_set.head(10))
print(test_set.head(10))


   id                                               post  label
0   1  not surprised liberals islamists biggest threa...      1
1   2  liam neeson narnias aslan the lion could be mu...      0
2   3  ur right its simple islam is not part of our c...      1
3   4  except we dont behead queers and rape women un...      0
4   5  pastors take note white architect designed you...      1
5   6  im sure everyone is going to tell you how brav...      0
6   7  its ludicrous youre hate can only produce one ...      0
7   8  that is insane every mosque ought to be bacona...      1
8   9  What two kinds of people are totally different...      1
9  10  proletariat brown people bourgeoisie white people      1
      id                                               post
0  17185  i had some boomer cuck tell me take that pic d...
1  17186  life of indian pm not his private choice to be...
2  17187  diversity is only imposed on white nations div...
3  17188  they should be in charge of their own people i

## Vectorise the words not as one by one

In [13]:
train_set_label = train_set.loc[:, ["label"]]
train_words = train_set.drop(['id','label'], axis =1) # train_set_features will not contain the label and id columns
test_words = test_set.drop(['id'], axis =1)

print(train_words.head(10))
print(test_words.head(10))

                                                post
0  not surprised liberals islamists biggest threa...
1  liam neeson narnias aslan the lion could be mu...
2  ur right its simple islam is not part of our c...
3  except we dont behead queers and rape women un...
4  pastors take note white architect designed you...
5  im sure everyone is going to tell you how brav...
6  its ludicrous youre hate can only produce one ...
7  that is insane every mosque ought to be bacona...
8  What two kinds of people are totally different...
9  proletariat brown people bourgeoisie white people
                                                post
0  i had some boomer cuck tell me take that pic d...
1  life of indian pm not his private choice to be...
2  diversity is only imposed on white nations div...
3  they should be in charge of their own people i...
4  white supremacists were only strong in the dem...
5  Walks into a post office to buy stamps Me Hmon...
6  seriously he fought for human rights black 

Combine train and test

In [51]:
frames = [train_words,test_set]
to_vector = pd.concat(frames)
print(to_vector.shape)
print(to_vector.head(5))

(21480, 2)
                                                post  id
0  not surprised liberals islamists biggest threa... NaN
1  liam neeson narnias aslan the lion could be mu... NaN
2  ur right its simple islam is not part of our c... NaN
3  except we dont behead queers and rape women un... NaN
4  pastors take note white architect designed you... NaN


In [26]:
vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(2, 2))
to_reduce = vectorizer2.fit_transform(to_vector['post'])
vectorizer2.get_feature_names_out()
print(type(to_reduce))
print(to_reduce.shape)

<class 'scipy.sparse._csr.csr_matrix'>
(21480, 151265)


In [54]:
vectorizer3 = TfidfVectorizer(analyzer='word', ngram_range=[1,2])
to_reduce = vectorizer3.fit_transform(to_vector['post'])
vectorizer3.get_feature_names_out()
print(to_reduce.shape)


(21480, 174554)


## SVD (4500)
### Used to train: X_train, Y_train 
### Used to test: X_test, Y_test (if model works well)
Note: Validation test set separation already done in training

Perform PCA on train set features and separate into x_train and y_train data

Only explains 55%

In [56]:
# perform SVD (akin to SVD on sparse matrix)
svd = TruncatedSVD(n_components=4500, n_iter=7, random_state=42)
traintest_reduced = svd.fit_transform(to_reduce)
print(type(traintest_reduced))
traintest_reduced = pd.DataFrame(data = traintest_reduced)
traintest_reduced.shape

<class 'numpy.ndarray'>


(21480, 4500)

In [58]:
print(svd.explained_variance_ratio_.sum())  

0.5081093632987366


In [59]:
X = traintest_reduced.iloc[0:17184,:]
Y = train_set_label
submit_set_reduced = traintest_reduced.iloc[17184:,:]
print(X.shape)
print(Y.shape)
print(submit_set_reduced.shape)

(17184, 4500)
(17184, 1)
(4296, 4500)


In [60]:
# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.15, random_state=41)
# print(X_train.shape)
# print(X_test.shape)

## Random Forest Rough Tuning

4500 features SVD 

{'n_estimators': [10, 580, 1150, 1720, 2290, 2860, 3430, 4000], 'max_features': ['auto', 'sqrt'], 'max_depth': [100, 142, 185, 228, 271, 314, 357, 400], 'bootstrap': [True, False]}


3 kfolds

Results on X_test:

Best:


In [32]:
# Parameters
# Number of tree in random forest 
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 4000, num =4)]
# Number of features to consider at every split
max_features = ['auto','sqrt']
#Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(start = 100, stop = 400, num = 4)]
# Method of selecting samples for training each tree
bootstrap = [True,False]

# Creating a param grid
random_grid = {'n_estimators':n_estimators,
            'max_features':max_features,
            'max_depth': max_depth,
            'bootstrap': bootstrap }

print(random_grid)

{'n_estimators': [10, 1340, 2670, 4000], 'max_features': ['auto', 'sqrt'], 'max_depth': [100, 200, 300, 400], 'bootstrap': [True, False]}


In [30]:
kfold = StratifiedKFold(n_splits = 3, shuffle = True, random_state = 0) # for 3-fold cross validation

In [33]:
clf = RandomForestClassifier()
grid = RandomizedSearchCV(estimator = clf, param_distributions = random_grid, scoring = 'f1', refit = 'accuracy', n_jobs = -1 , cv = kfold, verbose = 2)
grid.fit(X, np.ravel(Y))
print(grid.best_params_) # gets the best hyper-parameters for random forest 

Fitting 3 folds for each of 10 candidates, totalling 30 fits


  warn(
  warn(
  warn(


[CV] END bootstrap=True, max_depth=100, max_features=sqrt, n_estimators=1340; total time=33.2min
[CV] END bootstrap=True, max_depth=100, max_features=sqrt, n_estimators=1340; total time=33.3min


  warn(
  warn(


[CV] END bootstrap=False, max_depth=300, max_features=auto, n_estimators=10; total time=  31.2s
[CV] END bootstrap=False, max_depth=300, max_features=auto, n_estimators=10; total time=  26.4s


  warn(
  warn(


[CV] END bootstrap=False, max_depth=300, max_features=auto, n_estimators=10; total time=  25.2s


  warn(


[CV] END bootstrap=True, max_depth=100, max_features=sqrt, n_estimators=1340; total time=34.3min


  warn(


[CV] END bootstrap=False, max_depth=100, max_features=auto, n_estimators=1340; total time=55.5min
[CV] END bootstrap=False, max_depth=100, max_features=auto, n_estimators=1340; total time=55.6min
[CV] END bootstrap=False, max_depth=100, max_features=auto, n_estimators=1340; total time=58.1min
[CV] END bootstrap=True, max_depth=400, max_features=sqrt, n_estimators=4000; total time=102.3min
[CV] END bootstrap=True, max_depth=400, max_features=sqrt, n_estimators=4000; total time=102.3min


  warn(
  warn(


[CV] END bootstrap=True, max_depth=400, max_features=sqrt, n_estimators=4000; total time=106.1min


  warn(


[CV] END bootstrap=False, max_depth=300, max_features=auto, n_estimators=2670; total time=115.5min
[CV] END bootstrap=False, max_depth=400, max_features=sqrt, n_estimators=2670; total time=115.7min
[CV] END bootstrap=False, max_depth=400, max_features=sqrt, n_estimators=2670; total time=116.1min
[CV] END bootstrap=False, max_depth=300, max_features=auto, n_estimators=2670; total time=116.5min
[CV] END bootstrap=False, max_depth=300, max_features=auto, n_estimators=2670; total time=122.0min
[CV] END bootstrap=False, max_depth=400, max_features=sqrt, n_estimators=2670; total time=122.3min
[CV] END bootstrap=True, max_depth=200, max_features=sqrt, n_estimators=2670; total time=88.6min
[CV] END bootstrap=True, max_depth=200, max_features=sqrt, n_estimators=2670; total time=90.3min
[CV] END bootstrap=True, max_depth=200, max_features=sqrt, n_estimators=2670; total time=88.1min
[CV] END bootstrap=False, max_depth=100, max_features=auto, n_estimators=2670; total time=124.5min
[CV] END bootstr

  warn(


{'n_estimators': 10, 'max_features': 'auto', 'max_depth': 300, 'bootstrap': False}


In [None]:
# grid_prediction = grid.predict(X_train)
# score = f1_score(Y_train, grid_prediction)
# print(score)

## Random Forest Fine Tuning

4500 features SVD 

??????


3 kfolds

Results on X_test:

Best: {'n_estimators': 5, 'max_depth': 350}


In [34]:
# Parameters
# Number of tree in random forest 
n_estimators = [int(x) for x in np.linspace(start = 5, stop = 1000, num =5)]

#Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(start = 250, stop = 350, num = 5)]

# Creating a param grid
random_grid = {'n_estimators':n_estimators,
            'max_depth': max_depth}

print(random_grid)

{'n_estimators': [5, 253, 502, 751, 1000], 'max_depth': [250, 275, 300, 325, 350]}


In [35]:
kfold = StratifiedKFold(n_splits = 3, shuffle = True, random_state = 0) # for 3-fold cross validation

In [36]:
clf = RandomForestClassifier()
grid = RandomizedSearchCV(estimator = clf, param_distributions = random_grid, scoring = 'f1', refit = 'accuracy', n_jobs = -1 , cv = kfold, verbose = 2)
grid.fit(X, np.ravel(Y))
print(grid.best_params_) # gets the best hyper-parameters for random forest 

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END ....................max_depth=275, n_estimators=253; total time= 5.1min
[CV] END ....................max_depth=275, n_estimators=253; total time= 5.1min
[CV] END ....................max_depth=275, n_estimators=253; total time= 5.2min
[CV] END ....................max_depth=275, n_estimators=502; total time= 9.8min
[CV] END ....................max_depth=275, n_estimators=502; total time= 9.8min
[CV] END ....................max_depth=275, n_estimators=502; total time=10.1min
[CV] END ....................max_depth=275, n_estimators=751; total time=14.7min
[CV] END ....................max_depth=275, n_estimators=751; total time=14.7min
[CV] END ......................max_depth=350, n_estimators=5; total time=   7.9s
[CV] END ......................max_depth=350, n_estimators=5; total time=   8.1s
[CV] END ......................max_depth=350, n_estimators=5; total time=   7.4s
[CV] END ....................max_depth=275, n_es

In [None]:
# grid_prediction = grid.predict(X_train)
# score = f1_score(Y_train, grid_prediction)
# print(score)

# TO SUBMIT

In [37]:
y_predicted = grid.predict(submit_set_reduced)

In [38]:
# y_predicted = svc_model.predict(test_set_features)
y_predicted = pd.DataFrame(y_predicted, columns = ['label']) # convert y_predicted from nparray to pandas dataframe
y_predicted.insert(loc = 0, column = 'id', value = [i for i in range(17185, 17185 + 4296)]) # insert a column of the ids, starting from 17185
y_predicted.to_csv('skynet_submission3.csv', index = False) # output the predicted labels to ./skynet_submission.csv