# Ada Boost Machine Learning Model + N-gram

Import required packages

In [1]:
import numpy as np # for multi-dimensional array operations
import pandas as pd # for reading data from .csv files
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.decomposition import PCA # for principle component analysis (dimensionality reduction)
from sklearn.model_selection import train_test_split # for splitting the dataset into training and testing sets
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, StratifiedKFold # for getting the best hyper parameters
from sklearn.preprocessing import MinMaxScaler # for scaling of data before PCA
from sklearn.feature_extraction.text import TfidfVectorizer

from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import AdaBoostClassifier
from matplotlib import pyplot

Assign the training set and testing set to variables for easy reference

In [2]:
train_set = pd.read_csv('../../Training and Testing sets/train.csv') # import the training set
test_set = pd.read_csv('../../Training and Testing sets/test.csv') # import the testing set

In [3]:
train_set_label = train_set.loc[:, ["label"]]
train_words = train_set.drop(['id','label'], axis =1) # train_set_features will not contain the label and id columns
test_words = test_set.drop(['id'], axis =1)

# print(train_words.head(10))
# print(test_words.head(10))

frames = [train_words,test_set]
to_vector = pd.concat(frames)
# print(to_vector.shape)
# print(to_vector.head(5))

vectorizer2 = TfidfVectorizer(max_features=5000, ngram_range=(2, 2))
to_reduce = vectorizer2.fit_transform(to_vector['post'])
vectorizer2.get_feature_names_out()
print(type(to_reduce))
print(to_reduce.shape)
print(to_reduce.todense())

<class 'scipy.sparse._csr.csr_matrix'>
(21480, 5000)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [4]:

features_names = [str(i) for i in range(0, 5000)]

df = pd.DataFrame(to_reduce.todense(), columns = features_names)

print(df.head(10))
print(type(df))

     0    1    2    3    4    5    6    7    8    9  ...  4990  4991  4992  \
0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
2  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
3  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
4  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
5  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
6  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
7  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
8  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
9  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   

   4993  4994  4995  4996  4997  4998  4999  
0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
1   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
2  

## PCA 90%

In [5]:
# perform PCA
pca = PCA(n_components = 0.90)
train_test_reduced = pca.fit_transform(df)
train_test_reduced = pd.DataFrame(data = train_test_reduced)
train_test_reduced

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3161,3162,3163,3164,3165,3166,3167,3168,3169,3170
0,-0.014941,-0.018820,-0.005642,-0.003160,-0.002568,-0.007729,0.002250,0.003700,0.000928,-0.009780,...,0.013244,-0.005175,0.009710,0.001519,0.025120,-0.012370,-0.003109,0.009207,0.019224,0.010008
1,-0.014186,-0.019200,-0.006263,0.000862,-0.003506,-0.007469,0.001890,0.003623,-0.001053,-0.009602,...,-0.015710,-0.013595,-0.008706,-0.003859,-0.008753,0.019053,-0.006945,-0.005274,-0.004522,0.003430
2,-0.017163,-0.022316,0.002719,-0.005427,-0.007365,-0.007633,0.003539,-0.009064,-0.001119,-0.012863,...,0.011127,-0.009287,-0.015277,0.000964,0.003077,0.007442,0.007025,-0.003996,0.002996,-0.004377
3,-0.022529,-0.008018,0.026099,-0.023266,0.017116,-0.045397,0.026235,0.158389,-0.060788,0.173287,...,0.016985,-0.015716,-0.015158,0.005747,0.007912,-0.022056,0.010080,0.005163,0.021510,0.027672
4,-0.014924,-0.021372,-0.019956,-0.015842,0.004789,-0.010231,-0.023476,-0.014723,-0.038557,-0.019589,...,-0.004645,-0.000388,0.002716,-0.013590,0.005778,0.002614,-0.005142,0.008499,-0.010395,0.002315
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21475,-0.013922,-0.020059,-0.008383,-0.007273,-0.002324,-0.009004,0.000398,0.005826,-0.003648,-0.010939,...,0.001368,0.002188,-0.002412,-0.003648,-0.000143,0.000720,0.000854,0.002170,0.001211,-0.001956
21476,-0.044723,0.258589,-0.054264,-0.063301,0.021689,0.044024,-0.108188,0.031136,0.020324,-0.008220,...,0.005639,-0.006759,0.004200,-0.010559,-0.001686,0.000380,0.004091,-0.012165,-0.000709,-0.011416
21477,-0.004025,-0.015866,-0.008282,-0.016841,-0.017031,0.001657,-0.013370,-0.022620,-0.008357,0.004496,...,0.005200,0.008936,0.011855,-0.002911,0.006393,0.001920,0.006204,0.013062,-0.008610,0.011284
21478,-0.002988,-0.020983,-0.008548,-0.008987,-0.004996,-0.008239,-0.004295,0.001019,-0.009165,-0.015085,...,0.004153,0.015697,0.006593,0.004705,0.015448,0.004966,0.012480,-0.003510,0.011379,-0.003338


In [6]:
X_train = train_test_reduced.iloc[0:17184,:]
Y_train = train_set_label

X_test = train_test_reduced.iloc[17184:21480,:]

print(X_train.shape)
print(X_train.head(5))

print(Y_train.shape)
print(Y_train.head(5))

print(X_test.shape)
print(X_test.head(5))

(17184, 3171)
       0         1         2         3         4         5         6     \
0 -0.014941 -0.018820 -0.005642 -0.003160 -0.002568 -0.007729  0.002250   
1 -0.014186 -0.019200 -0.006263  0.000862 -0.003506 -0.007469  0.001890   
2 -0.017163 -0.022316  0.002719 -0.005427 -0.007365 -0.007633  0.003539   
3 -0.022529 -0.008018  0.026099 -0.023266  0.017116 -0.045397  0.026235   
4 -0.014924 -0.021372 -0.019956 -0.015842  0.004789 -0.010231 -0.023476   

       7         8         9     ...      3161      3162      3163      3164  \
0  0.003700  0.000928 -0.009780  ...  0.013244 -0.005175  0.009710  0.001519   
1  0.003623 -0.001053 -0.009602  ... -0.015710 -0.013595 -0.008706 -0.003859   
2 -0.009064 -0.001119 -0.012863  ...  0.011127 -0.009287 -0.015277  0.000964   
3  0.158389 -0.060788  0.173287  ...  0.016985 -0.015716 -0.015158  0.005747   
4 -0.014723 -0.038557 -0.019589  ... -0.004645 -0.000388  0.002716 -0.013590   

       3165      3166      3167      3168      3169   

## Using GridSearchCV to find interaction effects

In [7]:
model = AdaBoostClassifier()
# define the grid of values to search
grid = dict()
grid['n_estimators'] = [100, 500]
grid['learning_rate'] = [0.1, 1.0, 2.0]
# define the evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define the grid search procedure
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='f1_macro', refit = 'f1_macro')
# execute the grid search
grid_result = grid_search.fit(X_train, np.ravel(Y_train))
# summarize the best score and configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# summarize all scores that were evaluated
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

KeyboardInterrupt: 

## Output Test Results (0.66135)

Predicting the labels for the test dataset based on the model with the best hyper-parameters

In [None]:
y_predicted = grid_result.predict(X_test)

In [None]:
# y_predicted = svc_model.predict(test_set_features)
y_predicted = pd.DataFrame(y_predicted, columns = ['label']) # convert y_predicted from nparray to pandas dataframe
y_predicted.insert(loc = 0, column = 'id', value = [i for i in range(17185, 17185 + 4296)]) # insert a column of the ids, starting from 17185
y_predicted.to_csv('ada_boost_ngram.csv', index = False) # output the predicted labels to ./skynet_submission.csv

## Principal Component Analysis for train_set (95% variance)

In [35]:
# combine test and train
frames = [train_set_features,test_set_features]
to_reduce = pd.concat(frames)

# scale the dataset before PCA
scaler = MinMaxScaler()
traintest_to_reduce = scaler.fit_transform(to_reduce)

# perform PCA
pca = PCA(n_components = 0.95)
train_test_reduced = pca.fit_transform(traintest_to_reduce)
train_test_reduced = pd.DataFrame(data = traintest_to_reduce)
train_test_reduced

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21475,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21476,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21477,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
X_train = train_test_reduced.iloc[0:17184,:]
Y_train = train_set_label

X_test = train_test_reduced.iloc[17184:21480,:]

print(X_train.shape)
print(X_train.head(5))

print(Y_train.shape)
print(Y_train.head(5))

print(X_test.shape)
print(X_test.head(5))

(17184, 5000)
   0     1     2     3     4     5     6     7     8     9     ...  4990  \
0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
1   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
2   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
3   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
4   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   

   4991  4992  4993  4994  4995  4996  4997  4998  4999  
0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
1   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
2   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
3   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
4   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  

[5 rows x 5000 columns]
(17184, 1)
   label
0      1
1      0
2      1
3      0
4      1
(4296, 5000)
       0     1     2     3     4     5     6     7     8     9     ...  4990  

## Using GridSearchCV to find interaction effects

In [38]:
model = AdaBoostClassifier()
# define the grid of values to search
grid = dict()
grid['n_estimators'] = [10, 50, 100, 500, 1000]
grid['learning_rate'] = [0.0001, 0.001, 0.01, 0.1, 1.0]
# define the evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define the grid search procedure
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='f1_macro', refit = 'f1_macro')
# execute the grid search
grid_result = grid_search.fit(X_train, np.ravel(Y_train))
# summarize the best score and configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# summarize all scores that were evaluated
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

## Output Test Results

Predicting the labels for the test dataset based on the model with the best hyper-parameters

In [None]:
y_predicted = grid_result.predict(X_test)

In [None]:
# y_predicted = svc_model.predict(test_set_features)
y_predicted = pd.DataFrame(y_predicted, columns = ['label']) # convert y_predicted from nparray to pandas dataframe
y_predicted.insert(loc = 0, column = 'id', value = [i for i in range(17185, 17185 + 4296)]) # insert a column of the ids, starting from 17185
y_predicted.to_csv('skynet_submission_new.csv', index = False) # output the predicted labels to ./skynet_submission.csv