# Support Vector Machine Learning Model (95% variance - F1 Macro)

Import required packages

In [1]:
import numpy as np # for multi-dimensional array operations
import pandas as pd # for reading data from .csv files
from sklearn.svm import SVC # for support vector machine model
from sklearn.decomposition import PCA # for principle component analysis (dimensionality reduction)
from sklearn.model_selection import train_test_split # for splitting the dataset into training and testing sets
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, StratifiedKFold # for getting the best hyper parameters
from sklearn.preprocessing import MinMaxScaler # for scaling of data before PCA

Assign the training set and testing set to variables for easy reference

In [2]:
train_set = pd.read_csv('../train_tfidf_features.csv') # import the training set
test_set = pd.read_csv('../test_tfidf_features.csv') # import the testing set

Principal Component Analysis for train_set (95% variance)

In [3]:
train_set_label = train_set.loc[:, ["label"]]
features_names = [str(i) for i in range(0, 5000)]
train_set_features = train_set.loc[:, features_names] # train_set_features will not contain the label and id columns

# scale the dataset before PCA
scaler = MinMaxScaler()
train_set_rescaled = scaler.fit_transform(train_set_features)

# perform PCA
pca = PCA(n_components = 0.95)
train_set_reduced = pca.fit_transform(train_set_features)
train_set_reduced = pd.DataFrame(data = train_set_reduced)
train_set_reduced

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3430,3431,3432,3433,3434,3435,3436,3437,3438,3439
0,-0.083198,-0.016048,-0.010595,-0.001965,-0.013779,-0.010989,-0.009680,-0.007520,-0.022370,-0.023418,...,-0.006348,-0.013092,-0.007520,0.007169,-0.000430,0.005640,-0.002805,-0.000875,0.002658,-0.017427
1,-0.068421,-0.043649,-0.018443,-0.008228,-0.000051,-0.043177,0.127322,0.010281,0.014349,-0.005608,...,-0.004531,-0.004845,0.002145,-0.004190,0.004226,0.001238,0.005315,0.000157,0.000655,0.000335
2,-0.080171,-0.044642,-0.015342,-0.008697,-0.010481,-0.057562,0.075615,0.115411,0.111499,0.070551,...,0.004945,0.003459,-0.009151,0.004732,0.001592,0.007137,-0.001514,0.008206,-0.003954,-0.000082
3,0.028600,-0.040406,0.002784,0.007087,0.000462,-0.031219,-0.137443,0.114581,-0.013427,0.077351,...,-0.005964,0.000969,-0.002013,0.001720,0.006397,0.001762,-0.007946,-0.013224,-0.001303,0.002237
4,0.255054,-0.113418,-0.019237,-0.021468,-0.040033,-0.007000,-0.035960,0.008656,-0.026048,-0.003353,...,0.000655,0.001680,0.009195,0.010899,0.001791,-0.027552,0.005058,0.001372,-0.003934,0.002632
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17179,0.075930,0.078879,0.167305,-0.099091,0.304845,0.037762,-0.022524,0.018488,-0.015024,-0.006992,...,0.000585,-0.000455,0.003488,0.004569,-0.002960,-0.002680,-0.001122,-0.007242,-0.006421,-0.000927
17180,-0.072104,-0.019731,-0.014655,-0.005451,-0.005682,-0.012314,0.005986,-0.009017,-0.003207,-0.014674,...,-0.000600,-0.008509,-0.007095,-0.015199,-0.000780,0.008921,-0.000302,0.008167,0.008050,0.006510
17181,0.002079,-0.041786,-0.016886,-0.008919,-0.018063,-0.020858,0.006685,-0.006916,-0.017607,0.010785,...,-0.003181,0.013946,-0.000212,-0.001618,-0.015307,0.008339,-0.003432,-0.000817,0.006270,0.008490
17182,0.091355,-0.055903,-0.008473,-0.017559,-0.002191,-0.013549,-0.012348,-0.003722,-0.017070,-0.010729,...,0.000324,0.000038,0.002041,0.000776,-0.000177,-0.004451,-0.003045,0.000156,0.003419,-0.002742


In [4]:
X = train_set_reduced
y = train_set_label

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20) # Train a SVC model using different kernel
X_train = X
y_train = y

In [5]:
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3430,3431,3432,3433,3434,3435,3436,3437,3438,3439
0,-0.083198,-0.016048,-0.010595,-0.001965,-0.013779,-0.010989,-0.009680,-0.007520,-0.022370,-0.023418,...,-0.006348,-0.013092,-0.007520,0.007169,-0.000430,0.005640,-0.002805,-0.000875,0.002658,-0.017427
1,-0.068421,-0.043649,-0.018443,-0.008228,-0.000051,-0.043177,0.127322,0.010281,0.014349,-0.005608,...,-0.004531,-0.004845,0.002145,-0.004190,0.004226,0.001238,0.005315,0.000157,0.000655,0.000335
2,-0.080171,-0.044642,-0.015342,-0.008697,-0.010481,-0.057562,0.075615,0.115411,0.111499,0.070551,...,0.004945,0.003459,-0.009151,0.004732,0.001592,0.007137,-0.001514,0.008206,-0.003954,-0.000082
3,0.028600,-0.040406,0.002784,0.007087,0.000462,-0.031219,-0.137443,0.114581,-0.013427,0.077351,...,-0.005964,0.000969,-0.002013,0.001720,0.006397,0.001762,-0.007946,-0.013224,-0.001303,0.002237
4,0.255054,-0.113418,-0.019237,-0.021468,-0.040033,-0.007000,-0.035960,0.008656,-0.026048,-0.003353,...,0.000655,0.001680,0.009195,0.010899,0.001791,-0.027552,0.005058,0.001372,-0.003934,0.002632
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17179,0.075930,0.078879,0.167305,-0.099091,0.304845,0.037762,-0.022524,0.018488,-0.015024,-0.006992,...,0.000585,-0.000455,0.003488,0.004569,-0.002960,-0.002680,-0.001122,-0.007242,-0.006421,-0.000927
17180,-0.072104,-0.019731,-0.014655,-0.005451,-0.005682,-0.012314,0.005986,-0.009017,-0.003207,-0.014674,...,-0.000600,-0.008509,-0.007095,-0.015199,-0.000780,0.008921,-0.000302,0.008167,0.008050,0.006510
17181,0.002079,-0.041786,-0.016886,-0.008919,-0.018063,-0.020858,0.006685,-0.006916,-0.017607,0.010785,...,-0.003181,0.013946,-0.000212,-0.001618,-0.015307,0.008339,-0.003432,-0.000817,0.006270,0.008490
17182,0.091355,-0.055903,-0.008473,-0.017559,-0.002191,-0.013549,-0.012348,-0.003722,-0.017070,-0.010729,...,0.000324,0.000038,0.002041,0.000776,-0.000177,-0.004451,-0.003045,0.000156,0.003419,-0.002742


In [6]:
y_train

Unnamed: 0,label
0,1
1,0
2,1
3,0
4,1
...,...
17179,0
17180,0
17181,1
17182,1


Principal Component Analysis for test_set (95% variance)

In [7]:
features_names = [str(i) for i in range(0, 5000)]
test_set_features = test_set.loc[:, features_names] # test_set_features will not contain the label and id columns

# scale the dataset before PCA
test_set_rescaled = scaler.transform(test_set_features)

# perform PCA
test_set_reduced = pca.transform(test_set_features) # use the pca from the train_set?
test_set_features = pd.DataFrame(data = test_set_reduced)
test_set_features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3430,3431,3432,3433,3434,3435,3436,3437,3438,3439
0,-0.082801,-0.009343,-0.013543,-0.013317,-0.007984,-0.002459,-0.026278,0.004096,-0.014072,-0.020606,...,-0.009129,-0.000331,-0.004346,0.000342,-0.003079,-0.005185,0.002152,-0.001401,0.001859,-0.004974
1,-0.075595,0.005040,-0.017771,-0.014935,-0.000467,-0.001601,-0.055948,-0.017461,-0.031185,-0.017278,...,-0.009430,-0.001496,0.010692,0.002876,-0.009710,0.001454,-0.000840,-0.001868,0.010864,0.010155
2,0.163876,0.059162,-0.057706,-0.040884,-0.063164,-0.003223,0.020473,-0.005967,-0.005539,-0.008680,...,-0.002819,0.004451,0.002260,-0.000053,-0.009128,-0.001057,0.001149,-0.003500,-0.007828,-0.004243
3,-0.006835,0.189357,-0.044389,-0.024452,-0.043835,-0.013077,0.001344,0.000674,-0.005021,-0.028871,...,-0.002983,0.012031,-0.001850,-0.002848,0.001118,-0.001196,-0.000045,0.002646,0.003816,-0.002800
4,0.120704,-0.089485,0.015896,0.030057,-0.018015,-0.057111,-0.072387,0.053490,-0.153091,0.026020,...,0.003516,0.004251,0.000932,0.000547,0.001425,0.004522,0.004244,0.001834,-0.002758,0.004970
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4291,-0.076808,-0.024913,-0.014752,-0.008106,-0.009817,-0.013137,0.026288,-0.012828,-0.004845,-0.010727,...,-0.003375,0.000882,0.001384,0.006656,-0.001810,0.000964,-0.001453,0.001886,0.003305,0.003652
4292,0.069331,-0.055078,-0.008249,-0.022183,-0.000882,0.024624,-0.085988,-0.152147,0.159634,0.018128,...,-0.012540,-0.000104,0.008667,-0.000138,0.004133,-0.000909,0.006222,0.008371,0.000973,0.000169
4293,0.042192,-0.056123,-0.009015,-0.022608,-0.000348,0.011589,-0.046442,-0.129357,0.126640,0.010080,...,0.007049,-0.006831,-0.006951,-0.016390,-0.008177,-0.006095,0.013698,0.009730,0.000655,0.004273
4294,0.220417,-0.101562,-0.013420,-0.014971,-0.033857,-0.007979,-0.018445,-0.004240,-0.019587,-0.000660,...,0.002714,-0.016560,-0.005999,-0.005055,0.011844,-0.004319,-0.003595,0.001063,-0.007773,0.007764


Tuning the hyper-parameters and training the model based on the best hyper-parameters

In [8]:
# hyper_parameters = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001], 'kernel': ['linear', 'poly', 'sigmoid', 'rbf']} # initialise the hyper-parameters
hyper_parameters = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001], 'kernel': ['rbf']} # initialise the hyper-parameters
kfold = StratifiedKFold(n_splits = 3, shuffle = True, random_state = 0) # for 3-fold cross validation
# grid = GridSearchCV(SVC(), hyper_parameters, refit = True, verbose = 2) # cretae a GridSearchCV object to git to the taining data
grid = GridSearchCV(SVC(), param_grid = hyper_parameters, scoring = 'f1_macro', refit = 'f1_macro', n_jobs = 1 , cv = kfold, verbose = 2)
grid.fit(X_train, np.ravel(y_train)) # training the model using the best hyper-parameters
print(grid.best_params_) # gets the best hyper-parameters for SVM

Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time= 6.6min
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time= 8.1min
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time= 7.5min
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time= 7.0min
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time= 8.0min
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time= 8.8min
[CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time= 6.8min
[CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time= 6.6min
[CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time= 8.2min
[CV] END .....................C=0.1, gamma=0.001, kernel=rbf; total time= 6.6min
[CV] END .....................C=0.1, gamma=0.001, kernel=rbf; total time= 6.3min
[CV] END .....................C=0.1, gamma=0.001

Predicting the labels for the test dataset based on the model with the best hyper-parameters

In [9]:
y_predicted = grid.predict(test_set_features)

In [10]:
# y_predicted = svc_model.predict(test_set_features)
y_predicted = pd.DataFrame(y_predicted, columns = ['label']) # convert y_predicted from nparray to pandas dataframe
y_predicted.insert(loc = 0, column = 'id', value = [i for i in range(17185, 17185 + 4296)]) # insert a column of the ids, starting from 17185
y_predicted.to_csv('skynet_submission_95_f1macro.csv', index = False) # output the predicted labels to ./skynet_submission_95_f1macro.csv