# Support Vector Machine Learning Model

Import required packages

In [44]:
import numpy as np # for multi-dimensional array operations
import pandas as pd # for reading data from .csv files
from sklearn.svm import SVC # for support vector machine model
from sklearn.model_selection import train_test_split # for splitting the dataset into training and testing sets
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, StratifiedKFold # for getting the best hyper parameters

Assign the training set and testing set to variables for easy reference

In [45]:
train_set = pd.read_csv('./train_tfidf_features.csv') # import the training set
test_set = pd.read_csv('./test_tfidf_features.csv') # import the testing set

Principal Component Analysis for train_set

In [46]:
from sklearn.decomposition import PCA # for principle component analysis (dimensionality reduction)
label = train_set.loc[:, ["label"]]
features_names = [str(i) for i in range(0, 5000)]
features = train_set.loc[:, features_names]
pca = PCA(n_components = 500) # reduce the number of dimensions/features per datapoint to just 500
principalComponents = pca.fit_transform(features)
train_set_reduced = pd.DataFrame(data = principalComponents)
train_set_reduced

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,-0.083198,-0.016047,-0.010594,-0.001966,-0.013781,-0.010990,-0.009681,-0.007516,-0.022370,-0.023419,...,0.005077,0.016076,0.018282,-0.032241,-0.000503,-0.029610,-0.010225,-0.025306,-0.003552,-0.012009
1,-0.068421,-0.043649,-0.018444,-0.008227,-0.000050,-0.043175,0.127319,0.010289,0.014349,-0.005611,...,-0.008883,0.002221,-0.035011,-0.008898,0.004206,0.004611,0.010534,-0.021848,0.017258,0.002470
2,-0.080171,-0.044642,-0.015343,-0.008700,-0.010477,-0.057559,0.075618,0.115408,0.111483,0.070558,...,-0.018216,-0.006005,-0.043467,0.019423,-0.022703,0.034810,-0.012448,-0.003795,0.003304,0.008975
3,0.028600,-0.040406,0.002784,0.007088,0.000463,-0.031220,-0.137441,0.114586,-0.013426,0.077356,...,-0.024062,0.016777,-0.005527,-0.018254,-0.004904,-0.023072,-0.000010,-0.008012,-0.009122,0.003164
4,0.255054,-0.113418,-0.019237,-0.021470,-0.040031,-0.007003,-0.035958,0.008660,-0.026050,-0.003354,...,-0.008827,-0.013163,-0.010323,0.015123,-0.036983,0.002203,0.013546,-0.000237,0.011640,0.031710
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17179,0.075930,0.078879,0.167305,-0.099090,0.304845,0.037756,-0.022520,0.018484,-0.015022,-0.007000,...,-0.001265,-0.008033,0.024719,-0.018821,-0.012959,0.023933,0.001774,-0.013994,0.001551,0.025494
17180,-0.072104,-0.019731,-0.014654,-0.005451,-0.005690,-0.012309,0.005978,-0.009017,-0.003212,-0.014697,...,0.001279,0.020641,0.002591,0.067616,0.023154,-0.036807,0.021398,0.042167,0.002417,-0.026341
17181,0.002079,-0.041787,-0.016886,-0.008916,-0.018061,-0.020858,0.006685,-0.006928,-0.017611,0.010783,...,0.004074,0.018851,-0.005441,-0.035653,-0.006330,-0.000426,-0.004705,-0.023710,-0.016259,-0.013883
17182,0.091355,-0.055904,-0.008474,-0.017560,-0.002191,-0.013550,-0.012348,-0.003712,-0.017073,-0.010724,...,0.004458,0.015667,0.048468,-0.016216,-0.055463,-0.003743,-0.037074,-0.027250,0.004441,-0.004808


In [47]:
X = train_set_reduced
y = label

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20) # Train a SVC model using different kernel
X_train = X
y_train = y

In [48]:
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,-0.083198,-0.016047,-0.010594,-0.001966,-0.013781,-0.010990,-0.009681,-0.007516,-0.022370,-0.023419,...,0.005077,0.016076,0.018282,-0.032241,-0.000503,-0.029610,-0.010225,-0.025306,-0.003552,-0.012009
1,-0.068421,-0.043649,-0.018444,-0.008227,-0.000050,-0.043175,0.127319,0.010289,0.014349,-0.005611,...,-0.008883,0.002221,-0.035011,-0.008898,0.004206,0.004611,0.010534,-0.021848,0.017258,0.002470
2,-0.080171,-0.044642,-0.015343,-0.008700,-0.010477,-0.057559,0.075618,0.115408,0.111483,0.070558,...,-0.018216,-0.006005,-0.043467,0.019423,-0.022703,0.034810,-0.012448,-0.003795,0.003304,0.008975
3,0.028600,-0.040406,0.002784,0.007088,0.000463,-0.031220,-0.137441,0.114586,-0.013426,0.077356,...,-0.024062,0.016777,-0.005527,-0.018254,-0.004904,-0.023072,-0.000010,-0.008012,-0.009122,0.003164
4,0.255054,-0.113418,-0.019237,-0.021470,-0.040031,-0.007003,-0.035958,0.008660,-0.026050,-0.003354,...,-0.008827,-0.013163,-0.010323,0.015123,-0.036983,0.002203,0.013546,-0.000237,0.011640,0.031710
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17179,0.075930,0.078879,0.167305,-0.099090,0.304845,0.037756,-0.022520,0.018484,-0.015022,-0.007000,...,-0.001265,-0.008033,0.024719,-0.018821,-0.012959,0.023933,0.001774,-0.013994,0.001551,0.025494
17180,-0.072104,-0.019731,-0.014654,-0.005451,-0.005690,-0.012309,0.005978,-0.009017,-0.003212,-0.014697,...,0.001279,0.020641,0.002591,0.067616,0.023154,-0.036807,0.021398,0.042167,0.002417,-0.026341
17181,0.002079,-0.041787,-0.016886,-0.008916,-0.018061,-0.020858,0.006685,-0.006928,-0.017611,0.010783,...,0.004074,0.018851,-0.005441,-0.035653,-0.006330,-0.000426,-0.004705,-0.023710,-0.016259,-0.013883
17182,0.091355,-0.055904,-0.008474,-0.017560,-0.002191,-0.013550,-0.012348,-0.003712,-0.017073,-0.010724,...,0.004458,0.015667,0.048468,-0.016216,-0.055463,-0.003743,-0.037074,-0.027250,0.004441,-0.004808


In [49]:
y_train

Unnamed: 0,label
0,1
1,0
2,1
3,0
4,1
...,...
17179,0
17180,0
17181,1
17182,1


Principal Component Analysis for test_set

In [36]:
features_names = [str(i) for i in range(0, 5000)]
features = test_set.loc[:, features_names]
pca = PCA(n_components = 500) # reduce the number of dimensions/features per datapoint to just 500
principalComponents = pca.fit_transform(features)
test_set_features = pd.DataFrame(data = principalComponents)
test_set_features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,-0.084340,0.004173,-0.015319,-0.021366,-0.010358,-0.014652,-0.026681,-0.016994,-0.028899,0.014583,...,0.002951,-0.014243,0.017159,-0.019520,-0.008874,-0.004951,-0.053947,0.014125,0.002412,0.028919
1,-0.076376,0.004489,0.002737,-0.005390,0.009629,-0.018248,-0.051312,-0.038985,-0.026563,-0.063067,...,0.023495,-0.002788,-0.001928,0.030939,0.009895,0.009982,0.020064,0.006486,-0.001268,-0.007082
2,0.166267,0.046473,-0.040004,-0.023973,-0.048641,0.028540,0.001565,-0.008240,-0.022018,0.003369,...,0.017628,-0.004600,-0.009961,0.031258,-0.003091,0.019574,0.022882,-0.009330,-0.020629,-0.038298
3,0.018457,0.182015,-0.004194,-0.011861,-0.057109,-0.038041,-0.013645,-0.041278,0.026412,-0.046098,...,-0.003559,-0.010726,0.004512,-0.007440,-0.014645,-0.003248,-0.009598,-0.005175,0.007516,0.004010
4,0.111689,-0.114309,0.039659,-0.004919,-0.023663,-0.113257,-0.119484,-0.171374,0.190625,-0.103496,...,-0.001603,-0.016680,0.036279,-0.009484,-0.032463,0.003783,-0.019534,-0.018490,0.032453,-0.007474
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4291,-0.078148,-0.012918,-0.021844,-0.017542,-0.010100,0.009124,0.032984,-0.010705,0.008925,-0.006379,...,-0.016900,0.020665,0.002850,-0.016364,-0.013699,0.009235,0.005940,-0.005581,-0.028452,0.042904
4292,0.051254,-0.062060,-0.013449,-0.003496,-0.022913,-0.011120,-0.025785,0.058009,-0.088495,0.098986,...,0.006708,-0.012454,-0.005344,-0.018606,0.040071,0.023253,-0.029817,0.009992,0.026537,-0.031569
4293,0.026759,-0.057289,-0.014442,-0.028254,-0.009941,-0.004797,-0.005752,0.033910,-0.077364,0.064250,...,-0.018858,0.011231,-0.001483,-0.025915,-0.027026,0.006852,0.009231,0.003352,0.007667,-0.006324
4294,0.207765,-0.128784,-0.020473,-0.014797,-0.018695,-0.004086,-0.019869,0.003138,-0.039710,0.023617,...,-0.000466,-0.034211,0.007368,0.025521,-0.013661,-0.010623,-0.004763,0.031829,-0.031044,-0.017654


In [None]:
# # prepare X_train and y_train
# X_train = train_set.drop(['id', 'label'], axis = 1) # X_train contains only the feature columns of train_set
# X_train.insert(loc = len(X_train.columns), column = 'b', value = 1) # insert a column of 1's for the offset/b
# y_train = train_set.drop(['id'], axis = 1).get(['label']) # y_train contains only the label column of train_set

# # prepare X_test
# X_test = test_set.drop(['id'], axis = 1) # X_test contains only the feature columns of test_set

In [None]:
# X = train_set.drop(['id', 'label'], axis = 1) # X_train contains only the feature columns of train_set
# y = train_set.get(['label']) # y_train contains only the label column of train_set

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20) # Train a SVC model using different kernel

Tuning the hyper-parameters

In [51]:
hyper_parameters = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001], 'kernel': ['linear', 'poly', 'sigmoid', 'rbf']} # initialise the hyper-parameters
kfold = StratifiedKFold(n_splits = 3, shuffle = True, random_state = 0) # for 3-fold cross validation
# grid = GridSearchCV(SVC(), hyper_parameters, refit = True, verbose = 2) # cretae a GridSearchCV object to git to the taining data
grid = GridSearchCV(SVC(), param_grid = hyper_parameters, scoring = 'accuracy', refit = 'accuracy', n_jobs = -1, cv = kfold, verbose = 2)
grid.fit(X_train, np.ravel(y_train))
print(grid.best_params_) # gets the best hyper-parameters for SVM

Fitting 3 folds for each of 64 candidates, totalling 192 fits
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=  56.0s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=  51.1s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=  52.4s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=  58.3s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=  55.2s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=  54.8s
[CV] END .....................C=0.1, gamma=1, kernel=sigmoid; total time=  48.0s
[CV] END .....................C=0.1, gamma=1, kernel=sigmoid; total time=  48.3s
[CV] END .....................C=0.1, gamma=1, kernel=sigmoid; total time=  51.3s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time= 1.2min
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time= 1.1min
[CV] END .........................C=0.1, gamma=

Training the model using the best estimators and hyper-parameters

In [11]:
grid_predicted = grid.predict(X_train)
print(classification_report(y_test, grid_predicted)) # output the performance of the model with the best estimators and hyper-parameters

              precision    recall  f1-score   support

           0       0.72      0.86      0.78      2131
           1       0.67      0.45      0.53      1306

    accuracy                           0.70      3437
   macro avg       0.69      0.65      0.66      3437
weighted avg       0.70      0.70      0.69      3437



In [52]:
svc_model = SVC(kernel = "rbf", C = 1, gamma = 1).fit(X_train, np.ravel(y_train)) # using the best hyper-parameters, train the SVC model
# y_predicted = svc_model.predict(X_test) # evaluate the SVC model
# print(f"Evaulation of the SVC with the best hyper-parameters:")
# print(classification_report(y_test, y_predicted))
# print(f'The accuracy score of the model is {svc_model.score(X_test, y_test):.4f}')

In [53]:
y_predicted = svc_model.predict(test_set_features)
y_predicted = pd.DataFrame(y_predicted, columns = ['label']) # convet y_predicted from nparray to pandas dataframe
y_predicted.insert(loc = 0, column = 'id', value = [i for i in range(17185, 17185 + 4296)]) # insert a column of the ids, starting from 17185
y_predicted.to_csv('skynet_submission.csv', index = False) # output the predicted labels to ./skynet_submission.csv