# Support Vector Machine Learning Model (95% variance - F1 Macro)

Import required packages

In [4]:
import numpy as np # for multi-dimensional array operations
import pandas as pd # for reading data from .csv files
from sklearn.svm import SVC # for support vector machine model
from sklearn.decomposition import PCA # for principle component analysis (dimensionality reduction)
from sklearn.model_selection import train_test_split # for splitting the dataset into training and testing sets
from sklearn.metrics import classification_report
# explicitly require this experimental feature
from sklearn.experimental import enable_halving_search_cv # noqa
from sklearn.model_selection import GridSearchCV, StratifiedKFold, HalvingGridSearchCV # for getting the best hyper parameters
from sklearn.preprocessing import MinMaxScaler # for scaling of data before PCA

Assign the training set and testing set to variables for easy reference

In [5]:
train_set = pd.read_csv('./train_tfidf_features.csv') # import the training set
test_set = pd.read_csv('./test_tfidf_features.csv') # import the testing set

Principal Component Analysis for train_set (95% variance)

In [6]:
train_set_label = train_set.loc[:, ["label"]]
features_names = [str(i) for i in range(0, 5000)]
train_set_features = train_set.loc[:, features_names] # train_set_features will not contain the label and id columns

# scale the dataset before PCA
scaler = MinMaxScaler()
train_set_rescaled = scaler.fit_transform(train_set_features)

# perform PCA
pca = PCA(n_components = 0.945)
train_set_reduced = pca.fit_transform(train_set_features)
train_set_reduced = pd.DataFrame(data = train_set_reduced)
train_set_reduced

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3809,3810,3811,3812,3813,3814,3815,3816,3817,3818
0,-0.089045,0.048751,-0.029639,-0.016700,0.000385,0.014316,-0.005539,-0.020306,0.004719,0.009897,...,-0.026000,-0.002195,0.000389,0.002635,0.008486,0.018608,0.005082,-0.015738,-0.018002,-0.016743
1,0.136756,0.266231,0.084889,0.031603,-0.018211,0.004293,-0.024719,0.006229,0.009211,0.000019,...,0.012922,0.008160,-0.002299,0.019190,0.014701,-0.013898,0.012192,0.019271,0.021400,-0.017876
2,0.104944,0.259393,0.072972,0.027429,-0.005477,0.009596,-0.037184,0.104340,-0.074970,-0.036528,...,0.009701,0.010089,0.000445,0.000213,-0.002285,0.011029,0.002397,-0.016482,-0.009815,-0.002239
3,0.001305,-0.083892,0.167686,-0.147106,0.104934,0.029242,0.031965,0.123423,-0.100359,0.018432,...,-0.005078,-0.019739,-0.013763,-0.015577,0.009132,-0.001387,0.025391,-0.005705,0.013651,0.006497
4,0.187432,-0.142138,-0.114244,-0.109001,0.015941,0.019497,-0.008227,-0.003423,-0.025579,0.007646,...,0.000259,0.012444,0.010043,0.013544,0.010150,0.003152,0.002576,-0.007717,-0.014211,0.002755
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17179,0.038280,-0.155452,0.270500,0.020913,0.117433,-0.270079,0.253125,0.000118,-0.040224,0.017788,...,-0.002001,0.009512,0.002843,0.007488,-0.004243,0.002321,0.002983,-0.009121,-0.003093,0.005471
17180,-0.048311,0.053145,-0.021072,-0.022911,-0.004067,0.008960,0.008522,-0.018385,-0.000937,-0.025450,...,0.004036,-0.001961,0.004480,0.006522,-0.001036,-0.007830,0.005539,-0.022328,-0.005332,0.003683
17181,0.001391,0.000021,-0.049626,-0.042510,-0.002429,0.018796,-0.003226,-0.021145,-0.004623,-0.006278,...,0.033061,-0.046347,0.015143,0.015281,0.030695,-0.006510,0.000177,0.032740,0.009276,0.022124
17182,0.067931,-0.057538,-0.048988,-0.025669,0.010751,-0.035416,0.051602,0.013070,0.000936,-0.017790,...,0.001461,-0.006405,0.001651,-0.001430,-0.002094,0.007946,-0.011435,0.004336,-0.003766,-0.005196


In [7]:
X = train_set_reduced
y = train_set_label

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20) # Train a SVC model using different kernel
X_train = X
y_train = y

In [8]:
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3809,3810,3811,3812,3813,3814,3815,3816,3817,3818
0,-0.089045,0.048751,-0.029639,-0.016700,0.000385,0.014316,-0.005539,-0.020306,0.004719,0.009897,...,-0.026000,-0.002195,0.000389,0.002635,0.008486,0.018608,0.005082,-0.015738,-0.018002,-0.016743
1,0.136756,0.266231,0.084889,0.031603,-0.018211,0.004293,-0.024719,0.006229,0.009211,0.000019,...,0.012922,0.008160,-0.002299,0.019190,0.014701,-0.013898,0.012192,0.019271,0.021400,-0.017876
2,0.104944,0.259393,0.072972,0.027429,-0.005477,0.009596,-0.037184,0.104340,-0.074970,-0.036528,...,0.009701,0.010089,0.000445,0.000213,-0.002285,0.011029,0.002397,-0.016482,-0.009815,-0.002239
3,0.001305,-0.083892,0.167686,-0.147106,0.104934,0.029242,0.031965,0.123423,-0.100359,0.018432,...,-0.005078,-0.019739,-0.013763,-0.015577,0.009132,-0.001387,0.025391,-0.005705,0.013651,0.006497
4,0.187432,-0.142138,-0.114244,-0.109001,0.015941,0.019497,-0.008227,-0.003423,-0.025579,0.007646,...,0.000259,0.012444,0.010043,0.013544,0.010150,0.003152,0.002576,-0.007717,-0.014211,0.002755
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17179,0.038280,-0.155452,0.270500,0.020913,0.117433,-0.270079,0.253125,0.000118,-0.040224,0.017788,...,-0.002001,0.009512,0.002843,0.007488,-0.004243,0.002321,0.002983,-0.009121,-0.003093,0.005471
17180,-0.048311,0.053145,-0.021072,-0.022911,-0.004067,0.008960,0.008522,-0.018385,-0.000937,-0.025450,...,0.004036,-0.001961,0.004480,0.006522,-0.001036,-0.007830,0.005539,-0.022328,-0.005332,0.003683
17181,0.001391,0.000021,-0.049626,-0.042510,-0.002429,0.018796,-0.003226,-0.021145,-0.004623,-0.006278,...,0.033061,-0.046347,0.015143,0.015281,0.030695,-0.006510,0.000177,0.032740,0.009276,0.022124
17182,0.067931,-0.057538,-0.048988,-0.025669,0.010751,-0.035416,0.051602,0.013070,0.000936,-0.017790,...,0.001461,-0.006405,0.001651,-0.001430,-0.002094,0.007946,-0.011435,0.004336,-0.003766,-0.005196


In [9]:
y_train

Unnamed: 0,label
0,1
1,0
2,1
3,0
4,1
...,...
17179,0
17180,0
17181,1
17182,1


Principal Component Analysis for test_set (95% variance)

In [10]:
features_names = [str(i) for i in range(0, 5000)]
test_set_features = test_set.loc[:, features_names] # test_set_features will not contain the label and id columns

# scale the dataset before PCA
test_set_rescaled = scaler.transform(test_set_features)

# perform PCA
test_set_reduced = pca.transform(test_set_rescaled) # use the pca from the train_set?
test_set_features = pd.DataFrame(data = test_set_reduced)
test_set_features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3809,3810,3811,3812,3813,3814,3815,3816,3817,3818
0,-0.081068,0.033233,-0.019995,-0.014871,-0.008964,-0.007514,-0.002996,0.007308,-0.001920,-0.002989,...,-0.010054,-0.017378,-0.012351,-0.002780,0.019733,0.018493,-0.019640,0.000990,0.011819,0.002425
1,-0.086624,0.020043,0.024042,-0.012033,-0.029328,-0.030119,-0.006524,0.128263,0.198545,0.024362,...,-0.008156,0.011752,-0.003347,-0.039868,0.009646,-0.005097,-0.006608,0.005048,-0.013436,-0.000934
2,0.105919,-0.120204,-0.035468,-0.010114,-0.089316,0.003671,-0.076981,-0.050249,-0.025736,-0.015027,...,0.003587,-0.009791,-0.004208,-0.004251,0.004671,0.002180,-0.010712,0.002535,0.000266,-0.016713
3,-0.050736,-0.078060,0.070735,0.097212,-0.109151,-0.027417,-0.073181,-0.025870,-0.029943,-0.008712,...,0.012040,0.001787,0.007088,0.010763,0.004289,-0.004951,-0.003981,-0.006241,0.002142,0.001409
4,0.093150,-0.070280,-0.081980,-0.054623,0.070388,0.045546,0.004242,0.050559,0.002602,0.026238,...,0.009119,0.002461,-0.007150,-0.007980,-0.005396,-0.007949,-0.008624,0.002170,-0.000548,-0.001819
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4291,-0.049925,0.070971,-0.023278,-0.018502,-0.008462,0.008088,-0.002524,-0.026954,0.011244,-0.003809,...,-0.013382,-0.000878,-0.004948,0.001110,0.002738,0.005927,0.004945,-0.003275,0.006430,0.004915
4292,0.040870,-0.063082,-0.038621,-0.079293,0.025796,-0.006069,0.020070,-0.014328,0.026219,-0.054578,...,-0.043663,-0.008128,-0.042532,-0.010369,-0.026542,0.021137,0.010015,0.015955,-0.014187,0.005504
4293,0.031062,-0.029394,-0.058545,-0.063366,0.013945,-0.006945,0.020064,-0.032653,0.029020,-0.043405,...,-0.000809,-0.005675,0.018637,-0.003693,-0.012020,0.001083,0.027845,0.001219,-0.010213,-0.008297
4294,0.161894,-0.118229,-0.104541,-0.091072,0.017476,0.020430,-0.007549,-0.013663,-0.016050,-0.001027,...,0.031210,-0.026991,0.043521,-0.044251,-0.041449,-0.001702,-0.017706,0.063675,0.011047,-0.035621


Tuning the hyper-parameters and training the model based on the best hyper-parameters

In [11]:
# hyper_parameters = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001], 'kernel': ['linear', 'poly', 'sigmoid', 'rbf']} # initialise the hyper-parameters
hyper_parameters = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001], 'kernel': ['rbf']} # initialise the hyper-parameters
kfold = StratifiedKFold(n_splits = 3, shuffle = True, random_state = 0) # for 3-fold cross validation
# grid = GridSearchCV(SVC(), hyper_parameters, refit = True, verbose = 2) # cretae a GridSearchCV object to git to the taining data
grid = HalvingGridSearchCV(SVC(), param_grid = hyper_parameters, scoring = 'f1_macro', refit = 'f1_macro', n_jobs = 1 , cv = kfold, verbose = 2)
grid.fit(X_train, np.ravel(y_train)) # training the model using the best hyper-parameters
print(grid.best_params_) # gets the best hyper-parameters for SVM

n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 1909
max_resources_: 17184
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 16
n_resources: 1909
Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   2.8s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   2.7s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   3.0s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   2.6s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   2.6s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   2.6s
[CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time=   2.6s
[CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time=   2.6s
[CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time=   2.5s
[CV] EN

Predicting the labels for the test dataset based on the model with the best hyper-parameters

In [12]:
y_predicted = grid.predict(test_set_features)

In [13]:
# y_predicted = svc_model.predict(test_set_features)
y_predicted = pd.DataFrame(y_predicted, columns = ['label']) # convert y_predicted from nparray to pandas dataframe
y_predicted.insert(loc = 0, column = 'id', value = [i for i in range(17185, 17185 + 4296)]) # insert a column of the ids, starting from 17185
y_predicted.to_csv('skynet_submission_scaled_0955_f1macro.csv', index = False) # output the predicted labels to ./skynet_submission_95_f1macro.csv