# Support Vector Machine Learning Model (95% variance)

Import required packages

In [None]:
import numpy as np # for multi-dimensional array operations
import pandas as pd # for reading data from .csv files
from sklearn.svm import SVC # for support vector machine model
from sklearn.decomposition import PCA # for principle component analysis (dimensionality reduction)
from sklearn.model_selection import train_test_split # for splitting the dataset into training and testing sets
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, StratifiedKFold # for getting the best hyper parameters
from sklearn.preprocessing import MinMaxScaler # for scaling of data before PCA

Assign the training set and testing set to variables for easy reference

In [None]:
train_set = pd.read_csv('../../Training and Testing sets/train_tfidf_features.csv') # import the training set
test_set = pd.read_csv('../../Training and Testing sets/test_tfidf_features.csv') # import the testing set

Principal Component Analysis for train_set (95% variance)

In [None]:
train_set_label = train_set.loc[:, ["label"]]
features_names = [str(i) for i in range(0, 5000)]
train_set_features = train_set.loc[:, features_names] # train_set_features will not contain the label and id columns
test_set_features = test_set.loc[:, features_names] # test_set_features will not contain the label and id columns

train_set_features.shape

(17184, 5000)

In [None]:
# combine test and train
frames = [train_set_features,test_set_features]
to_reduce = pd.concat(frames)

# scale the dataset before PCA
scaler = MinMaxScaler()
traintest_to_reduce = scaler.fit_transform(to_reduce)

# perform PCA
pca = PCA(n_components = 0.95)
train_test_reduced = pca.fit_transform(traintest_to_reduce)
train_test_reduced = pd.DataFrame(data = traintest_to_reduce)
train_test_reduced

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21475,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21476,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21477,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
X_train = train_test_reduced.iloc[0:17184,:]
Y_train = train_set_label

X_test = train_test_reduced.iloc[17184:21480,:]

print(X_train.shape)
print(X_train.head(5))

print(Y_train.shape)
print(Y_train.head(5))

print(X_test.shape)
print(X_test.head(5))

(17184, 5000)
   0     1     2     3     4     5     6     7     8     9     ...  4990  \
0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
1   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
2   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
3   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
4   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   

   4991  4992  4993  4994  4995  4996  4997  4998  4999  
0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
1   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
2   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
3   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
4   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  

[5 rows x 5000 columns]
(17184, 1)
   label
0      1
1      0
2      1
3      0
4      1
(4296, 5000)
       0     1     2     3     4     5     6     7     8     9     ...  4990  

Principal Component Analysis for test_set (95% variance)

Tuning the hyper-parameters and training the model based on the best hyper-parameters

In [15]:
# hyper_parameters = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001], 'kernel': ['linear', 'poly', 'sigmoid', 'rbf']} # initialise the hyper-parameters
hyper_parameters = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001], 'kernel': ['rbf']} # initialise the hyper-parameters
kfold = StratifiedKFold(n_splits = 3, shuffle = True, random_state = 0) # for 3-fold cross validation
# grid = GridSearchCV(SVC(), hyper_parameters, refit = True, verbose = 2) # cretae a GridSearchCV object to git to the taining data
grid = GridSearchCV(SVC(), param_grid = hyper_parameters, scoring = 'f1', refit = 'accuracy', n_jobs = 1 , cv = kfold, verbose = 2)
grid.fit(X_train, np.ravel(Y_train)) # training the model using the best hyper-parameters
print(grid.best_params_) # gets the best hyper-parameters for SVM

Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=15.8min
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=16.2min
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=12.3min
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time= 8.6min
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time= 8.8min
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time= 8.8min
[CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time= 8.5min
[CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time= 8.5min
[CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time= 8.5min
[CV] END .....................C=0.1, gamma=0.001, kernel=rbf; total time=52.8min
[CV] END ....................C=0.1, gamma=0.001, kernel=rbf; total time=631.5min
[CV] END .....................C=0.1, gamma=0.001

Predicting the labels for the test dataset based on the model with the best hyper-parameters

In [16]:
y_predicted = grid.predict(X_test)

In [19]:
# y_predicted = svc_model.predict(test_set_features)
y_predicted = pd.DataFrame(y_predicted, columns = ['label']) # convert y_predicted from nparray to pandas dataframe
y_predicted.insert(loc = 0, column = 'id', value = [i for i in range(17185, 17185 + 4296)]) # insert a column of the ids, starting from 17185
y_predicted.to_csv('skynet_submission.csv', index = False) # output the predicted labels to ./skynet_submission.csv

# Support Vector Machine Learning Model (90% variance)

Import required packages

In [2]:
import numpy as np # for multi-dimensional array operations
import pandas as pd # for reading data from .csv files
from sklearn.svm import SVC # for support vector machine model
from sklearn.decomposition import PCA # for principle component analysis (dimensionality reduction)
from sklearn.model_selection import train_test_split # for splitting the dataset into training and testing sets
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, StratifiedKFold # for getting the best hyper parameters
from sklearn.preprocessing import MinMaxScaler # for scaling of data before PCA

Assign the training set and testing set to variables for easy reference

In [3]:
train_set = pd.read_csv('../../Training and Testing sets/train_tfidf_features.csv') # import the training set
test_set = pd.read_csv('../../Training and Testing sets/test_tfidf_features.csv') # import the testing set

Principal Component Analysis for train_set (95% variance)

In [4]:
train_set_label = train_set.loc[:, ["label"]]
features_names = [str(i) for i in range(0, 5000)]
train_set_features = train_set.loc[:, features_names] # train_set_features will not contain the label and id columns
test_set_features = test_set.loc[:, features_names] # test_set_features will not contain the label and id columns

train_set_features.shape

(17184, 5000)

In [5]:
# combine test and train
frames = [train_set_features,test_set_features]
to_reduce = pd.concat(frames)

# perform PCA
pca = PCA(n_components = 0.90)
train_test_reduced = pca.fit_transform(to_reduce)
train_test_reduced = pd.DataFrame(data = to_reduce)
train_test_reduced

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4291,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4292,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4293,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4294,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
X_train = train_test_reduced.iloc[0:17184,:]
Y_train = train_set_label

X_test = train_test_reduced.iloc[17184:21480,:]

print(X_train.shape)
print(X_train.head(5))

print(Y_train.shape)
print(Y_train.head(5))

print(X_test.shape)
print(X_test.head(5))

(17184, 5000)
     0    1    2    3    4    5    6    7    8    9  ...  4990  4991  4992  \
0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
2  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
3  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
4  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   

   4993  4994  4995  4996  4997  4998  4999  
0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
1   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
2   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
3   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
4   0.0   0.0   0.0   0.0   0.0   0.0   0.0  

[5 rows x 5000 columns]
(17184, 1)
   label
0      1
1      0
2      1
3      0
4      1
(4296, 5000)
     0    1    2    3    4    5    6    7    8    9  ...  4990  4991  4992  \
0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0

Principal Component Analysis for test_set (95% variance)

Tuning the hyper-parameters and training the model based on the best hyper-parameters

In [7]:
# hyper_parameters = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001], 'kernel': ['linear', 'poly', 'sigmoid', 'rbf']} # initialise the hyper-parameters
hyper_parameters = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001], 'kernel': ['rbf']} # initialise the hyper-parameters
kfold = StratifiedKFold(n_splits = 3, shuffle = True, random_state = 0) # for 3-fold cross validation
# grid = GridSearchCV(SVC(), hyper_parameters, refit = True, verbose = 2) # cretae a GridSearchCV object to git to the taining data
grid = GridSearchCV(SVC(), param_grid = hyper_parameters, scoring = 'f1_macro', refit = 'f1_macro', n_jobs = 1 , cv = kfold, verbose = 2)
grid.fit(X_train, np.ravel(Y_train)) # training the model using the best hyper-parameters
print(grid.best_params_) # gets the best hyper-parameters for SVM

Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time= 9.9min


Predicting the labels for the test dataset based on the model with the best hyper-parameters

In [None]:
y_predicted = grid.predict(X_test)

In [None]:
# y_predicted = svc_model.predict(test_set_features)
y_predicted = pd.DataFrame(y_predicted, columns = ['label']) # convert y_predicted from nparray to pandas dataframe
y_predicted.insert(loc = 0, column = 'id', value = [i for i in range(17185, 17185 + 4296)]) # insert a column of the ids, starting from 17185
y_predicted.to_csv('skynet_submission.csv', index = False) # output the predicted labels to ./skynet_submission.csv