In [4]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import neighbors
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from collections import Counter


In [5]:
df = pd.read_csv('train_PTIT.csv')
df

Unnamed: 0,w_0,w_1,w_2,w_3,w_4,w_5,w_6,w_7,w_8,w_9,...,w_1424,w_1425,w_1426,w_1427,w_1428,w_1429,w_1430,w_1431,w_1432,subject
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Probabilistic_Methods
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Neural_Networks
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,Genetic_Algorithms
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Rule_Learning
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Genetic_Algorithms
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2432,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Genetic_Algorithms
2433,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Theory
2434,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Probabilistic_Methods
2435,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Neural_Networks


In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, : -1], df.iloc[:, -1], test_size = 0.3, random_state = 29
)

### KNN

In [7]:
clf = neighbors.KNeighborsClassifier(n_neighbors = 5, weights = 'distance', metric='euclidean')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("KNN accuracy: ", accuracy_score(y_test, y_pred))
print("KNN f1 score: ", f1_score(y_test, y_pred, average='weighted'))
print("KNN confusion matrix: ")
print(confusion_matrix(y_test, y_pred))

KNN accuracy:  0.4959016393442623
KNN f1 score:  0.4735563016648353
KNN confusion matrix: 
[[ 25   3  35   4   2   0   6]
 [  1  80  22   2   0   0   7]
 [  5  29 152  15   1   2  15]
 [  1  14  36  60   0   1  14]
 [  1   5  42   2   8   0   4]
 [  5   4  18   2   2   6   9]
 [  3  12  34   1   5   5  32]]


### Logistic Regression

In [8]:
rg = LogisticRegression()
rg.fit(X_train, y_train)
y_predict = rg.predict(X_test)
score = accuracy_score(y_predict, y_test)
print("LR classification accuracy : ", score)
cm = confusion_matrix(y_test, y_predict)
print(cm)
print("LR f1 score", f1_score(y_test, y_predict, average="macro"))


LR classification accuracy :  0.7349726775956285
[[ 48   1  10   2   2   3   9]
 [  3  85  14   2   4   3   1]
 [  5   3 184  13   3   1  10]
 [  4   0  22  91   1   0   8]
 [  3   3  10   1  38   0   7]
 [ 10   0   2   1   0  28   5]
 [  5   0  18   3   0   2  64]]
LR f1 score 0.715603846262539


### SVM

In [10]:
clf = SVC(kernel = 'linear')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("SVM accuracy: ", accuracy_score(y_test, y_pred))
print("SVM f1 score: ", f1_score(y_test, y_pred, average='weighted'))
print("SVM confusion matrix: ")
print(confusion_matrix(y_test, y_pred))

SVM accuracy:  0.6857923497267759
SVM f1 score:  0.6867492289346998
SVM confusion matrix: 
[[ 50   3   8   2   4   3   5]
 [  4  84  10   4   6   1   3]
 [  9   6 165  17   5   5  12]
 [  5   0  26  85   1   1   8]
 [  3   4   9   2  38   1   5]
 [ 12   1   1   1   0  25   6]
 [  6   2  18   3   2   6  55]]


In [13]:
# turning hyperparameter for svm
from sklearn.model_selection import GridSearchCV

# defining parameter range
param_grid = {'C': [0.1, 1, 10],
                'gamma': [1, 0.1, 0.01],
                'kernel': ['rbf', 'linear', 'poly']}

grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)

# fitting the model for grid search
grid.fit(X_train, y_train)

# print best parameter after tuning
print(grid.best_params_)
# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)
# print accuracy score

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.308 total time=   6.1s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.308 total time=   5.8s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.305 total time=   5.0s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.305 total time=   4.8s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.305 total time=   4.3s
[CV 1/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.760 total time=   2.1s
[CV 2/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.751 total time=   2.0s
[CV 3/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.771 total time=   2.0s
[CV 4/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.748 total time=   1.9s
[CV 5/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.716 total time=   2.1s
[CV 1/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.545 total time=   3.7s
[CV 2/5] END .......C=0.1, gamma=1, kernel=poly

In [14]:
print("SVM accuracy: ", accuracy_score(y_test, grid.predict(X_test)))
print("SVM f1 score: ", f1_score(y_test, grid.predict(X_test), average='weighted'))
print("SVM confusion matrix: ")
print(confusion_matrix(y_test, grid.predict(X_test)))

SVM accuracy:  0.7281420765027322
SVM f1 score:  0.7277286225143963
SVM confusion matrix: 
[[ 48   1  14   1   2   2   7]
 [  2  85  16   2   4   2   1]
 [  4   3 188  11   1   1  11]
 [  3   1  26  89   2   1   4]
 [  4   2   9   1  40   0   6]
 [ 11   0   6   0   0  24   5]
 [  3   0  22   6   1   1  59]]


### Random Forests

In [15]:
from sklearn.ensemble import RandomForestClassifier  
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score


In [18]:
clf = RandomForestClassifier(n_estimators = 100, random_state = 5)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("RF accuracy: ", accuracy_score(y_test, y_pred))
print("RF f1 score: ", f1_score(y_test, y_pred, average='weighted'))
print("RF confusion matrix: ")
print(confusion_matrix(y_test, y_pred))

RF accuracy:  0.7336065573770492
RF f1 score:  0.7320735257355077
RF confusion matrix: 
[[ 48   5  12   2   1   1   6]
 [  1  90  14   2   1   0   4]
 [  5   3 188  12   0   1  10]
 [  3   0  22  93   0   2   6]
 [  4   3  14   1  38   0   2]
 [  4   1   9   0   0  26   6]
 [  3   3  19  10   0   3  54]]


In [19]:
# hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [100,200,300]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
                'max_features': max_features,
                'min_samples_split': min_samples_split}

# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = clf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

# Fit the random search model
rf_random.fit(X_train, y_train)



Fitting 3 folds for each of 18 candidates, totalling 54 fits


  warn(


In [20]:
# print best parameter after tuning
print(rf_random.best_params_)
# print how our model looks after hyper-parameter tuning
print(rf_random.best_estimator_)
# print accuracy score
print("RF accuracy: ", accuracy_score(y_test, rf_random.predict(X_test)))
print("RF f1 score: ", f1_score(y_test, rf_random.predict(X_test), average='weighted'))
print("RF confusion matrix: ")
print(confusion_matrix(y_test, rf_random.predict(X_test)))

{'n_estimators': 300, 'min_samples_split': 2, 'max_features': 'auto'}
RandomForestClassifier(max_features='auto', n_estimators=300, random_state=5)
RF accuracy:  0.7377049180327869
RF f1 score:  0.7368684009953421
RF confusion matrix: 
[[ 47   3  13   2   2   1   7]
 [  1  89  17   1   1   0   3]
 [  5   2 187  13   0   1  11]
 [  2   0  23  95   0   2   4]
 [  4   3  13   2  36   0   4]
 [  4   0   9   1   0  26   6]
 [  3   2  17   8   0   2  60]]
