In [35]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, recall_score,precision_score, classification_report, confusion_matrix
import collections
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_recall_curve, roc_curve
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import label_binarize

np.random.seed(1337)  # for reproducibility

In [36]:

def load_dataset(file_train, file_validate, file_test):
    df_train = pd.read_excel(file_train,header=None)

    X_train, Y_train = format_data(df_train)

    df_test = pd.read_excel(file_test,header=None)

    X_test, Y_test = format_data(df_test)



    df_validate = pd.read_excel(file_validate,header=None)

    X_validate, Y_validate = format_data(df_validate)



    return X_train, Y_train, X_validate, Y_validate, X_test, Y_test


def format_data(df):
    X = np.array(df.drop(df.columns[-1], 1))


    Y = np.array(df[df.columns[-1]])

    
    Y=Y-1

    return X,Y


In [37]:
train_set_file = 'trainset_P135_60.xls'

validate_set_file = 'cvset_P12345_20.xls'

test_set_file = 'testset_P12345_20.xls'


X_train, Y_train, X_validate, Y_validate, X_test, Y_test = load_dataset(train_set_file, validate_set_file , test_set_file)



In [38]:
print(X_train.shape)
print(Y_train.shape)

print(X_test.shape)
print(Y_test.shape)

print(X_validate.shape)
print(Y_validate.shape)

(19776, 42)
(19776,)
(7040, 42)
(7040,)
(7040, 42)
(7040,)


In [39]:
no_of_classes=np.unique(Y_train).size
no_of_classes

8

In [40]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_validate = scaler.fit_transform(X_validate)
X_test = scaler.fit_transform(X_test)

In [8]:
param_grid={'C': [0.1, 1, 100, 1000],'gamma': ['scale','auto', 0.00001, 0.0001, 0.001,0.01,0.1,1,5,10]}

In [9]:
svm_clf=SVC(kernel='rbf')

In [13]:
grid_search = GridSearchCV(svm_clf, param_grid, scoring='accuracy', cv = 5, verbose=5,return_train_score=True)

In [14]:
grid_search.fit(X_train,Y_train)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV] C=0.1, gamma=scale ..............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  C=0.1, gamma=scale, score=(train=0.981, test=0.896), total=  10.8s
[CV] C=0.1, gamma=scale ..............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   29.2s remaining:    0.0s


[CV]  C=0.1, gamma=scale, score=(train=0.982, test=0.977), total=  15.4s
[CV] C=0.1, gamma=scale ..............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.0min remaining:    0.0s


[CV]  C=0.1, gamma=scale, score=(train=0.982, test=0.969), total=  17.1s
[CV] C=0.1, gamma=scale ..............................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.6min remaining:    0.0s


[CV]  C=0.1, gamma=scale, score=(train=0.981, test=0.977), total=  13.3s
[CV] C=0.1, gamma=scale ..............................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  2.1min remaining:    0.0s


[CV]  C=0.1, gamma=scale, score=(train=0.987, test=0.896), total=  10.0s
[CV] C=0.1, gamma=auto ...............................................
[CV]  C=0.1, gamma=auto, score=(train=0.981, test=0.897), total=  12.8s
[CV] C=0.1, gamma=auto ...............................................
[CV]  C=0.1, gamma=auto, score=(train=0.982, test=0.977), total=  12.5s
[CV] C=0.1, gamma=auto ...............................................
[CV]  C=0.1, gamma=auto, score=(train=0.982, test=0.969), total=  15.8s
[CV] C=0.1, gamma=auto ...............................................
[CV]  C=0.1, gamma=auto, score=(train=0.981, test=0.977), total=  10.3s
[CV] C=0.1, gamma=auto ...............................................
[CV]  C=0.1, gamma=auto, score=(train=0.987, test=0.896), total=  10.0s
[CV] C=0.1, gamma=1e-05 ..............................................
[CV]  C=0.1, gamma=1e-05, score=(train=0.126, test=0.126), total= 1.6min
[CV] C=0.1, gamma=1e-05 ............................................

[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed: 278.8min finished


GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': [0.1, 1, 100, 1000],
                         'gamma': ['scale', 'auto', 1e-05, 0.0001, 0.001, 0.01,
                                   0.1, 1, 5, 10]},
             return_train_score=True, scoring='accuracy', verbose=5)

In [23]:
X_train=np.concatenate((X_train, X_validate), axis=0)

In [24]:
Y_train=np.concatenate((Y_train, Y_validate), axis=0)

In [15]:
grid_search.best_score_

0.9794699596957175

In [16]:
grid_search.best_params_

{'C': 1000, 'gamma': 'scale'}

In [41]:
best_svm_clf = SVC(kernel = 'rbf', C =1000, gamma= 'scale', class_weight = 'balanced', random_state = 0)

In [42]:
best_svm_clf.fit(X_train, Y_train)

SVC(C=1000, class_weight='balanced', random_state=0)

In [50]:
Y_predict = best_svm_clf.predict(X_test)

In [51]:
accuracy_score(Y_test, Y_predict)

0.9725852272727272

In [52]:
accuracy_score(Y_test, Y_predict, normalize=False)

6847