In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold

In [4]:
#Reading train data
train_df = pd.read_csv('CE802_Ass_2018_Data.csv')
train_df.head()
labels = np.array(train_df['Class'])
features = train_df.drop("Class", axis = 1)
features = np.array(features)

In [5]:
#Reading test data
test_df = pd.read_csv('CE802_Ass_2018_Test.csv')
test_df_features = test_df.drop('Class', axis = 1)
#test_df_features = np.array(test_features)

In [6]:
#Splitting the data for the evaluation of models
from sklearn.model_selection import train_test_split
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.25, random_state = 77)

In [7]:
#### Evaluation of different modules ###
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
def evaluation(clf, test_features, test_labels):
    preds = clf.predict(test_features)
    conf_mat_RF = pd.crosstab(test_labels, preds, rownames=['Actual Class'], colnames=['Predicted Class'])
    print(conf_mat_RF)
    Accuracy_Score = accuracy_score(test_labels, preds)
    Precision_Score = precision_score(test_labels, preds,  average="macro")
    Recall_Score = recall_score(test_labels, preds,  average="macro")
    F1_Score = f1_score(test_labels, preds,  average="macro")
    print('Average Accuracy: %0.2f +/- (%0.1f) %%' % (Accuracy_Score.mean()*100, Accuracy_Score.std()*100))
    print('Average Precision: %0.2f +/- (%0.1f) %%' % (Precision_Score.mean()*100, Precision_Score.std()*100))
    print('Average Recall: %0.2f +/- (%0.1f) %%' % (Recall_Score.mean()*100, Recall_Score.std()*100))
    print('Average F1-Score: %0.2f +/- (%0.1f) %%' % (F1_Score.mean()*100, F1_Score.std()*100))

In [None]:
from sklearn import svm
clf = svm.SVC()

Cs = np.logspace(-1, 3, 9)
Gs = np.logspace(-7, -0, 8)
k_fold = KFold(n_splits=10)
clf = GridSearchCV(estimator=clf, param_grid=dict(C=Cs, gamma=Gs), n_jobs=-1)
score = []
for train_indices, test_indices in k_fold.split(features):
    clf.fit(features[train_indices], labels[train_indices])
    score.append(clf.score(features[test_indices], labels[test_indices]))
print('Average accuracy:', np.mean(score))
print(clf.best_params_)




In [66]:
clf = svm.SVC(C = 1000.0, gamma = 0.0001)

In [67]:
clf.fit(train_features, train_labels)

SVC(C=1000.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [68]:
evaluation(clf, test_features, test_labels)

Predicted Class  False  True 
Actual Class                 
False              169     29
True                37    140
Average Accuracy: 82.40 +/- (0.0) %
Average Precision: 82.44 +/- (0.0) %
Average Recall: 82.22 +/- (0.0) %
Average F1-Score: 82.29 +/- (0.0) %


In [69]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()
parm_grid = {"criterion": ["gini","entropy"],
              "min_samples_split": [2, 10, 20],
              "max_depth": [2, 5, 10],
              "min_samples_leaf": [1, 5, 10],
              "max_leaf_nodes": [None, 5, 10, 20],
              }

clf = GridSearchCV(estimator = clf, param_grid = parm_grid, n_jobs = -1)
k_fold = KFold(n_splits = 10)
score = []
for train_indices, test_indices in k_fold.split(features):
    clf.fit(features[train_indices], labels[train_indices])
    score.append(clf.score(features[test_indices], labels[test_indices]))
print('Average accuracy:', np.mean(score), )
print(clf.best_params_)



Average accuracy: 0.7606666666666667
{'criterion': 'gini', 'max_depth': 10, 'max_leaf_nodes': None, 'min_samples_leaf': 10, 'min_samples_split': 2}


In [70]:
clf = DecisionTreeClassifier(criterion = "gini", max_depth = 10, max_leaf_nodes = None, min_samples_leaf = 10, min_samples_split = 2)

In [71]:
clf.fit(train_features, train_labels)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [72]:
evaluation(clf, test_features, test_labels)

Predicted Class  False  True 
Actual Class                 
False              158     40
True                39    138
Average Accuracy: 78.93 +/- (0.0) %
Average Precision: 78.87 +/- (0.0) %
Average Recall: 78.88 +/- (0.0) %
Average F1-Score: 78.87 +/- (0.0) %


In [73]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
scaler = StandardScaler()
  
scaler.fit(train_features)

train_features_norm = scaler.transform(train_features)  
test_features_norm = scaler.transform(test_features)

k_range = list(range(1, 40))
parm_grid = dict(n_neighbors = k_range)
clf = KNeighborsClassifier()
k_fold = KFold(n_splits=10)
clf = GridSearchCV(estimator=clf, param_grid= parm_grid, n_jobs=-1)
score = []
for train_indices, test_indices in k_fold.split(features):
    clf.fit(features[train_indices], labels[train_indices])
    score.append(clf.score(features[test_indices], labels[test_indices]))
print('Average accuracy:', np.mean(score))
print(clf.best_params_)



Average accuracy: 0.652
{'n_neighbors': 37}


In [74]:
clf = KNeighborsClassifier(n_neighbors = 37)

In [75]:
clf.fit(train_features, train_labels)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=37, p=2,
           weights='uniform')

In [76]:
evaluation(clf, test_features, test_labels)

Predicted Class  False  True 
Actual Class                 
False              122     76
True                72    105
Average Accuracy: 60.53 +/- (0.0) %
Average Precision: 60.45 +/- (0.0) %
Average Recall: 60.47 +/- (0.0) %
Average F1-Score: 60.45 +/- (0.0) %


In [85]:
clf = svm.SVC(C = 1000.0, gamma = 0.0001)
clf.fit(train_features, train_labels)
final_labels = clf.predict(test_df_features)
#final_labels.head()
test_df_features['Class'] = final_labels

In [86]:
test_df_features.head()

Unnamed: 0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,Class
0,-0.53,-0.79,0.29,-0.64,2.08,-1.11,10.0,6.1,5.2,155.0,215.0,15.0,11.2,85.0,False
1,3.69,2.63,2.93,3.48,2.89,0.48,1.0,0.04,0.25,6.0,5.0,4.0,1.4,16.0,True
2,-2.63,2.74,-0.63,-0.16,-1.94,0.45,10.0,7.4,7.7,130.0,110.0,5.0,6.25,-20.0,False
3,-2.06,-2.67,0.83,-2.87,-0.91,1.48,10.0,6.65,9.2,50.0,320.0,5.0,12.0,-60.0,False
4,3.28,-0.61,-0.75,-0.68,-0.09,3.32,10.0,6.5,5.65,230.0,30.0,15.0,8.2,-5.0,False


In [88]:
test_df_features.to_csv('final.csv')