In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import Normalizer, MinMaxScaler, StandardScaler
from sklearn.svm import SVC 
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# use seaborn plotting defaults
import seaborn as sns; sns.set()


# Load the labelled feature table; the first CSV column is used as the row index.
features = pd.read_csv("../data/new_labeled_features.csv", index_col=0)
# Transposed copy of the same table, consumed by the cells that apply
# Normalizer to the feature rows.
tfeatures = features.transpose()

  from numpy.core.umath_tests import inner1d


In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(features.iloc[:,:-1], features.label, test_size=0.5, random_state=0)


print('Grid Search for SVM')
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]}]
clf = GridSearchCV(SVC(), tuned_parameters, cv=10)
%time clf.fit(X_train, Y_train)
#print(clf.best_params_)
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"% (mean, std * 2, params))
print()
print()
y_true, y_pred = Y_test, clf.predict(X_test)
print(classification_report(y_true, y_pred))


print()
print()
print('Grid Search for kNN')
tuned_parameters = [{'n_neighbors': [i for i in range(1,30)]}]
clf = GridSearchCV(KNeighborsClassifier(), tuned_parameters, cv=10)
%time clf.fit(X_train, Y_train)
#print(clf.best_params_)
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"% (mean, std * 2, params))
print()
print()
y_true, y_pred = Y_test, clf.predict(X_test)
print(classification_report(y_true, y_pred))


print()
print()
print('Grid Search for logistic regression')
tuned_parameters = [{'C': [1,10,100,1000], 'tol': [1e-5,1e-4,1e-3]}]
clf = GridSearchCV(LogisticRegression(), tuned_parameters, cv=10)
%time clf.fit(X_train, Y_train)
#print(clf.best_params_)
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"% (mean, std * 2, params))
print()
print()
y_true, y_pred = Y_test, clf.predict(X_test)
print(classification_report(y_true, y_pred))

print()
print()
print('Grid Search for random forest')
tuned_parameters = [{'n_estimators': [10, 50, 100, 150, 200]}]
clf = GridSearchCV(RandomForestClassifier(), tuned_parameters, cv=10)
%time clf.fit(X_train, Y_train)
#print(clf.best_params_)
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"% (mean, std * 2, params))
print()
print()
y_true, y_pred = Y_test, clf.predict(X_test)
print(classification_report(y_true, y_pred))

In [None]:
file = open("../reports/parameter_tuning", 'a')

X_train, X_test, Y_train, Y_test = train_test_split(features.iloc[:,:-1], features.label, test_size=0.5, random_state=0)


file.write('Grid Search for SVM\n')
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]}]
clf = GridSearchCV(SVC(), tuned_parameters, cv=10)
%time clf.fit(X_train, Y_train)
#print(clf.best_params_)
file.write('\n')
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    file.write("%0.3f (+/-%0.03f) for %r\n"% (mean, std * 2, params))
file.write('\n')
file.write('\n')
y_true, y_pred = Y_test, clf.predict(X_test)
file.write(classification_report(y_true, y_pred))

file.write('\n')
file.write('\n')
file.write('\n')
file.write('Grid Search for kNN\n')
tuned_parameters = [{'n_neighbors': [i for i in range(1,30)]}]
clf = GridSearchCV(KNeighborsClassifier(), tuned_parameters, cv=10)
%time clf.fit(X_train, Y_train)
#print(clf.best_params_)
file.write('\n')
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    file.write("%0.3f (+/-%0.03f) for %r\n"% (mean, std * 2, params))
file.write('\n')
file.write('\n')
y_true, y_pred = Y_test, clf.predict(X_test)
file.write(classification_report(y_true, y_pred))

file.write('\n')
file.write('\n')
file.write('\n')
file.write('Grid Search for logistic regression\n')
tuned_parameters = [{'C': [1,10,100,1000], 'tol': [1e-5,1e-4,1e-3]}]
clf = GridSearchCV(LogisticRegression(), tuned_parameters, cv=10)
%time clf.fit(X_train, Y_train)
#print(clf.best_params_)
file.write('\n')
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    file.write("%0.3f (+/-%0.03f) for %r\n"% (mean, std * 2, params))
file.write('\n')
file.write('\n')
y_true, y_pred = Y_test, clf.predict(X_test)
file.writelines(classification_report(y_true, y_pred))

file.write('\n')
file.write('\n')
file.write('\n')
file.write('Grid Search for random forest\n')
tuned_parameters = [{'n_estimators': [10, 50, 100, 150, 200]}]
clf = GridSearchCV(RandomForestClassifier(), tuned_parameters, cv=10)
%time clf.fit(X_train, Y_train)
#print(clf.best_params_)
file.write('\n')
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    file.write("%0.3f (+/-%0.03f) for %r\n"% (mean, std * 2, params))
file.write('\n')
file.write('\n')
y_true, y_pred = Y_test, clf.predict(X_test)
file.writelines(classification_report(y_true, y_pred))
file.close()

In [5]:
file = open("../reports/parameter_tuning", 'w')

X_train, X_test, Y_train, Y_test = train_test_split(features.iloc[:,:-1], features.label, test_size=0.5, random_state=0)

estimators_list = [SVC(),KNeighborsClassifier(), LogisticRegression(), RandomForestClassifier()]
tuned_parameters_list = [[{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]}],
                         [{'n_neighbors': [i for i in range(1,30)]}], 
                         [{'C': [1,10,100,1000], 'tol': [1e-5,1e-4,1e-3]}], 
                         [{'n_estimators': [10, 50, 100, 150, 200]}]]
names_list = ["SVM", "KNN", "LR", "RF"]

for estimator, tuned_parameters, name in zip(estimators_list, tuned_parameters_list, names_list):
    file.write(30*"**"+"\n")
    file.write('Grid Search for '+ name +'\n')
    clf = GridSearchCV(estimator, tuned_parameters, cv=10)
    %time clf.fit(X_train, Y_train)
    file.write(str(clf.best_params_))
    file.write('\n')
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        file.write("%0.3f (+/-%0.03f) for %r\n"% (mean, std * 2, params))
    file.write('\nmean accuracy score:'+str(means.sum()/len(means)))
    file.write('\n')
    file.write('\n')
    y_true, y_pred = Y_test, clf.predict(X_test)
    file.write(classification_report(y_true, y_pred))
    file.write('\n')
    file.write('\n')
    file.write('\n')
file.close()

CPU times: user 38.9 s, sys: 4 ms, total: 38.9 s
Wall time: 38.9 s
CPU times: user 1min 24s, sys: 1.12 s, total: 1min 25s
Wall time: 48.8 s
CPU times: user 45.5 s, sys: 916 ms, total: 46.4 s
Wall time: 27.5 s
CPU times: user 58.9 s, sys: 20 ms, total: 58.9 s
Wall time: 58.8 s


In [2]:
#construction of the lists
estimators_list = [SVC(),KNeighborsClassifier(), LogisticRegression(), RandomForestClassifier()]
tuned_parameters_list = [[{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]}],
                         [{'n_neighbors': [i for i in range(1,30)]}], 
                         [{'C': [1,10,100,1000], 'tol': [1e-5,1e-4,1e-3]}], 
                         [{'n_estimators': [10, 50, 100, 150, 200]}]]
norm_list = [Normalizer(), StandardScaler(), MinMaxScaler()]
names_list = ["SVM", "KNN", "LR", "RF"]
transforms_list = ["norm","scale","minmax"]


#breaking the data
labels = features.label

#normalization of the data
tnfeatures = norm_list[0].transform(tfeatures.iloc[:-1,:])
nfeatures = tnfeatures.T
pfeatures = pd.concat([pd.DataFrame(data=nfeatures), labels], axis=1)

X_train, X_test, Y_train, Y_test = train_test_split(pfeatures.iloc[:, :-1], pfeatures['label'], test_size=0.5, random_state=0)
file = open("../reports/parameter_tuning_"+transforms_list[0], 'w')

for estimator, tuned_parameters, name in zip(estimators_list, tuned_parameters_list, names_list):
    file.write(30*"**"+"\n")
    file.write('Grid Search for '+ name +'\n')
    clf = GridSearchCV(estimator, tuned_parameters, cv=10)
    %time clf.fit(X_train, Y_train)
    file.write(str(clf.best_params_))
    file.write(str(clf.best_estimator_))
    file.write('\n')
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        file.write("%0.3f (+/-%0.03f) for %r\n"% (mean, std * 2, params))
    file.write('\nmean accuracy score:'+str(means.sum()/len(means)))
    file.write('\n')
    file.write('\n')
    y_true, y_pred = Y_test, clf.predict(X_test)
    file.write(classification_report(y_true, y_pred))
    file.write('\n')
    file.write('\n')
    file.write('\n')
file.close()

#scaling of the data
for norm, trans in zip(norm_list[1:], transforms_list[1:]):
    print()
    print()
    print()
    nfeatures = norm.fit_transform(features.iloc[:, :-1])
    ###############################################################
    pfeatures = pd.concat([pd.DataFrame(data=nfeatures), labels], axis=1)
    ###############################################################
    X_train, X_test, Y_train, Y_test = train_test_split(pfeatures.iloc[:, :-1], pfeatures['label'], test_size=0.5, random_state=0)
    file = open("../reports/parameter_tuning_"+trans, 'w')

    for estimator, tuned_parameters, name in zip(estimators_list, tuned_parameters_list, names_list):
        file.write(30*"**"+"\n")
        file.write('Grid Search for '+ name +'\n')
        clf = GridSearchCV(estimator, tuned_parameters, cv=10)
        %time clf.fit(X_train, Y_train)
        file.write(str(clf.best_params_))
        file.write(str(clf.best_estimator_))
        file.write('\n')
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            file.write("%0.3f (+/-%0.03f) for %r\n"% (mean, std * 2, params))
        file.write('\nmean accuracy score:'+str(means.sum()/len(means)))
        file.write('\n')
        file.write('\n')
        y_true, y_pred = Y_test, clf.predict(X_test)
        file.write(classification_report(y_true, y_pred))
        file.write('\n')
        file.write('\n')
        file.write('\n')
    file.close()


CPU times: user 33.9 s, sys: 20 ms, total: 33.9 s
Wall time: 33.8 s
CPU times: user 2min 55s, sys: 1.29 s, total: 2min 56s
Wall time: 2min 16s
CPU times: user 16.9 s, sys: 352 ms, total: 17.3 s
Wall time: 8.84 s
CPU times: user 1min 5s, sys: 16 ms, total: 1min 5s
Wall time: 1min 5s



CPU times: user 34.3 s, sys: 12 ms, total: 34.3 s
Wall time: 34.2 s
CPU times: user 2min 47s, sys: 1.29 s, total: 2min 48s
Wall time: 2min 8s
CPU times: user 1min 41s, sys: 808 ms, total: 1min 42s
Wall time: 1min 20s
CPU times: user 1min 8s, sys: 12 ms, total: 1min 8s
Wall time: 1min 8s



CPU times: user 33.4 s, sys: 0 ns, total: 33.4 s
Wall time: 33.3 s
CPU times: user 1min 49s, sys: 1.22 s, total: 1min 51s
Wall time: 1min 10s
CPU times: user 25.6 s, sys: 492 ms, total: 26.1 s
Wall time: 13.7 s
CPU times: user 1min 4s, sys: 8 ms, total: 1min 4s
Wall time: 1min 4s


In [7]:
#construction of the lists
estimators_list = [SVC(),KNeighborsClassifier(), LogisticRegression(), RandomForestClassifier()]
tuned_parameters_list = [[{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]}],
                         [{'n_neighbors': [i for i in range(1,30)]}], 
                         [{'C': [1,10,100,1000], 'tol': [1e-5,1e-4,1e-3]}], 
                         [{'n_estimators': [10, 50, 100, 150, 200]}]]
norm_list = [Normalizer(), StandardScaler(), MinMaxScaler()]
names_list = ["SVM", "KNN", "LR", "RF"]
transforms_list = ["norm","scale","minmax"]


#breaking the data
labels = features.label

#normalization of the data
tnfeatures = norm_list[0].transform(tfeatures.iloc[:-1,:])
nfeatures = tnfeatures.T
pfeatures = pd.concat([pd.DataFrame(data=nfeatures), labels], axis=1)

X_train, X_test, Y_train, Y_test = train_test_split(pfeatures.iloc[:, :-1], pfeatures['label'], test_size=0.5, random_state=0)
file = open("../reports/nparameter_tuning_"+transforms_list[0], 'w')

for estimator, tuned_parameters, name in zip(estimators_list, tuned_parameters_list, names_list):
    file.write(30*"**"+"\n")
    file.write('Grid Search for '+ name +'\n')
    clf = GridSearchCV(estimator, tuned_parameters, cv=10, scoring='roc_auc')
    %time clf.fit(X_train, Y_train)
    file.write(str(clf.best_params_))
    file.write('\n')
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        file.write("%0.3f (+/-%0.03f) for %r\n"% (mean, std * 2, params))
    file.write('\n')
    file.write('\n')
    y_true, y_pred = Y_test, clf.predict(X_test)
    file.write(classification_report(y_true, y_pred))
    file.write('\n')
    file.write('\n')
    file.write('\n')
file.close()

#scaling of the data
for norm, trans in zip(norm_list[1:], transforms_list[1:]):
    print()
    print()
    print()
    nfeatures = norm.fit_transform(features.iloc[:, :-1])
    ###############################################################
    pfeatures = pd.concat([pd.DataFrame(data=nfeatures), labels], axis=1)
    ###############################################################
    X_train, X_test, Y_train, Y_test = train_test_split(pfeatures.iloc[:, :-1], pfeatures['label'], test_size=0.5, random_state=0)
    file = open("../reports/nparameter_tuning_"+trans, 'w')

    for estimator, tuned_parameters, name in zip(estimators_list, tuned_parameters_list, names_list):
        file.write(30*"**"+"\n")
        file.write('Grid Search for '+ name +'\n')
        clf = GridSearchCV(estimator, tuned_parameters, cv=10, scoring='roc_auc')
        %time clf.fit(X_train, Y_train)
        file.write(str(clf.best_params_))
        file.write('\n')
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            file.write("%0.3f (+/-%0.03f) for %r\n"% (mean, std * 2, params))
        file.write('\n')
        file.write('\n')
        y_true, y_pred = Y_test, clf.predict(X_test)
        file.write(classification_report(y_true, y_pred))
        file.write('\n')
        file.write('\n')
        file.write('\n')
    file.close()

CPU times: user 33.9 s, sys: 44 ms, total: 33.9 s
Wall time: 33.9 s
CPU times: user 2min 44s, sys: 1.28 s, total: 2min 46s
Wall time: 2min 4s
CPU times: user 16.1 s, sys: 408 ms, total: 16.5 s
Wall time: 8.28 s
CPU times: user 1min, sys: 8 ms, total: 1min
Wall time: 1min



CPU times: user 31.3 s, sys: 0 ns, total: 31.3 s
Wall time: 31.1 s
CPU times: user 3min 41s, sys: 1.36 s, total: 3min 43s
Wall time: 3min 2s
CPU times: user 1min 41s, sys: 956 ms, total: 1min 42s
Wall time: 1min 20s
CPU times: user 1min 8s, sys: 36 ms, total: 1min 9s
Wall time: 1min 8s



CPU times: user 32 s, sys: 8 ms, total: 32 s
Wall time: 31.8 s
CPU times: user 1min 47s, sys: 1.13 s, total: 1min 48s
Wall time: 1min 6s
CPU times: user 25.7 s, sys: 504 ms, total: 26.2 s
Wall time: 13.6 s
CPU times: user 1min 2s, sys: 84 ms, total: 1min 2s
Wall time: 1min 2s


In [None]:
# Parallel lists: estimator i is paired with grid i and name i.
estimators_list = [SVC(), KNeighborsClassifier(), LogisticRegression(), RandomForestClassifier()]
tuned_parameters_list = [[{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]}],
                         [{'n_neighbors': list(range(1, 30))}],
                         [{'C': [1, 10, 100, 1000], 'tol': [1e-5, 1e-4, 1e-3]}],
                         [{'n_estimators': [10, 50, 100, 150, 200]}]]
norm_list = [Normalizer(), StandardScaler(), MinMaxScaler()]
names_list = ["SVM", "KNN", "LR", "RF"]
transforms_list = ["norm", "scale", "minmax"]

# Normalise the feature rows only. Passing the whole transposed table would
# also rescale the label row, turning the class labels into arbitrary floats.
# The last row of `tfeatures` is assumed to be the label row, matching how the
# other normalisation cells slice it -- TODO confirm.
tnfeatures = norm_list[0].transform(tfeatures.iloc[:-1, :])
nfeatures = tnfeatures.T

# `nfeatures` now holds predictors only, so the untouched labels are taken
# straight from the original table.
X_train, X_test, Y_train, Y_test = train_test_split(
    nfeatures, features.label, test_size=0.5, random_state=0)

# Show which estimator is paired with which grid and report name.
for estimator, tuned_parameters, name in zip(estimators_list, tuned_parameters_list, names_list):
    print(estimator)
    print(tuned_parameters)
    print(name)
    print()
    print()