In [6]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import Normalizer, MinMaxScaler, StandardScaler
from sklearn.svm import SVC 
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# use seaborn plotting defaults
import seaborn as sns; sns.set()


# Labelled feature table; the first CSV column is used as the row index.
features = pd.read_csv(
    "../data/new_labeled_features.csv",
    index_col=0,
)
# Transposed copy of the table (original columns become rows).
tfeatures = features.transpose()

(113,)

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(features.iloc[:,:-1], features.label, test_size=0.5, random_state=0)


print('Grid Search for SVM')
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]}]
clf = GridSearchCV(SVC(), tuned_parameters, cv=10)
%time clf.fit(X_train, Y_train)
#print(clf.best_params_)
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"% (mean, std * 2, params))
print()
print()
y_true, y_pred = Y_test, clf.predict(X_test)
print(classification_report(y_true, y_pred))


print()
print()
print('Grid Search for kNN')
tuned_parameters = [{'n_neighbors': [i for i in range(1,30)]}]
clf = GridSearchCV(KNeighborsClassifier(), tuned_parameters, cv=10)
%time clf.fit(X_train, Y_train)
#print(clf.best_params_)
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"% (mean, std * 2, params))
print()
print()
y_true, y_pred = Y_test, clf.predict(X_test)
print(classification_report(y_true, y_pred))


print()
print()
print('Grid Search for logistic regression')
tuned_parameters = [{'C': [1,10,100,1000], 'tol': [1e-5,1e-4,1e-3]}]
clf = GridSearchCV(LogisticRegression(), tuned_parameters, cv=10)
%time clf.fit(X_train, Y_train)
#print(clf.best_params_)
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"% (mean, std * 2, params))
print()
print()
y_true, y_pred = Y_test, clf.predict(X_test)
print(classification_report(y_true, y_pred))

print()
print()
print('Grid Search for random forest')
tuned_parameters = [{'n_estimators': [10, 50, 100, 150, 200]}]
clf = GridSearchCV(RandomForestClassifier(), tuned_parameters, cv=10)
%time clf.fit(X_train, Y_train)
#print(clf.best_params_)
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"% (mean, std * 2, params))
print()
print()
y_true, y_pred = Y_test, clf.predict(X_test)
print(classification_report(y_true, y_pred))

In [None]:
file = open("../reports/parameter_tuning", 'a')

X_train, X_test, Y_train, Y_test = train_test_split(features.iloc[:,:-1], features.label, test_size=0.5, random_state=0)


file.write('Grid Search for SVM\n')
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]}]
clf = GridSearchCV(SVC(), tuned_parameters, cv=10)
%time clf.fit(X_train, Y_train)
#print(clf.best_params_)
file.write('\n')
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    file.write("%0.3f (+/-%0.03f) for %r\n"% (mean, std * 2, params))
file.write('\n')
file.write('\n')
y_true, y_pred = Y_test, clf.predict(X_test)
file.write(classification_report(y_true, y_pred))

file.write('\n')
file.write('\n')
file.write('\n')
file.write('Grid Search for kNN\n')
tuned_parameters = [{'n_neighbors': [i for i in range(1,30)]}]
clf = GridSearchCV(KNeighborsClassifier(), tuned_parameters, cv=10)
%time clf.fit(X_train, Y_train)
#print(clf.best_params_)
file.write('\n')
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    file.write("%0.3f (+/-%0.03f) for %r\n"% (mean, std * 2, params))
file.write('\n')
file.write('\n')
y_true, y_pred = Y_test, clf.predict(X_test)
file.write(classification_report(y_true, y_pred))

file.write('\n')
file.write('\n')
file.write('\n')
file.write('Grid Search for logistic regression\n')
tuned_parameters = [{'C': [1,10,100,1000], 'tol': [1e-5,1e-4,1e-3]}]
clf = GridSearchCV(LogisticRegression(), tuned_parameters, cv=10)
%time clf.fit(X_train, Y_train)
#print(clf.best_params_)
file.write('\n')
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    file.write("%0.3f (+/-%0.03f) for %r\n"% (mean, std * 2, params))
file.write('\n')
file.write('\n')
y_true, y_pred = Y_test, clf.predict(X_test)
file.writelines(classification_report(y_true, y_pred))

file.write('\n')
file.write('\n')
file.write('\n')
file.write('Grid Search for random forest\n')
tuned_parameters = [{'n_estimators': [10, 50, 100, 150, 200]}]
clf = GridSearchCV(RandomForestClassifier(), tuned_parameters, cv=10)
%time clf.fit(X_train, Y_train)
#print(clf.best_params_)
file.write('\n')
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    file.write("%0.3f (+/-%0.03f) for %r\n"% (mean, std * 2, params))
file.write('\n')
file.write('\n')
y_true, y_pred = Y_test, clf.predict(X_test)
file.writelines(classification_report(y_true, y_pred))
file.close()

In [None]:
file = open("../reports/parameter_tuning", 'w')

X_train, X_test, Y_train, Y_test = train_test_split(features.iloc[:,:-1], features.label, test_size=0.5, random_state=0)

estimators_list = [SVC(),KNeighborsClassifier(), LogisticRegression(), RandomForestClassifier()]
tuned_parameters_list = [[{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]}],
                         [{'n_neighbors': [i for i in range(1,30)]}], 
                         [{'C': [1,10,100,1000], 'tol': [1e-5,1e-4,1e-3]}], 
                         [{'n_estimators': [10, 50, 100, 150, 200]}]]
names_list = ["SVM", "KNN", "LR", "RF"]

for estimator, tuned_parameters, name in zip(estimators_list, tuned_parameters_list, names_list):
    file.write(30*"**"+"\n")
    file.write('Grid Search for '+ name +'\n')
    clf = GridSearchCV(estimator, tuned_parameters, cv=10)
    %time clf.fit(X_train, Y_train)
    print(clf.best_params_)
    file.write('\n')
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        file.write("%0.3f (+/-%0.03f) for %r\n"% (mean, std * 2, params))
    file.write('\n')
    file.write('\n')
    y_true, y_pred = Y_test, clf.predict(X_test)
    file.write(classification_report(y_true, y_pred))
    file.write('\n')
    file.write('\n')
    file.write('\n')
file.close()

In [12]:
#construction of the lists
estimators_list = [SVC(),KNeighborsClassifier(), LogisticRegression(), RandomForestClassifier()]
tuned_parameters_list = [[{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]}],
                         [{'n_neighbors': [i for i in range(1,30)]}], 
                         [{'C': [1,10,100,1000], 'tol': [1e-5,1e-4,1e-3]}], 
                         [{'n_estimators': [10, 50, 100, 150, 200]}]]
norm_list = [Normalizer(), StandardScaler(), MinMaxScaler()]
names_list = ["SVM", "KNN", "LR", "RF"]
transforms_list = ["norm","scale","minmax"]


#breaking the data
labels = features.label

#normalization of the data
tnfeatures = norm_list[0].transform(tfeatures.iloc[:-1,:])
nfeatures = tnfeatures.T
pfeatures = pd.concat([pd.DataFrame(data=nfeatures), labels], axis=1)
print(pfeatures.iloc[:, -1])

X_train, X_test, Y_train, Y_test = train_test_split(pfeatures.iloc[:, :-1], pfeatures['label'], test_size=0.5, random_state=0)
file = open("../reports/parameter_tuning_"+transforms_list[0], 'w')

for estimator, tuned_parameters, name in zip(estimators_list, tuned_parameters_list, names_list):
    file.write(30*"**"+"\n")
    file.write('Grid Search for '+ name +'\n')
    clf = GridSearchCV(estimator, tuned_parameters, cv=10)
    %time clf.fit(X_train, Y_train)
    file.write(str(clf.best_params_))
    file.write('\n')
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        file.write("%0.3f (+/-%0.03f) for %r\n"% (mean, std * 2, params))
    file.write('\n')
    file.write('\n')
    y_true, y_pred = Y_test, clf.predict(X_test)
    file.write(classification_report(y_true, y_pred))
    file.write('\n')
    file.write('\n')
    file.write('\n')
file.close()

#scaling of the data
for norm, trans in zip(norm_list[1:], transforms_list[1:]):
    print()
    print()
    print()
    nfeatures = norm.fit_transform(features.iloc[:, :-1])
    ###############################################################
    pfeatures = pd.concat([pd.DataFrame(data=nfeatures), labels], axis=1)
    print(pfeatures.iloc[:, -1])
    ###############################################################
    X_train, X_test, Y_train, Y_test = train_test_split(pfeatures.iloc[:, :-1], pfeatures['label'], test_size=0.5, random_state=0)
    file = open("../reports/parameter_tuning_"+trans, 'w')

    for estimator, tuned_parameters, name in zip(estimators_list, tuned_parameters_list, names_list):
        file.write(30*"**"+"\n")
        file.write('Grid Search for '+ name +'\n')
        clf = GridSearchCV(estimator, tuned_parameters, cv=10)
        %time clf.fit(X_train, Y_train)
        file.write((clf.best_params_))
        file.write('\n')
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            file.write("%0.3f (+/-%0.03f) for %r\n"% (mean, std * 2, params))
        file.write('\n')
        file.write('\n')
        y_true, y_pred = Y_test, clf.predict(X_test)
        file.write(classification_report(y_true, y_pred))
        file.write('\n')
        file.write('\n')
        file.write('\n')
    file.close()


0        True
1        True
2        True
3        True
4       False
5        True
6       False
7       False
8       False
9       False
10      False
11      False
12      False
13      False
14       True
15       True
16      False
17      False
18      False
19      False
20       True
21      False
22      False
23      False
24      False
25      False
26      False
27      False
28      False
29      False
        ...  
4116    False
4117    False
4118    False
4119    False
4120    False
4121    False
4122    False
4123    False
4124    False
4125    False
4126    False
4127    False
4128    False
4129    False
4130    False
4131    False
4132    False
4133    False
4134    False
4135    False
4136    False
4137    False
4138    False
4139    False
4140    False
4141    False
4142    False
4143    False
4144    False
4145     True
Name: label, Length: 4146, dtype: bool
CPU times: user 31.2 s, sys: 0 ns, total: 31.2 s
Wall time: 31 s
CPU times: user 2min 35s, sys: 1.08 s, tot

TypeError: write() argument must be str, not dict

In [None]:
# Parallel lists: one estimator, parameter grid and short name per model, and
# one transformer and short name per preprocessing step.
estimators_list = [SVC(),KNeighborsClassifier(), LogisticRegression(), RandomForestClassifier()]
tuned_parameters_list = [[{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]}],
                         [{'n_neighbors': [i for i in range(1,30)]}], 
                         [{'C': [1,10,100,1000], 'tol': [1e-5,1e-4,1e-3]}], 
                         [{'n_estimators': [10, 50, 100, 150, 200]}]]
norm_list = [Normalizer(), StandardScaler(), MinMaxScaler()]
names_list = ["SVM", "KNN", "LR", "RF"]
transforms_list = ["norm","scale","minmax"]

# Normalizer rescales every row it is given. The last row of the transposed
# table is dropped before normalising so that the labels are not rescaled into
# floats along with the features.
# Assumes `label` is the last column of `features` -- TODO confirm.
tnfeatures = norm_list[0].fit_transform(tfeatures.iloc[:-1, :])
nfeatures = tnfeatures.T
# The target comes straight from the untouched label column.
X_train, X_test, Y_train, Y_test = train_test_split(nfeatures, features.label.values, test_size=0.5, random_state=0)

# Sanity check: show which estimator, grid and name are paired together.
for estimator, tuned_parameters, name in zip(estimators_list, tuned_parameters_list, names_list):
    print(estimator)
    print(tuned_parameters)
    print(name)
    print()
    print()