In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# models 
from sklearn.linear_model import Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

In [3]:
# data processing 
columNames = [str(i) for i in range(1,59)]
data = pd.read_csv("./spambase.data", names=columNames).values
X, y = data.T[:-1].T, data.T[-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
print("training set: ", X_train.shape, y_train.shape)
print("test set: ", X_test.shape, y_test.shape)


training set:  (3450, 57) (3450,)
test set:  (1151, 57) (1151,)


In [4]:
models = []
models.append(('Perceptron', Perceptron())) 
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('SVM', svm.SVC()))
models.append(('AdBst', AdaBoostClassifier()))
models.append(('NB', GaussianNB()))

In [5]:
# models comparison 
scoring = ['accuracy', 'precision', 'recall', 'f1_weighted']
Chart_col = ['model_name', 'test_accuracy', 'test_f1_weighted', 'precision', 'recall']
Chart = pd.DataFrame(columns=Chart_col)
rowNum = 0
for name, model in models: 
    Chart.loc[rowNum, 'model_name'] = name
    cv_results = cross_validate(model, X_train, y_train, scoring=scoring, cv=10)
    Chart.loc[rowNum, 'test_accuracy'] = cv_results['test_accuracy'].mean()
    Chart.loc[rowNum, 'test_f1_weighted'] = cv_results['test_f1_weighted'].mean()
    Chart.loc[rowNum, 'precision'] = cv_results['test_precision'].mean()
    Chart.loc[rowNum, 'recall'] = cv_results['test_recall'].mean()
    rowNum +=1
Chart.sort_values(by=['test_accuracy'],ascending = False, inplace = True)
Chart


Unnamed: 0,model_name,test_accuracy,test_f1_weighted,precision,recall
4,AdBst,0.942319,0.942244,0.931763,0.920937
2,CART,0.92087,0.920836,0.899968,0.898693
5,NB,0.828696,0.830461,0.708776,0.958584
1,KNN,0.791594,0.791367,0.73774,0.729515
3,SVM,0.714203,0.693306,0.736139,0.427239
0,Perceptron,0.692174,0.668493,0.646281,0.778312


In [8]:
clf = AdaBoostClassifier()
N_SPLITS = 10
kf = KFold(n_splits=N_SPLITS)
results = []
for train_index, test_index in kf.split(X_train):
    train_x, test_x = X_train[train_index], X_train[test_index]
    train_y, test_y = y_train[train_index], y_train[test_index]
    clf.fit(train_x, train_y)
    pred_y = clf.predict(test_x)
    conf_mat = confusion_matrix(test_y, pred_y)
    tn, fp, fn, tp = conf_mat.ravel()
    error_rate = fp+fn/(tn+fp+fn+tp)
    results.append((fp, fn, error_rate))


Chart2_col = ['fold#', 'false positive', 'false negative', 'overall error rate']
Chart2 = pd.DataFrame(columns=Chart2_col)
average = 0
for i, res in enumerate(results):
    Chart2.loc[i+1, 'fold#'] = i+1
    Chart2.loc[i+1, 'false positive'] = res[0]
    Chart2.loc[i+1, 'false negative'] = res[1]
    Chart2.loc[i+1, 'overall error rate'] = res[2]
    average = (i * average + res[2])/(i+1)
Chart2



Unnamed: 0,fold#,false positive,false negative,overall error rate
1,1,7,11,7.031884
2,2,9,13,9.037681
3,3,12,12,12.034783
4,4,5,15,5.043478
5,5,9,11,9.031884
6,6,10,13,10.037681
7,7,8,10,8.028986
8,8,8,9,8.026087
9,9,11,5,11.014493
10,10,7,7,7.02029


In [9]:
Chart3 = pd.DataFrame(columns=['average error rate'])
Chart3.loc[0, 'average error rate'] = average
Chart3

Unnamed: 0,average error rate
0,8.630725
