### 1. Import necessary libraries

In [30]:
import numpy as np
import random
import pandas

from sklearn.model_selection import KFold
from sklearn import preprocessing

from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

### 2. Load data from dataset

In [31]:
# Load the Spambase records. Each non-empty line is a comma-separated record
# whose first 57 fields are numeric features and whose 58th field is the
# integer class label (presumably 1 = spam, 0 = not spam; confirm against the
# dataset documentation).
SHUFFLE_SEED = 0  # fixed so the fold membership is reproducible across runs

# The context manager closes the file handle even if reading fails; blank
# lines (e.g. a trailing newline at end of file) are skipped because they
# would otherwise make float('') raise ValueError below.
with open("spambase/spambase.data", "r") as f:
    lines = [line for line in f if line.strip()]

# Shuffle with a private, seeded generator: the row order becomes
# deterministic without touching the global `random` state.
random.Random(SHUFFLE_SEED).shuffle(lines)

labels = []
features = []
for line in lines:
    fields = line.strip().split(',')
    features.append([float(value) for value in fields[:57]])
    labels.append(int(fields[57]))

### 3. Initialize classifiers
Eight classification models have been chosen for this task:
1. Neural Network classifier
2. Support Vector Machine
3. Logistic Regression classifier
4. Naive Bayes classifier
5. K Nearest Neighbor classifier
6. Decision Tree
7. Random Forest
8. Gradient Boosting classifier

In [49]:
# One estimator per model family compared in this notebook. Arguments that
# are not listed keep the scikit-learn defaults.

# Neural network: sigmoid hidden units, trained with the L-BFGS solver.
mlp = MLPClassifier(
    activation='logistic',
    solver='lbfgs',
)

# Support vector machine with default settings.
svm_clf = svm.SVC()

# Logistic regression; max_iter is raised to 10000 iterations.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases;
# confirm against the installed version before upgrading.
lr_clf = LogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    C=1.0,
    max_iter=10000,
)

# Multinomial naive Bayes with very light additive smoothing.
naive_bayes = MultinomialNB(
    alpha=0.001,
    fit_prior=True,
    class_prior=None,
)

# k-nearest neighbours using the 2 closest training points.
knn = KNeighborsClassifier(n_neighbors=2)

# Tree-based models, all with default settings.
decision_tree = DecisionTreeClassifier()
random_forest = RandomForestClassifier()
gdbt = GradientBoostingClassifier()

Initialize the lists that store the results. Each scores list holds the cross-validation scores of one model, one entry per fold.

In [50]:
# Per-model accumulators for the fold-by-fold cross-validation accuracy.
# The generator yields a fresh list for every name, so the eight lists are
# independent objects (no aliasing between models).
(
    mlp_scores,
    svm_scores,
    lr_scores,
    nb_scores,
    knn_scores,
    decision_tree_scores,
    random_forest_scores,
    gdbt_scores,
) = ([] for _ in range(8))

### 4. Perform k-fold cross-validation with k=10
For each fold, we first get the training data and the test data, then standardize them for the neural network, SVM, logistic regression, and KNN classifiers; the naive Bayes and tree-based models are fit on the raw features. We then compute the cross-validation score of each model.

In [46]:
# 10-fold cross-validation: every model is fit and scored on the same splits,
# and its accuracy on the held-out fold is appended to its scores list.

# Empty the accumulators first so this cell is idempotent. Without this,
# re-running the cell appends another ten scores to each list and the
# averages printed afterwards mix results from several runs.
for fold_scores in (mlp_scores, svm_scores, lr_scores, nb_scores, knn_scores,
                    decision_tree_scores, random_forest_scores, gdbt_scores):
    fold_scores.clear()

k_fold = KFold(n_splits=10)
for train_indices, test_indices in k_fold.split(features):
    x_train = [features[i] for i in train_indices]
    x_test = [features[i] for i in test_indices]
    y_train = [labels[i] for i in train_indices]
    y_test = [labels[i] for i in test_indices]

    # Fit the scaler on the training fold only, then apply it to both folds,
    # so no statistics of the held-out data leak into training.
    # transform() already returns arrays, so no extra np.array copy is needed.
    scaler = preprocessing.StandardScaler().fit(x_train)
    x_train_transformed = scaler.transform(x_train)
    x_test_transformed = scaler.transform(x_test)

    # Models evaluated on standardized features.
    mlp_scores.append(mlp.fit(x_train_transformed, y_train).score(x_test_transformed, y_test))
    svm_scores.append(svm_clf.fit(x_train_transformed, y_train).score(x_test_transformed, y_test))
    lr_scores.append(lr_clf.fit(x_train_transformed, y_train).score(x_test_transformed, y_test))
    knn_scores.append(knn.fit(x_train_transformed, y_train).score(x_test_transformed, y_test))

    # Models evaluated on the raw features.
    nb_scores.append(naive_bayes.fit(x_train, y_train).score(x_test, y_test))
    decision_tree_scores.append(decision_tree.fit(x_train, y_train).score(x_test, y_test))
    gdbt_scores.append(gdbt.fit(x_train, y_train).score(x_test, y_test))
    random_forest_scores.append(random_forest.fit(x_train, y_train).score(x_test, y_test))

In [47]:
# Report the mean of the per-fold scores for every model, in a fixed order.
print("The average cross-validation score of each model is:")
for model_name, model_scores in (
    ("mlp", mlp_scores),
    ("svm_clf", svm_scores),
    ("lr_clf", lr_scores),
    ("naive_bayes", nb_scores),
    ("knn", knn_scores),
    ("decision_tree", decision_tree_scores),
    ("random_forest", random_forest_scores),
    ("gdbt", gdbt_scores),
):
    average_score = np.array(model_scores).mean()
    print(model_name + ": " + str(average_score))

The average cross-validation score of each model is:
mlp: 0.9333642365368291
svm_clf: 0.9343629161558047
lr_clf: 0.9258884278034517
naive_bayes: 0.7928642836932943
knn: 0.8991483542393661
decision_tree: 0.9185818164670375
random_forest: 0.9544868433462228
gdbt: 0.9459680279166275


From the above results, we can see that the Random Forest classifier has the highest average cross-validation score, so I choose Random Forest to report the false positive, false negative, and overall error rates.

In [51]:
# Per-fold error analysis of the random forest.
# For each of the N_SPLITS folds the forest is refit on the training part and
# three rates are computed on the held-out part:
#   false positive rate = FP / (FP + TN)
#   false negative rate = FN / (TP + FN)
#   overall error rate  = misclassified / fold size
# The last row of the printed table is the column-wise average over folds.
N_SPLITS = 10

report = []
k_fold = KFold(n_splits=N_SPLITS)
for train_indices, test_indices in k_fold.split(features):
    x_train = [features[i] for i in train_indices]
    x_test = [features[i] for i in test_indices]
    y_train = [labels[i] for i in train_indices]
    y_test = [labels[i] for i in test_indices]

    random_forest.fit(x_train, y_train)
    random_forest_predict = random_forest.predict(x_test)

    # Vectorised confusion-matrix counts (label 1 is treated as positive).
    y_true = np.asarray(y_test)
    y_pred = np.asarray(random_forest_predict)
    FP = int(np.sum((y_true == 0) & (y_pred == 1)))
    FN = int(np.sum((y_true == 1) & (y_pred == 0)))
    TP = int(np.sum((y_true == 1) & (y_pred == 1)))
    TN = int(np.sum((y_true == 0) & (y_pred == 0)))
    Err = int(np.sum(y_true != y_pred))

    # A fold without any negatives (or positives) has an undefined rate;
    # record NaN instead of raising ZeroDivisionError.
    report.append([
        FP / (FP + TN) if (FP + TN) else float("nan"),
        FN / (TP + FN) if (TP + FN) else float("nan"),
        Err / len(y_pred),
    ])

# nanmean equals the plain average when every fold has a defined rate and
# ignores the undefined ones otherwise.
report.append(np.nanmean(report, axis=0))

# Row labels are derived from N_SPLITS so they always match the fold count.
headers1 = ["Fold" + str(i) for i in range(1, N_SPLITS + 1)] + ["Avg Error Rate"]
headers2 = ["False Positive Rate", "False Negative Rate", "Overall Error Rate"]
print(pandas.DataFrame(report, index=headers1, columns=headers2))

                False Positive Rate  False Negative Rate  Overall Error Rate
Fold1                      0.027119             0.054217            0.036876
Fold2                      0.020833             0.052326            0.032609
Fold3                      0.025830             0.105820            0.058696
Fold4                      0.029520             0.079365            0.050000
Fold5                      0.030612             0.036145            0.032609
Fold6                      0.028269             0.107345            0.058696
Fold7                      0.028269             0.067797            0.043478
Fold8                      0.011029             0.069149            0.034783
Fold9                      0.018519             0.094737            0.050000
Fold10                     0.042146             0.065327            0.052174
Avg Error Rate             0.026215             0.073223            0.044992
