# Spambase Classifier
Spambase dataset is available from the UCI Machine Learning Repository:
http://archive.ics.uci.edu/ml/datasets/Spambase

In [42]:
import csv
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from tabulate import tabulate

In [55]:
#Read data from a csv to a dataframe
df = pd.read_csv("data/spambase.data")
data = df.values.tolist()
#Split the data into features and labels
X = np.array([x[:-1] for x in data]).astype(np.float)
#The last column hahs the labels
y = np.array([x[-1] for x in data]).astype(np.float)
#Split the data into k folds(4 in this case)
#It is really important to shuffle the data :p
kf = KFold(n_splits=4, shuffle=True)
fold = 0
# Choice of classifier (See the next cell)
# clf = GaussianNB()
# clf = SVC(gamma='auto')
clf = BernoulliNB(alpha=1.0, binarize=0.25)
foldScores = []
acc = []
err = []
# Train and validate a classifier for each of the k folds
for train_index, test_index in kf.split(X):
    fold+=1
    print("K fold Cross validation for fold: ", fold)
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
#     print("Number of training examples: ", len(X_train))
#     print("Number of testing examples: ", len(X_test))
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
#     print("True positives: ", tp)
#     print("False positives: ", fp)
#     print("True negatives: ", tn)
#     print("False negatives: ", fn)
    score = clf.score(X_test, y_test)
    acc.append(score)
    error = 1 - score
    err.append(error)
#     print("Accuracy (%): ", score*100) # (true_positive + true_negatives) / total_examples
    foldScores.append({"True positives":tp, "False positives":fp, "True negatives":tn, 
                       "False negatives":fn, "Accuracy(%)": score*100, "Error(%)": error*100})
results = pd.DataFrame(foldScores)
print(tabulate(results,headers='keys', tablefmt='psql'))
print("Average accuracy: ", np.mean(acc))
print("Average Error:", np.mean(err))

K fold Cross validation for fold:  1
TRAIN: [   0    1    3 ... 4596 4598 4599] TEST: [   2    6   18 ... 4589 4594 4597]
K fold Cross validation for fold:  2
TRAIN: [   0    1    2 ... 4596 4597 4598] TEST: [   3    8   11 ... 4590 4591 4599]
K fold Cross validation for fold:  3
TRAIN: [   2    3    6 ... 4596 4597 4599] TEST: [   0    1    4 ... 4584 4595 4598]
K fold Cross validation for fold:  4
TRAIN: [   0    1    2 ... 4597 4598 4599] TEST: [  10   12   17 ... 4592 4593 4596]
+----+---------------+------------+-------------------+-------------------+------------------+------------------+
|    |   Accuracy(%) |   Error(%) |   False negatives |   False positives |   True negatives |   True positives |
|----+---------------+------------+-------------------+-------------------+------------------+------------------|
|  0 |       90.6957 |    9.30435 |                56 |                51 |              656 |              387 |
|  1 |       88.8696 |   11.1304  |                73 | 

'''
# Choice of classifier
Various research papers show the comparison of using Naive Bayes models for such tasks.
Metsis et al. "Spam Filtering with Naive Bayes – Which Naive Bayes? compare multiple Naive Bayes models
for spambase filtering and demonstrate that the binary multinomial model yields better results 
than the Bernoulli model. However, Bernoulli seems to work atleast at par.
'''