In [1]:
# Spam Detection

# No of Folds
KF = 10

In [2]:
# Importing the libraries
import numpy as np
import pandas as pd

In [3]:
# Importing the dataset
dataset = pd.read_csv('spambase.data')
X = dataset.iloc[:, :57].values
y = dataset.iloc[:, 57].values

In [4]:
# Splitting the Dataset into K- Folds
from sklearn.model_selection import KFold
# Define the split - into KF folds and also shuffle the dataset as all the Spam emails and non-spam emails are in one chunk
kf = KFold(n_splits=KF,shuffle=True)

In [5]:
# Array C to store accuracy at each Fold for each model
C = np.zeros(KF)

In [6]:
# Importing the models
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [7]:
# Number of different models on which KFold will be run
n = 9

In [8]:
# Looping through each of the 18 models and performing KFold CV in each case
maxi = 0 # Max accuracy across all models
index = 0 # Name of model giving max accuracy
    
for i in range(n):
    k = 0
    from prettytable import PrettyTable
    pt = PrettyTable()
    pt.field_names = ["Classifier",
                 "KFold Iteration",
                 "False Positives",
                 "False Negative",
                 "Accuracy"]
    # For each train test split in the KFold
    for train_index, test_index in kf.split(X):
        s = ""
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        # Feature Scaling
        sc = StandardScaler()
        X_train = sc.fit_transform(X_train)
        X_test = sc.transform(X_test)
        
        # Fitting Logistic Regression to the Training set
        if i == 0:
            classifier = LogisticRegression(solver='liblinear')
            s = "Logistic Regression"        
        if i == 1:
            classifier = KNeighborsClassifier(n_neighbors = 5)
            s = "KNeighborsClassifier 5 Neighbours"        
        if i == 2:
            classifier = SVC(kernel = "linear")
            s = "SVC Linear"        
        if i == 3:
            classifier = SVC(kernel = "rbf")
            s = "SVC rbf"        
        if i == 4:
            classifier = GaussianNB()
            s = "GaussianNB"        
        if i == 5:
            classifier = DecisionTreeClassifier()
            s = "DecisionTreeClassifier"        
        if i == 6:
            classifier = RandomForestClassifier(n_estimators = 100)
            s = "RandomForestClassifier n_estimators = 100"        
        if i == 7:
            classifier = RandomForestClassifier(n_estimators = 150)
            s = "RandomForestClassifier n_estimators = 150"        
        if i == 8:
            classifier = RandomForestClassifier(n_estimators = 200)
            s = "RandomForestClassifier n_estimators = 200"        
        classifier.fit(X_train, y_train)
        
        # Predicting the Test set results
        y_pred = classifier.predict(X_test)

        # Making the Confusion Matrix
        from sklearn.metrics import confusion_matrix
        cm = confusion_matrix(y_test, y_pred)
        # Finding the Flase Negatives
        FN = cm[1][0]
        
        # Finding the False Positives
        FP = cm[0][1]
        
        # Calculating the accuracy
        score = round(classifier.score(X_test, y_test)*100,2)
        
        C[k] = score
        k += 1
        
        # Adding a row to the table
        pt.add_row([s,k, FN,FP,score])
        
        
    avg = np.mean(C)
    
    # Adding the final row to the table
    pt.add_row([s,"AVG", "-","-",round(avg,2).astype(str)])
        
    print(pt)
    if avg > maxi:
        maxi =avg
        index = s


+---------------------+-----------------+-----------------+----------------+----------+
|      Classifier     | KFold Iteration | False Positives | False Negative | Accuracy |
+---------------------+-----------------+-----------------+----------------+----------+
| Logistic Regression |        1        |        25       |       12       |  91.97   |
| Logistic Regression |        2        |        22       |       15       |  91.96   |
| Logistic Regression |        3        |        21       |       8        |   93.7   |
| Logistic Regression |        4        |        21       |       10       |  93.26   |
| Logistic Regression |        5        |        19       |       12       |  93.26   |
| Logistic Regression |        6        |        19       |       15       |  92.61   |
| Logistic Regression |        7        |        13       |       22       |  92.39   |
| Logistic Regression |        8        |        30       |       11       |  91.09   |
| Logistic Regression |        9

+-------------------------------------------+-----------------+-----------------+----------------+----------+
|                 Classifier                | KFold Iteration | False Positives | False Negative | Accuracy |
+-------------------------------------------+-----------------+-----------------+----------------+----------+
| RandomForestClassifier n_estimators = 150 |        1        |        14       |       6        |  95.66   |
| RandomForestClassifier n_estimators = 150 |        2        |        12       |       6        |  96.09   |
| RandomForestClassifier n_estimators = 150 |        3        |        15       |       8        |   95.0   |
| RandomForestClassifier n_estimators = 150 |        4        |        15       |       7        |  95.22   |
| RandomForestClassifier n_estimators = 150 |        5        |        9        |       8        |   96.3   |
| RandomForestClassifier n_estimators = 150 |        6        |        13       |       9        |  95.22   |
| RandomFo

In [9]:
print(KF," Fold :",index," : ",round(maxi,2))

10  Fold : RandomForestClassifier n_estimators = 100  :  95.59
