In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score
from sklearn.model_selection import StratifiedKFold


In [2]:

df = pd.read_csv('./spambase/spambase.data', delimiter=';')
df.head()


Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,spam_class
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [3]:
df.shape

(4601, 58)

In [None]:
for label in df.columns[:-1]:
    plt.scatter(df['spam_class'], df[label])
    plt.xticks(np.arange(0, 2, step = 1))
    plt.title(label)
    plt.xlabel('spam_class')
    plt.ylabel(label)
    plt.savefig('imgs/scatterplot/'+'scatterplot'.join(label.split(' ')))
    plt.show()

In [None]:
for i in df.columns[:-1]:
    media = np.mean(df[i])
    desvpad = np.std(df[i])

    print(f'{i}')
    print(f'media : {media}')
    print(f'desvio padrão : {desvpad}')
    print('-------')

In [4]:
x = df[df.columns[:-1]].values
y = df['spam_class'].values

In [5]:
model1 = [DecisionTreeClassifier(), "Decision Tree"]
model2 = [MLPClassifier(hidden_layer_sizes=(10,2), max_iter=1000), "MLP"]

models = [model1,model2]

In [6]:
skf = StratifiedKFold(n_splits=10)

for mod in models:
    acc = []
    f1 = []
    precision = []

    for train_indexs, test_indexs in skf.split(x, y):
        x_train, y_train = x[train_indexs], y[train_indexs]
        x_test, y_test = x[test_indexs], y[test_indexs]
        model = mod[0]
        model.fit(x_train, y_train)
        y_predicted = model.predict(x_test)
        accuracy = accuracy_score(y_test, y_predicted)
        f1_scr = f1_score(y_test, y_predicted)
        prec_score = precision_score(y_test, y_predicted)
        matrix = confusion_matrix(y_test, y_predicted)
        acc.append(accuracy)
        f1.append(f1_scr)
        precision.append(prec_score)
        print(matrix)
        print()

    print(f"{mod[1]}, Accuracy:\t{np.mean(acc):.2f} +/- {np.std(acc):.2f}")
    print(f"{mod[1]}, F1 Score:\t{np.mean(f1):.2f} +/- {np.std(f1):.2f}")
    print(f"{mod[1]}, Precision:\t{np.mean(precision):.2f} +/- {np.std(precision):.2f}")
    print("-"*60)

[[264  15]
 [ 26 156]]

[[263  15]
 [ 27 155]]

[[257  21]
 [ 27 155]]

[[261  18]
 [ 25 156]]

[[260  19]
 [ 16 165]]

[[261  18]
 [ 14 167]]

[[272   7]
 [ 21 160]]

[[254  25]
 [ 16 165]]

[[230  49]
 [ 21 160]]

[[217  62]
 [ 30 151]]

Decision Tree, Accuracy:	0.90 +/- 0.04
Decision Tree, F1 Score:	0.87 +/- 0.04
Decision Tree, Precision:	0.87 +/- 0.07
------------------------------------------------------------
[[250  29]
 [ 15 167]]

[[230  48]
 [  8 174]]

[[259  19]
 [ 16 166]]

[[247  32]
 [ 11 170]]

[[264  15]
 [ 14 167]]

[[258  21]
 [  9 172]]

[[275   4]
 [ 14 167]]

[[271   8]
 [ 12 169]]

[[253  26]
 [ 18 163]]

[[241  38]
 [ 31 150]]

MLP, Accuracy:	0.92 +/- 0.03
MLP, F1 Score:	0.90 +/- 0.04
MLP, Precision:	0.88 +/- 0.06
------------------------------------------------------------
