In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score
from sklearn.model_selection import StratifiedKFold


In [13]:

df = pd.read_csv('./spambase/spambase.data', delimiter=';')
df.head()


Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,spam_class
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [14]:
df.shape

(4601, 58)

In [None]:
for label in df.columns[:-1]:
    plt.scatter(df['spam_class'], df[label])
    plt.xticks(np.arange(0, 2, step = 1))
    plt.title(label)
    plt.xlabel('spam_class')
    plt.ylabel(label)
    plt.savefig('imgs/scatterplot/'+'scatterplot'.join(label.split(' ')))
    plt.show()

In [None]:
for i in df.columns[:-1]:
    media = np.mean(df[i])
    desvpad = np.std(df[i])

    print(f'{i}')
    print(f'media : {media}')
    print(f'desvio padrão : {desvpad}')
    print('-------')

In [18]:
x = df[df.columns[:-1]].values
y = df['spam_class'].values
sc = StandardScaler()
x = sc.fit_transform(x)

array([[-3.42433707e-01,  3.30884903e-01,  7.12858774e-01, ...,
        -4.52472762e-02,  4.52979198e-02, -8.72413388e-03],
       [ 3.45359395e-01,  5.19091945e-02,  4.35129540e-01, ...,
        -2.44326749e-03,  2.50562832e-01,  1.22832407e+00],
       [-1.45921392e-01, -1.65071912e-01,  8.51723390e-01, ...,
         1.45920848e-01,  2.22110599e+00,  3.25873251e+00],
       ...,
       [ 6.40127868e-01, -1.65071912e-01,  3.83734930e-02, ...,
        -1.19382054e-01, -2.36941335e-01, -2.72627750e-01],
       [ 2.80176333e+00, -1.65071912e-01, -5.56760578e-01, ...,
        -1.27482666e-01, -2.42072958e-01, -3.38603654e-01],
       [-3.42433707e-01, -1.65071912e-01,  7.32696576e-01, ...,
        -1.24236117e-01, -2.42072958e-01, -4.01280763e-01]])

In [19]:
model1 = [DecisionTreeClassifier(), "Decision Tree"]
model2 = [MLPClassifier(hidden_layer_sizes=(20,2), max_iter=1000), "MLP"]

models = [model1,model2]

In [20]:
skf = StratifiedKFold(n_splits=15)

for mod in models:
    acc = []
    f1 = []
    precision = []

    for train_indexs, test_indexs in skf.split(x, y):
        x_train, y_train = x[train_indexs], y[train_indexs]
        x_test, y_test = x[test_indexs], y[test_indexs]
        model = mod[0]
        model.fit(x_train, y_train)
        y_predicted = model.predict(x_test)
        accuracy = accuracy_score(y_test, y_predicted)
        f1_scr = f1_score(y_test, y_predicted)
        prec_score = precision_score(y_test, y_predicted)
        matrix = confusion_matrix(y_test, y_predicted)
        acc.append(accuracy)
        f1.append(f1_scr)
        precision.append(prec_score)
        print(matrix)
        print()

    print(f"{mod[1]}, Accuracy:\t{np.mean(acc):.2f} +/- {np.std(acc):.2f}")
    print(f"{mod[1]}, F1 Score:\t{np.mean(f1):.2f} +/- {np.std(f1):.2f}")
    print(f"{mod[1]}, Precision:\t{np.mean(precision):.2f} +/- {np.std(precision):.2f}")
    print("-"*60)

[[176  10]
 [ 17 104]]

[[173  13]
 [ 15 106]]

[[176  10]
 [ 18 103]]

[[176  10]
 [ 17 104]]

[[177   9]
 [ 12 109]]

[[169  17]
 [ 16 105]]

[[171  15]
 [ 10 111]]

[[172  14]
 [ 16 105]]

[[172  14]
 [ 10 111]]

[[181   5]
 [ 11 110]]

[[173  13]
 [ 14 107]]

[[166  19]
 [ 10 111]]

[[157  28]
 [ 14 107]]

[[124  62]
 [ 26  94]]

[[157  29]
 [ 14 106]]

Decision Tree, Accuracy:	0.89 +/- 0.05
Decision Tree, F1 Score:	0.87 +/- 0.06
Decision Tree, Precision:	0.86 +/- 0.08
------------------------------------------------------------
[[179   7]
 [ 13 108]]

[[173  13]
 [ 11 110]]

[[177   9]
 [ 12 109]]

[[180   6]
 [ 17 104]]

[[182   4]
 [ 13 108]]

[[178   8]
 [  7 114]]

[[180   6]
 [  8 113]]

[[177   9]
 [  6 115]]

[[174  12]
 [  5 116]]

[[179   7]
 [ 10 111]]

[[183   3]
 [ 12 109]]

[[175  10]
 [ 11 110]]

[[164  21]
 [  9 112]]

[[167  19]
 [ 26  94]]

[[179   7]
 [ 17 103]]

MLP, Accuracy:	0.93 +/- 0.03
MLP, F1 Score:	0.91 +/- 0.03
MLP, Precision:	0.92 +/- 0.04
-------------