# Naïve Bayes

In [1]:
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## Base de dados para classificação de SPAM

Contains 48 continuous real [0, 100] attributes of type word_freq_WORD = percentage of words in the e-mail that match WORD,
i.e. 100 * (number of times the WORD appears in the e-mail) / total number of words in e-mail. A "word" in this case is any
string of alphanumeric characters bounded by non-alphanumeric characters or end-of-string.

6 continuous real [0, 100] attributes of type char_freq_CHAR = percentage of characters in the e-mail that match CHAR, i.e. 100 * (number of CHAR occurrences) / total characters in e-mail

1 continuous real attribute of type capital_run_length_average = average length of uninterrupted sequences of capital letters

1 continuous integer attribute of type capital_run_length_longest = length of longest uninterrupted sequence of capital letters

1 continuous integer attribute of type capital_run_length_total = sum of length of uninterrupted sequences of capital letters = total number of capital letters in the e-mail

1 nominal {0,1} class attribute of type spam = denotes whether the e-mail was considered spam (1) or not (0), i.e. unsolicited commercial e-mail.

Base disponível em: https://archive.ics.uci.edu/ml/datasets/spambase

In [2]:
# Parse the comma-separated spam data into a numeric array (one row per e-mail).
dataset = np.loadtxt('spam.csv', delimiter=',')

# Preview the first record to sanity-check the column layout.
print(dataset[0, :])

[  0.      0.64    0.64    0.      0.32    0.      0.      0.      0.
   0.      0.      0.64    0.      0.      0.      0.32    0.      1.29
   1.93    0.      0.96    0.      0.      0.      0.      0.      0.
   0.      0.      0.      0.      0.      0.      0.      0.      0.
   0.      0.      0.      0.      0.      0.      0.      0.      0.
   0.      0.      0.      0.      0.      0.      0.778   0.      0.
   3.756  61.    278.      1.   ]


In [3]:
# Features: the first 48 columns (presumably the word_freq_* attributes -- confirm
# against the column order of spam.csv). Target: the last column (the spam flag).
X, y = dataset[:, :48], dataset[:, -1]

# Hold out 40% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.40, random_state=17
)

In [4]:
# Multinomial Naive Bayes with default settings; fit() returns the estimator itself,
# so construction and training can be chained.
MultiNB = MultinomialNB().fit(X_train, y_train)
print(MultiNB)

# Accuracy on the held-out split.
y_expect, y_pred = y_test, MultiNB.predict(X_test)
print(accuracy_score(y_expect, y_pred))

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
0.8750678978815861


In [5]:
# Bernoulli Naive Bayes: features are thresholded at 0.0, i.e. each feature becomes
# "value > 0" vs. "value == 0" before fitting.
BernNB = BernoulliNB(binarize=0.0).fit(X_train, y_train)
print(BernNB)

# Accuracy on the held-out split.
y_expect, y_pred = y_test, BernNB.predict(X_test)
print(accuracy_score(y_expect, y_pred))

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
0.8837588267246062


In [6]:
# Gaussian Naive Bayes with default settings, trained on the same split.
GausNB = GaussianNB().fit(X_train, y_train)
print(GausNB)

# Accuracy on the held-out split.
y_expect, y_pred = y_test, GausNB.predict(X_test)
print(accuracy_score(y_expect, y_pred))

GaussianNB(priors=None, var_smoothing=1e-09)
0.8126018468223791


In [12]:
# Bernoulli Naive Bayes again, this time thresholding features at 0.2.
# Note: this rebinds BernNB, replacing the binarize=0.0 model fitted earlier.
BernNB = BernoulliNB(binarize=0.2).fit(X_train, y_train)
print(BernNB)

# Accuracy on the held-out split.
y_expect, y_pred = y_test, BernNB.predict(X_test)
print(accuracy_score(y_expect, y_pred))

BernoulliNB(alpha=1.0, binarize=0.2, class_prior=None, fit_prior=True)
0.8935361216730038
