In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Load the raw dataset and tidy it in a single chain: read the CSV,
# discard the three 'Unnamed: N' columns, and give the two remaining
# columns descriptive names ('categoria' = label, 'mensagens' = text).
emails = (
    pd.read_csv('/content/spam.csv', encoding='latin-1')
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
    .rename(columns={'v1': 'categoria', 'v2': 'mensagens'})
)
print(emails)

     categoria                                          mensagens
0          ham  Go until jurong point, crazy.. Available only ...
1          ham                      Ok lar... Joking wif u oni...
2         spam  Free entry in 2 a wkly comp to win FA Cup fina...
3          ham  U dun say so early hor... U c already then say...
4          ham  Nah I don't think he goes to usf, he lives aro...
...        ...                                                ...
5567      spam  This is the 2nd time we have tried 2 contact u...
5568       ham              Will Ì_ b going to esplanade fr home?
5569       ham  Pity, * was in mood for that. So...any other s...
5570       ham  The guy did some bitching but I acted like i'd...
5571       ham                         Rofl. Its true to its name

[5572 rows x 2 columns]


In [3]:
# Add a numeric target column named 'spam': 1 when 'categoria' is 'spam',
# 0 for any other value.
# A vectorized comparison is used instead of a per-row Python lambda;
# the boolean result is cast to int64 so the labels are plain integers.
emails['spam'] = emails['categoria'].eq('spam').astype('int64')
print(emails)

     categoria                                          mensagens  spam
0          ham  Go until jurong point, crazy.. Available only ...     0
1          ham                      Ok lar... Joking wif u oni...     0
2         spam  Free entry in 2 a wkly comp to win FA Cup fina...     1
3          ham  U dun say so early hor... U c already then say...     0
4          ham  Nah I don't think he goes to usf, he lives aro...     0
...        ...                                                ...   ...
5567      spam  This is the 2nd time we have tried 2 contact u...     1
5568       ham              Will Ì_ b going to esplanade fr home?     0
5569       ham  Pity, * was in mood for that. So...any other s...     0
5570       ham  The guy did some bitching but I acted like i'd...     0
5571       ham                         Rofl. Its true to its name     0

[5572 rows x 3 columns]


In [4]:
# Split messages (features) and the 0/1 'spam' labels into training and
# test sets with an 80/20 ratio.
# - random_state pins the shuffle so Restart & Run All always yields the
#   same split, and therefore the same downstream statistics and score.
# - stratify keeps the spam/ham proportion equal in both partitions.
# NOTE: outputs recorded from an earlier, unseeded run may differ slightly
# from what this seeded split produces on re-execution.
x_treino, x_teste, y_treino, y_teste = train_test_split(
    emails.mensagens,
    emails.spam,
    test_size=0.20,
    random_state=42,
    stratify=emails.spam,
)

In [5]:
# Summary of the training messages: number of rows, number of distinct
# messages, the most frequent message and how often it occurs.
x_treino.describe()

count                       4457
unique                      4175
top       Sorry, I'll call later
freq                          24
Name: mensagens, dtype: object

In [6]:
# Summary of the test messages: number of rows, number of distinct
# messages, the most frequent message and how often it occurs.
x_teste.describe()

count                       1115
unique                      1092
top       Sorry, I'll call later
freq                           6
Name: mensagens, dtype: object

In [7]:
# Turn the training messages into a bag-of-words count matrix.
# fit_transform learns the vocabulary from the training set and returns
# the document-term matrix (one row per message, one column per token).
cv = CountVectorizer()
x_treino_num = cv.fit_transform(x_treino.to_numpy())
x_treino_num

<4457x7687 sparse matrix of type '<class 'numpy.int64'>'
	with 58648 stored elements in Compressed Sparse Row format>

In [8]:
# Train a multinomial Naive Bayes classifier on the vectorized training
# messages and their 0/1 labels.
# fit() returns the estimator itself, so construction and training are
# chained; the bare name on the last line keeps the fitted model as the
# cell's displayed output.
modelo = MultinomialNB().fit(x_treino_num, y_treino)
modelo

In [9]:
# Sanity check with a hand-written message that should be classified as ham.
# The text goes through the already-fitted vectorizer (transform, not
# fit_transform) so it is encoded with the training vocabulary.
# Expected prediction: 0 (the 'spam' column encodes spam as 1, anything else as 0).
email_ham = ["do you go to IA class?"]
email_ham_num = cv.transform(email_ham)
modelo.predict(email_ham_num)

array([0])

In [10]:
# Sanity check with a hand-written message that should be classified as spam.
# The text goes through the already-fitted vectorizer (transform, not
# fit_transform) so it is encoded with the training vocabulary.
# Expected prediction: 1 (the 'spam' column encodes spam as 1, anything else as 0).
email_spam = ["click here for reward"]
email_spam_num = cv.transform(email_spam)
modelo.predict(email_spam_num)

array([1])

In [11]:
# Evaluate on the held-out test set: encode the test messages with the
# vocabulary learned from the training data, then report the model's
# mean accuracy against the true labels.
# NOTE(review): accuracy alone can look optimistic if ham dominates the
# data — consider also checking precision/recall for the spam class.
x_teste_num = cv.transform(x_teste)
modelo.score(x_teste_num, y_teste)

0.9865470852017937