In [65]:
import numpy as np 
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string
import pickle

In [21]:
raw_mail_data = pd.read_csv('./mail_data.csv')
print(raw_mail_data)

     Category                                            Message
0         ham  Go until jurong point, crazy.. Available only ...
1         ham                      Ok lar... Joking wif u oni...
2        spam  Free entry in 2 a wkly comp to win FA Cup fina...
3         ham  U dun say so early hor... U c already then say...
4         ham  Nah I don't think he goes to usf, he lives aro...
...       ...                                                ...
5567     spam  This is the 2nd time we have tried 2 contact u...
5568      ham               Will ü b going to esplanade fr home?
5569      ham  Pity, * was in mood for that. So...any other s...
5570      ham  The guy did some bitching but I acted like i'd...
5571      ham                         Rofl. Its true to its name

[5572 rows x 2 columns]


In [22]:
mail_data = raw_mail_data.where((pd.notnull(raw_mail_data)),'')

In [23]:
mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [24]:
mail_data.shape

(5572, 2)

In [25]:
mail_data.loc[mail_data['Category'] == 'spam', 'Category',] = 0
mail_data.loc[mail_data['Category'] == 'ham', 'Category',] = 1

In [26]:
mail_data.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [27]:
mail_data.columns

Index(['Category', 'Message'], dtype='object')

In [29]:
mail_data.drop_duplicates(inplace = True)

In [30]:
mail_data.shape

(5157, 2)

In [31]:
mail_data.isnull().sum()

Category    0
Message     0
dtype: int64

In [32]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\HA
[nltk_data]     GROUP\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [35]:
def process_text(text):
    nopunc = [char for char in text if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    
    
    clean_words = [word for word in nopunc.split() if word.lower not in stopwords.words('english')]
    
    return clean_words

In [36]:
mail_data['Message'].head().apply(process_text)

0    [Go, until, jurong, point, crazy, Available, o...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, in, 2, a, wkly, comp, to, win, F...
3    [U, dun, say, so, early, hor, U, c, already, t...
4    [Nah, I, dont, think, he, goes, to, usf, he, l...
Name: Message, dtype: object

In [38]:
#convertir une collection de text a une matrice de token 
from sklearn.feature_extraction.text import CountVectorizer
message_bow = CountVectorizer(analyzer=process_text).fit_transform(mail_data['Message'])

In [40]:
# 80% 20%
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(message_bow, mail_data['Category'], test_size=0.20, random_state = 0)



In [49]:
mail_data.head()

array([1, 0, 1, ..., 1, 1, 1])

In [47]:
#Naive Bayes Classifier
y_train = np.array(y_train, dtype=int)
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB().fit(X_train,y_train)



In [52]:
print(classifier.predict(X_train))
print(y_train)

[1 0 1 ... 1 1 1]
[1 0 1 ... 1 1 1]


In [56]:
#evaluation
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
pred = classifier.predict(X_train)
print(classification_report(y_train, pred))
print("\nConfusion Matrics:\n", confusion_matrix(y_train, pred))
print("\nAccuracy:\n", accuracy_score(y_train,pred))

              precision    recall  f1-score   support

           0       0.97      0.95      0.96       506
           1       0.99      1.00      0.99      3619

    accuracy                           0.99      4125
   macro avg       0.98      0.97      0.98      4125
weighted avg       0.99      0.99      0.99      4125


Confusion Matrics:
 [[ 482   24]
 [  14 3605]]

Accuracy:
 0.9907878787878788


In [60]:
print(classifier.predict(X_test))
print(y_test.values)

[1 1 1 ... 1 1 1]
[1 1 1 ... 1 1 1]


In [66]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
y_test=np.array(y_test,dtype=int)
pred = classifier.predict(X_test)
print(classification_report(y_test, pred))
print("\nConfusion Matrics:\n", confusion_matrix(y_test, pred))
print("\nAccuracy:\n", accuracy_score(y_test,pred))

filename = "navebayesModel"
pickle.dump(classifier,open(filename,'wb'))

              precision    recall  f1-score   support

           0       0.86      0.92      0.89       135
           1       0.99      0.98      0.98       897

    accuracy                           0.97      1032
   macro avg       0.92      0.95      0.93      1032
weighted avg       0.97      0.97      0.97      1032


Confusion Matrics:
 [[124  11]
 [ 21 876]]

Accuracy:
 0.9689922480620154


In [67]:
loaded_model = pickle.load(open(filename,"rb"))
loaded_model.predict(X_test)

array([1, 1, 1, ..., 1, 1, 1])