# By OUAMBA Fred Harisson

In [56]:
#Importation des bibliothèques
import pandas as pd

In [57]:
#Chargement du jeu de données
mails = pd.read_csv(r'spam_ham_dataset.csv')
mails.head(8)

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0
5,2949,ham,Subject: ehronline web address change\r\nthis ...,0
6,2793,ham,Subject: spring savings certificate - take 30 ...,0
7,4185,spam,Subject: looking for medication ? we ` re the ...,1


In [58]:
#Vérification des valeurs manquantes
mails.isna().sum(axis = 0)

Unnamed: 0    0
label         0
text          0
label_num     0
dtype: int64

In [59]:
#Suppression la colonne inutile
mails.drop('Unnamed: 0', axis =1, inplace= True)

In [60]:
#Objet/Modèle permettant de construire le vocabulaire
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(stop_words='english', analyzer='word')

In [61]:
#Calcul des inputs et labels
X = mails.loc[:,'text']
y = mails.loc[:,'label_num']

In [62]:
#Découpage du dataset en données d'entrainement et de test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.15, random_state= 20)

In [63]:
#Construction du vocabulaire
vectorizer.fit(X_train)
print(len(vectorizer.get_feature_names()))

47173


In [64]:
#Calcul de la matrice des données basée sur le vocabulaire
X_train2 = vectorizer.transform(X_train)
X_train2.toarray()

array([[7, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [65]:
#Construction d'un GridSearch pour construire le meilleur modèle de régression logistique
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

params = {
    "C" : [1.0,2.0,3.0],
    "penalty": ["l2"],
    "solver": ["lbfgs","liblinear"]
}
lr = LogisticRegression()
gs = GridSearchCV(lr, params, cv=4)
gs.fit(X_train2, y_train)

GridSearchCV(cv=4, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [1.0, 2.0, 3.0], 'penalty': ['l2'],
                         'solver': ['lbfgs', 'liblinear']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [66]:
#Construction du modèle de régression logistique optimal
lr= LogisticRegression(C=1.0, penalty ="l2", solver="liblinear")
lr.fit(X_train2,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [67]:
#Calcul de la précision sur les données d'entrainement
sc1 = lr.score(X_train2,y_train)
print("Score sur les données d'entrainement:",sc1)

Score sur les données d'entrainement: 0.9995449374288965


In [68]:
#Calcul de la précision sur les données de test
X_test2 = vectorizer.transform(X_test)
sc2 = lr.score(X_test2, y_test)
print("Score sur les données d'entrainement:",sc2)

Score sur les données d'entrainement: 0.9832474226804123


In [69]:
def class_spam_function(cv : CountVectorizer, lr : LogisticRegression, mail: str):
  mail2 = cv.transform([mail])
  lab = lr.predict(mail2)
  print("Spam" if lab is 1 else "Non Spam")

In [70]:
mail ="Date: Fri, 29 Oct 2021 17:40:26 +0100 \n From: Marianna Jenkins <1n4fs0j6oal@gmx.com> \n Subject: Re: Hi Fred \nThread-Topic: Hi Fred \n Message-ID: \n <trinity-87a6c4f5-4a9b-4e44-9633-5515032a923a-1635525625432@3c-app-mailcom-bs06> \n To: \"harissonfred2@gmail.com\" <harissonfred2@gmail.com> \n Content-Transfer-Encoding: quoted-printable \n Content-Type: text/html; charset=\"utf-8\""
class_spam_function(vectorizer, lr, mail)

Non Spam


In [71]:
#A la base, c'est un mail spam que j'ai pris dans ma boite.
#Le résultat obtenu prouve que le dataset est assez bon et qu'il
#pourrait servir de généralité

In [53]:
import pickle
pickle.dump(vectorizer,open("Cvmodel.sav",'wb'))
