In [1]:
# Prétraitement des messages pour la détection de spam
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
df = pd.read_csv("../data/spam.csv")
df.head()


Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
print("Taille du dataset :", df.shape)
print("Valeurs manquantes :\n", df.isnull().sum())

Taille du dataset : (5572, 2)
Valeurs manquantes :
 Category    0
Message     0
dtype: int64


In [4]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer


In [5]:
# Vérification
print(df['Message'].head())

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: Message, dtype: object


In [6]:
# Séparer les données en Train / Test
X = df['Message']
y = df['Category']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42
)

In [7]:
# ===============================
# Prétraitement avancé des messages
# ===============================

import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Télécharger les stopwords si nécessaire
nltk.download('stopwords')

# Charger le dataset
df = pd.read_csv("../data/spam.csv")

# Initialiser stopwords et stemmer
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

# Fonction de prétraitement
def preprocess_text(text):
    # 1. Convertir en minuscules
    text = text.lower()
    # 2. Supprimer ponctuation et caractères spéciaux
    text = re.sub(r'\W', ' ', text)
    # 3. Supprimer les chiffres
    text = re.sub(r'\d', '', text)
    # 4. Supprimer les espaces en trop
    text = text.strip()
    # 5. Supprimer les stopwords et appliquer le stemming
    words = text.split()
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    return ' '.join(words)

# Appliquer la fonction sur la colonne 'message'
df['Message'] = df['Message'].apply(preprocess_text)

# Vérification
print(df['Message'].head())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Poste\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0    go jurong point crazi avail bugi n great world...
1                                ok lar joke wif u oni
2    free entri wkli comp win fa cup final tkt st m...
3                  u dun say earli hor u c alreadi say
4                 nah think goe usf live around though
Name: Message, dtype: object


In [8]:
print("Nombre de messages d'entraînement :", X_train.shape[0])
print("Nombre de messages de test :", X_test.shape[0])

Nombre de messages d'entraînement : 4457
Nombre de messages de test : 1115


In [9]:
# Vectorisation (TF-IDF)
vectorizer = TfidfVectorizer()
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

In [10]:
print("Shape X_train_vect :", X_train_vect.shape)
print("Shape X_test_vect :", X_test_vect.shape)

Shape X_train_vect : (4457, 7701)
Shape X_test_vect : (1115, 7701)


In [11]:
# Sauvegarder le dataset prétraité
df.to_csv("../data/spam_clean.csv", index=False)


In [13]:
df.head()

Unnamed: 0,Category,Message
0,ham,go jurong point crazi avail bugi n great world...
1,ham,ok lar joke wif u oni
2,spam,free entri wkli comp win fa cup final tkt st m...
3,ham,u dun say earli hor u c alreadi say
4,ham,nah think goe usf live around though


In [15]:
import pickle

# Charger le meilleur modèle
with open("../models/best_model.pkl", "rb") as f:
    model = pickle.load(f)

# Charger le vectorizer
with open("../models/vectorizer.pkl", "rb") as f:
    vectorizer = pickle.load(f)

# Exemples de nouveaux messages
new_messages = [
    "Congratulations! You have won a free iPhone!",
    "Hey, are we still meeting at 6 PM?",
    "Get cheap loans instantly, apply now!"
]

# Transformer les messages avec le même vectorizer
new_vect = vectorizer.transform(new_messages)

# Prédictions
predictions = model.predict(new_vect)

# Affichage
for msg, pred in zip(new_messages, predictions):
    print(f"Message: {msg}")
    print(f"Prediction: {pred}\n")



Message: Congratulations! You have won a free iPhone!
Prediction: ham

Message: Hey, are we still meeting at 6 PM?
Prediction: ham

Message: Get cheap loans instantly, apply now!
Prediction: ham

