In [12]:
import re
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer


In [13]:
# Chargement des données/Loading data
df = pd.read_csv("../data/spam.csv")
df = df.rename(columns=str.strip)
df.head()


Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [14]:
# Nettoyage des données/Data cleaning
df["Category"] = df["Category"].astype(str).str.lower().str.strip()
df["Message"]  = df["Message"].astype(str)

df = df.dropna(subset=["Category", "Message"])
df.head()


Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [15]:
# Encodage des labels/Label encoding
df["label"] = df["Category"].map({"ham": 0, "spam": 1})
df = df.dropna(subset=["label"])
df["label"] = df["label"].astype(int)

df[["Category", "label"]].head()


Unnamed: 0,Category,label
0,ham,0
1,ham,0
2,spam,1
3,ham,0
4,ham,0


In [16]:
#vérivication de l'encodage/verification of the encoding
print("Shape:", df.shape)
print(df["Category"].value_counts())
print(df["label"].value_counts())


Shape: (5572, 3)
Category
ham     4825
spam     747
Name: count, dtype: int64
label
0    4825
1     747
Name: count, dtype: int64


<ul>
<li>Normalisation des labels (lower/strip)</li>

<li>Suppression des lignes manquantes</li>

<li>Encodage ham=0, spam=1</li>

<li>Le texte sera vectorisé avec TF-IDF (approche standard en classification de texte)</li>
</ul>

In [None]:
# Division des données en ensembles d'entraînement et de test/Splitting data into training and testing sets
X = df["Message"]
y = df["label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train size:", len(X_train))
print("Test size :", len(X_test))
print("Train label distribution:\n", y_train.value_counts(normalize=True))
print("Test label distribution:\n", y_test.value_counts(normalize=True))


Train size: 4457
Test size : 1115
Train label distribution:
 label
0    0.865829
1    0.134171
Name: proportion, dtype: float64
Test label distribution:
 label
0    0.866368
1    0.133632
Name: proportion, dtype: float64


In [None]:
# Extraction des caractéristiques TF-IDF/TF-IDF feature extraction
tfidf_word = TfidfVectorizer(
    lowercase=True,
    stop_words="english",
    ngram_range=(1, 2),
    min_df=2
)

X_train_word = tfidf_word.fit_transform(X_train)
X_test_word  = tfidf_word.transform(X_test)

X_train_word.shape, X_test_word.shape


((4457, 7351), (1115, 7351))

In [None]:
# Optimized TF-IDF with spam pattern normalization
def normalize_spam_patterns(text: str) -> str:
    text = text.lower()
    text = re.sub(r"http\S+|www\.\S+", " __URL__ ", text)
    text = re.sub(r"\b[\w\.-]+@[\w\.-]+\.\w+\b", " __EMAIL__ ", text)
    text = re.sub(r"\b\d+(\.\d+)?\b", " __NUMBER__ ", text)
    text = re.sub(r"[$€£]\s*\d+", " __MONEY__ ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

tfidf_word_opt = TfidfVectorizer(
    preprocessor=normalize_spam_patterns,
    stop_words="english",
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95,
    sublinear_tf=True,
    strip_accents="unicode"
)

X_train_opt = tfidf_word_opt.fit_transform(X_train)
X_test_opt  = tfidf_word_opt.transform(X_test)

X_train_opt.shape, X_test_opt.shape


((4457, 7155), (1115, 7155))

In [None]:
# Caractères n-grammes/Character n-grams
tfidf_char = TfidfVectorizer(
    analyzer="char_wb",
    ngram_range=(3, 5),
    min_df=2
)

X_train_char = tfidf_char.fit_transform(X_train)
X_test_char  = tfidf_char.transform(X_test)

X_train_char.shape, X_test_char.shape


((4457, 34679), (1115, 34679))

In [21]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

nb = MultinomialNB()

# Test sur baseline word
nb.fit(X_train_word, y_train)
pred_word = nb.predict(X_test_word)
print("=== NB + TFIDF(word) ===")
print(classification_report(y_test, pred_word, target_names=["ham", "spam"]))

# Test sur word optimisé
nb.fit(X_train_opt, y_train)
pred_opt = nb.predict(X_test_opt)
print("\n=== NB + TFIDF(word OPT) ===")
print(classification_report(y_test, pred_opt, target_names=["ham", "spam"]))


=== NB + TFIDF(word) ===
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98       966
        spam       1.00      0.77      0.87       149

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115


=== NB + TFIDF(word OPT) ===
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98       966
        spam       1.00      0.80      0.89       149

    accuracy                           0.97      1115
   macro avg       0.98      0.90      0.94      1115
weighted avg       0.97      0.97      0.97      1115



In [22]:

import os

os.listdir("../data")


['spam.csv']

<h1>Conclusion</h1>
<p><strong>
Le dataset a été préparé en normalisant les labels, en encodant la cible (ham=0, spam=1), puis en effectuant un split stratifié.
Le texte a été transformé en features numériques via TF-IDF.
Une optimisation “spam-aware” a été testée en normalisant les URLs/emails/nombres, et une approche char-level a également été préparée pour améliorer la robustesse face aux variantes de spam.
</strong></p>