# Spam classification 

In [2]:
import requests
import zipfile
import io

# URL of the dataset
url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
# Download the dataset
response = requests.get(url)
if response.status_code == 200:
    print("Download successful")
else:
    print("Failed to download the dataset")

Download successful


In [3]:
# Extract the dataset
with zipfile.ZipFile(io.BytesIO(response.content)) as z:
    z.extractall("sms_spam_collection")
    print("Extraction successful")

Extraction successful


In [4]:
import os

# List the extracted files
extracted_files = os.listdir("sms_spam_collection")
print("Extracted files:", extracted_files)

Extracted files: ['readme', 'SMSSpamCollection']


In [5]:
import pandas as pd

# Load the dataset
df = pd.read_csv(
    "sms_spam_collection/SMSSpamCollection",
    sep="\t",
    header=None,
    names=["label", "message"],
)
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
df.label.unique()

array(['ham', 'spam'], dtype=object)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [8]:
# Check for duplicates
print("Duplicate entries:", df.duplicated().sum())

# Remove duplicates if any
df = df.drop_duplicates()

Duplicate entries: 403


## Processing data

In [9]:
import nltk

# Download the necessary NLTK data files
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("stopwords")

print("=== BEFORE ANY PREPROCESSING ===") 
print(df.head(5))

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [None]:
# Convert all message text to lowercase
df["message"] = df["message"].str.lower()
print("\n=== AFTER LOWERCASING ===")
print(df["message"].head(5))


=== AFTER LOWERCASING ===
0    go until jurong point, crazy.. available only ...
1                        ok lar... joking wif u oni...
2    free entry in 2 a wkly comp to win fa cup fina...
3    u dun say so early hor... u c already then say...
4    nah i don't think he goes to usf, he lives aro...
Name: message, dtype: object


In [None]:
import re

# Remove non-essential punctuation and numbers, keep useful symbols like $ and !
df["message"] = df["message"].apply(lambda x: re.sub(r"[^a-z\s$!]", "", x))
print("\n=== AFTER REMOVING PUNCTUATION & NUMBERS (except $ and !) ===")
print(df["message"].head(5))


=== AFTER REMOVING PUNCTUATION & NUMBERS (except $ and !) ===
0    go until jurong point crazy available only in ...
1                              ok lar joking wif u oni
2    free entry in  a wkly comp to win fa cup final...
3          u dun say so early hor u c already then say
4    nah i dont think he goes to usf he lives aroun...
Name: message, dtype: object


In [None]:
from nltk.tokenize import word_tokenize

# Split each message into individual tokens
df["message"] = df["message"].apply(word_tokenize)
print("\n=== AFTER TOKENIZATION ===")
print(df["message"].head(5))


=== AFTER TOKENIZATION ===
0    [go, until, jurong, point, crazy, available, o...
1                       [ok, lar, joking, wif, u, oni]
2    [free, entry, in, a, wkly, comp, to, win, fa, ...
3    [u, dun, say, so, early, hor, u, c, already, t...
4    [nah, i, dont, think, he, goes, to, usf, he, l...
Name: message, dtype: object


In [None]:
from nltk.corpus import stopwords

# Define a set of English stop words and remove them from the tokens
stop_words = set(stopwords.words("english"))
df["message"] = df["message"].apply(lambda x: [word for word in x if word not in stop_words])
print("\n=== AFTER REMOVING STOP WORDS ===")
print(df["message"].head(5))


=== AFTER REMOVING STOP WORDS ===
0    [go, jurong, point, crazy, available, bugis, n...
1                       [ok, lar, joking, wif, u, oni]
2    [free, entry, wkly, comp, win, fa, cup, final,...
3        [u, dun, say, early, hor, u, c, already, say]
4    [nah, dont, think, goes, usf, lives, around, t...
Name: message, dtype: object


In [None]:
from nltk.stem import PorterStemmer

# Stem each token to reduce words to their base form
stemmer = PorterStemmer()
df["message"] = df["message"].apply(lambda x: [stemmer.stem(word) for word in x])
print("\n=== AFTER STEMMING ===")
print(df["message"].head(5))


=== AFTER STEMMING ===
0    [go, jurong, point, crazi, avail, bugi, n, gre...
1                         [ok, lar, joke, wif, u, oni]
2    [free, entri, wkli, comp, win, fa, cup, final,...
3        [u, dun, say, earli, hor, u, c, alreadi, say]
4    [nah, dont, think, goe, usf, live, around, tho...
Name: message, dtype: object


In [None]:
# Rejoin tokens into a single string for feature extraction
df["message"] = df["message"].apply(lambda x: " ".join(x))
print("\n=== AFTER JOINING TOKENS BACK INTO STRINGS ===")
print(df["message"].head(5))


=== AFTER JOINING TOKENS BACK INTO STRINGS ===
0    go jurong point crazi avail bugi n great world...
1                                ok lar joke wif u oni
2    free entri wkli comp win fa cup final tkt st m...
3                  u dun say earli hor u c alreadi say
4            nah dont think goe usf live around though
Name: message, dtype: object


## Feature Extraction

On this case we are going to transform the words into numerical vectors.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize CountVectorizer with bigrams, min_df, and max_df to focus on relevant terms
vectorizer = CountVectorizer(min_df=1, max_df=0.9, ngram_range=(1, 2))

# Fit and transform the message column
X = vectorizer.fit_transform(df["message"])

# Labels (target variable)
y = df["label"].apply(lambda x: 1 if x == "spam" else 0)  # Converting labels to 1 and 0

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Build the pipeline by combining vectorization and classification
pipeline = Pipeline([
    ("vectorizer", vectorizer),
    ("classifier", MultinomialNB())
])

In [None]:
# Define the parameter grid for hyperparameter tuning
param_grid = {
    "classifier__alpha": [0.01, 0.1, 0.15, 0.2, 0.25, 0.5, 0.75, 1.0]
}

# Perform the grid search with 5-fold cross-validation and the F1-score as metric
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring="f1"
)

# Fit the grid search on the full dataset
grid_search.fit(df["message"], y)

# Extract the best model identified by the grid search
best_model = grid_search.best_estimator_
print("Best model parameters:", grid_search.best_params_)

Best model parameters: {'classifier__alpha': 0.25}


## evaluation

In [None]:
# Example SMS messages for evaluation
new_messages = [
    "Congratulations! You've won a $1000 Walmart gift card. Go to http://bit.ly/1234 to claim now.",
    "Hey, are we still meeting up for lunch today?",
    "Urgent! Your account has been compromised. Verify your details here: www.fakebank.com/verify",
    "Reminder: Your appointment is scheduled for tomorrow at 10am.",
    "FREE entry in a weekly competition to win an iPad. Just text WIN to 80085 now!",
]

In [None]:
import numpy as np
import re

# Preprocess function that mirrors the training-time preprocessing
def preprocess_message(message):
    message = message.lower()
    message = re.sub(r"[^a-z\s$!]", "", message)
    tokens = word_tokenize(message)
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [stemmer.stem(word) for word in tokens]
    return " ".join(tokens)

In [None]:
# Preprocess and vectorize messages
processed_messages = [preprocess_message(msg) for msg in new_messages]

In [None]:
# Transform preprocessed messages into feature vectors
X_new = best_model.named_steps["vectorizer"].transform(processed_messages)

In [None]:
# Predict with the trained classifier
predictions = best_model.named_steps["classifier"].predict(X_new)
prediction_probabilities = best_model.named_steps["classifier"].predict_proba(X_new)

In [None]:
# Display predictions and probabilities for each evaluated message
for i, msg in enumerate(new_messages):
    prediction = "Spam" if predictions[i] == 1 else "Not-Spam"
    spam_probability = prediction_probabilities[i][1]  # Probability of being spam
    ham_probability = prediction_probabilities[i][0]   # Probability of being not spam
    
    print(f"Message: {msg}")
    print(f"Prediction: {prediction}")
    print(f"Spam Probability: {spam_probability:.2f}")
    print(f"Not-Spam Probability: {ham_probability:.2f}")
    print("-" * 50)

Message: Congratulations! You've won a $1000 Walmart gift card. Go to http://bit.ly/1234 to claim now.
Prediction: Spam
Spam Probability: 1.00
Not-Spam Probability: 0.00
--------------------------------------------------
Message: Hey, are we still meeting up for lunch today?
Prediction: Not-Spam
Spam Probability: 0.00
Not-Spam Probability: 1.00
--------------------------------------------------
Message: Urgent! Your account has been compromised. Verify your details here: www.fakebank.com/verify
Prediction: Spam
Spam Probability: 0.96
Not-Spam Probability: 0.04
--------------------------------------------------
Message: Reminder: Your appointment is scheduled for tomorrow at 10am.
Prediction: Not-Spam
Spam Probability: 0.00
Not-Spam Probability: 1.00
--------------------------------------------------
Message: FREE entry in a weekly competition to win an iPad. Just text WIN to 80085 now!
Prediction: Spam
Spam Probability: 1.00
Not-Spam Probability: 0.00
----------------------------------

## Saving the model with joblib

In [None]:
import joblib

# Save the trained model to a file for future use
model_filename = 'spam_detection_model.joblib'
joblib.dump(best_model, model_filename)

print(f"Model saved to {model_filename}")

Model saved to spam_detection_model.joblib


## loading the model

In [None]:
loaded_model = joblib.load(model_filename)
predictions = loaded_model.predict(new_messages)

In [None]:
loaded_model.predict(["Urgent! my acccount has been hacked!"])

array([1])

## Deep learning 

https://huggingface.co/Goodmotion/spam-mail-classifier

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "Goodmotion/spam-mail-classifier"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

texts = [
'Join us for a webinar on AI innovations',
'Urgent: Verify your account immediately.',
'Meeting rescheduled to 3 PM',
'Happy Birthday!',
'Limited time offer: Act now!',
'Join us for a webinar on AI innovations',
'Claim your free prize now!',
'You have unclaimed rewards waiting!',
'Weekly newsletter from Tech World',
'Update on the project status',
'Lunch tomorrow at 12:30?',
'Get rich quick with this amazing opportunity!',
'Invoice for your recent purchase',
'Don\'t forget: Gym session at 6 AM',
'Join us for a webinar on AI innovations',
'bonjour comment allez vous ?',
'Documents suite à notre rendez-vous',
'Valentin Dupond mentioned you in a comment',
'Bolt x Supabase = 🤯',
'Modification site web de la société',
'Image de mise en avant sur les articles',
'Bring new visitors to your site',
'Le Cloud Éthique sans bullshit',
'Remix Newsletter #25: React Router v7',
'Votre essai auprès de X va bientôt prendre fin',
'Introducing a Google Docs integration, styles and more in Claude.ai',
'Carte de crédit sur le point d’expirer sur Cloudflare'
]
inputs = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
with torch.no_grad():  # Désactiver le calcul des gradients pour l'inférence
    outputs = model(**inputs)

# Convertir les logits en probabilités avec softmax
logits = outputs.logits
probabilities = torch.softmax(logits, dim=1)

# Décoder les classes pour chaque texte
labels = ["NOSPAM", "SPAM"]  # Mapping des indices à des labels
results = [
    {"text": text, "label": labels[torch.argmax(prob).item()], "confidence": prob.max().item()}
    for text, prob in zip(texts, probabilities)
]

# Afficher les résultats
for result in results:
    print(f"Texte : {result['text']}")
    print(f"Résultat : {result['label']} (Confiance : {result['confidence']:.2%})\n")



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.4 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/curcuqui/Documents/GitHub/Workshops/.venv/lib/python3.12/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/Users/curcuqui/Documents/GitHub/Workshops/.venv/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/Users/curcuqui/Documents/GitHub/Workshops/.venv/lib/python3.12/site-packages/ipykernel/

Texte : Join us for a webinar on AI innovations
Résultat : NOSPAM (Confiance : 99.91%)

Texte : Urgent: Verify your account immediately.
Résultat : SPAM (Confiance : 99.92%)

Texte : Meeting rescheduled to 3 PM
Résultat : NOSPAM (Confiance : 99.91%)

Texte : Happy Birthday!
Résultat : NOSPAM (Confiance : 99.91%)

Texte : Limited time offer: Act now!
Résultat : SPAM (Confiance : 99.92%)

Texte : Join us for a webinar on AI innovations
Résultat : NOSPAM (Confiance : 99.91%)

Texte : Claim your free prize now!
Résultat : SPAM (Confiance : 99.92%)

Texte : You have unclaimed rewards waiting!
Résultat : SPAM (Confiance : 99.92%)

Texte : Weekly newsletter from Tech World
Résultat : NOSPAM (Confiance : 99.91%)

Texte : Update on the project status
Résultat : NOSPAM (Confiance : 99.91%)

Texte : Lunch tomorrow at 12:30?
Résultat : NOSPAM (Confiance : 99.91%)

Texte : Get rich quick with this amazing opportunity!
Résultat : SPAM (Confiance : 99.92%)

Texte : Invoice for your recent purchase
Ré

In [2]:
inputs = tokenizer(texts[0], padding=True, truncation=True, max_length=128, return_tensors="pt")

with torch.no_grad():  # Désactiver le calcul des gradients pour l'inférence
    outputs = model(**inputs)
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[ 3.6659, -3.3959]]), hidden_states=None, attentions=None)

In [3]:
texts[0]

'Join us for a webinar on AI innovations'

In [4]:
logits = outputs.logits
probabilities = torch.softmax(logits, dim=1)

# Décoder les classes pour chaque texte
labels = ["NOSPAM", "SPAM"]  # Mapping des indices à des labels
results = [
    {"text": text, "label": labels[torch.argmax(prob).item()], "confidence": prob.max().item()}
    for text, prob in zip(texts, probabilities)
]
results

[{'text': 'Join us for a webinar on AI innovations',
  'label': 'NOSPAM',
  'confidence': 0.9991434812545776}]