### Imports

In [6]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

from sklearn.svm import SVC

### Data Loading

In [7]:
df = pd.read_csv('sms+spam+collection/SMSSpamCollection', sep='\t', header=None, names=['label', 'message'])

# Convert labels to numerical values
df['label_num'] = df['label'].map({'ham': 0, 'spam': 1})
# Display the distribution of labels
print(f"Original distribution :\n{df['label'].value_counts(normalize=True)}")

#delete column 'label'
df = df.drop(columns=['label'])
print(df.head())

Original distribution :
label
ham     0.865937
spam    0.134063
Name: proportion, dtype: float64
                                             message  label_num
0  Go until jurong point, crazy.. Available only ...          0
1                      Ok lar... Joking wif u oni...          0
2  Free entry in 2 a wkly comp to win FA Cup fina...          1
3  U dun say so early hor... U c already then say...          0
4  Nah I don't think he goes to usf, he lives aro...          0


### 1. First, set aside a large TEST set (Validation) that we will NEVER touch during training

In [8]:
X_remaining, X_test, y_remaining, y_test = train_test_split(
    df['message'], df['label_num'], test_size=0.2, random_state=42, stratify=df['label_num']
)

### 2. Now, create a TINY training set from the remaining data
Let's say we only want 50 real examples to train on to make it "hard"

In [21]:
TRAIN_SIZE = 40 
X_train_real, _, y_train_real, _ = train_test_split(
    X_remaining, y_remaining, train_size=TRAIN_SIZE, random_state=42, stratify=y_remaining
)

print(f"Real Training Size: {len(X_train_real)}")
print(f"Test Set Size: {len(X_test)}")

Real Training Size: 40
Test Set Size: 1115


### Vectorization

In [22]:
# Create Vectorizer (TF-IDF)
# Note: stop_words='english' handles the preprocessing requirement
vectorizer = TfidfVectorizer(stop_words='english')

# Fit on TRAIN, transform TEST
X_train_real_vec = vectorizer.fit_transform(X_train_real)
X_test_vec = vectorizer.transform(X_test)

### Plot


In [23]:
# 1. FORCER LE MODE SANS ECRAN (Doit être fait avant tout autre import matplotlib)
import matplotlib
matplotlib.use('Agg')  # Force le mode sans interface graphique
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# 2. Réduction de dimension (PCA)
pca = PCA(n_components=2)
# X_train_real_vec est votre matrice TF-IDF issue de l'étape précédente
X_reduced = pca.fit_transform(X_train_real_vec.toarray())

# 3. Création du graphique
plt.figure(figsize=(10, 7))

# Scatter plot
scatter = plt.scatter(X_reduced[:,0], X_reduced[:,1], 
                      c=y_train_real, cmap='coolwarm', alpha=0.7, edgecolors='k')

plt.title('Visualisation 2D des SMS (PCA)')
plt.xlabel('Composante 1')
plt.ylabel('Composante 2')
plt.grid(True, alpha=0.3)

# 4. SAUVEGARDE (Au lieu de show)
# L'image sera enregistrée dans le même dossier que votre script/notebook
plt.savefig("resultat_pca_.png")
plt.xlim(-0.15, 0.0)
plt.ylim(-0.1, 0.1)
plt.savefig("resultat_pca_focus.png")
print(f"✅ Chart saved as 'resultat_pca_.png'.")


✅ Chart saved as 'resultat_pca_.png'.


### SVM Classifier

In [24]:
# Train Classifier (SVM is robust for small data)
clf = SVC(kernel='linear')
clf.fit(X_train_real_vec, y_train_real)

# Evaluate
y_pred = clf.predict(X_test_vec)

# Metrics: Use F1-score or MCC because classes are unbalanced!
print("--- Baseline Results ---")
print(classification_report(y_test, y_pred))
#print(f"MCC Score: {matthews_corrcoef(y_test, y_pred)}")

--- Baseline Results ---
              precision    recall  f1-score   support

           0       0.87      1.00      0.93       966
           1       1.00      0.04      0.08       149

    accuracy                           0.87      1115
   macro avg       0.94      0.52      0.50      1115
weighted avg       0.89      0.87      0.82      1115

