In [1]:
!pip install gensim



In [2]:
import gensim

In [3]:
from gensim.models import Word2Vec, KeyedVectors

In [4]:
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')

In [5]:
import pandas as pd

In [6]:
df = pd.read_csv("SMSSpamCollection.csv", sep="\t", names=["label", "message"])

In [7]:
df.shape

(5572, 2)

In [8]:
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
df.tail()

Unnamed: 0,label,message
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [10]:
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [11]:
from nltk.stem import WordNetLemmatizer

In [12]:
lemmatizer = WordNetLemmatizer()

In [13]:
import re
from nltk.corpus import stopwords

In [14]:
stop_words = set(stopwords.words("english"))

In [19]:
from nltk import word_tokenize

In [20]:
def preprocess_text(text):
  """
  Text preprocessing
  """
  text = str(text).lower()
  # Remove special case chars but keep the spaces
  text = re.sub(r'[^a-zA-Z\s]', ' ', text)

  # Remove extra white spaces
  text = re.sub(r'\s+', ' ', text).strip()

  # Tokenize
  tokens = word_tokenize(text)

  tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words and len(word) > 2]

  return tokens

In [21]:
processed_texts = []

In [22]:
for i in range(len(df)):
  tokens = preprocess_text(df['message'][i])
  processed_texts.append(tokens)

In [25]:
processed_texts

[['jurong',
  'point',
  'crazy',
  'available',
  'bugis',
  'great',
  'world',
  'buffet',
  'cine',
  'got',
  'amore',
  'wat'],
 ['lar', 'joking', 'wif', 'oni'],
 ['free',
  'entry',
  'wkly',
  'comp',
  'win',
  'cup',
  'final',
  'tkts',
  'may',
  'text',
  'receive',
  'entry',
  'question',
  'std',
  'txt',
  'rate',
  'apply'],
 ['dun', 'say', 'early', 'hor', 'already', 'say'],
 ['nah', 'think', 'go', 'usf', 'life', 'around', 'though'],
 ['freemsg',
  'hey',
  'darling',
  'week',
  'word',
  'back',
  'like',
  'fun',
  'still',
  'xxx',
  'std',
  'chgs',
  'send',
  'rcv'],
 ['even', 'brother', 'like', 'speak', 'treat', 'like', 'aid', 'patent'],
 ['per',
  'request',
  'melle',
  'melle',
  'oru',
  'minnaminunginte',
  'nurungu',
  'vettam',
  'set',
  'callertune',
  'caller',
  'press',
  'copy',
  'friend',
  'callertune'],
 ['winner',
  'valued',
  'network',
  'customer',
  'selected',
  'receivea',
  'prize',
  'reward',
  'claim',
  'call',
  'claim',
  'code'

In [26]:
# Remove empty docs
valid_indices = [i for i, tokens in enumerate(processed_texts) if len(tokens) > 0]
processed_texts = [processed_texts[i] for i in valid_indices]
df_filtered = df.iloc[valid_indices].reset_index(drop=True)

In [27]:
df_filtered

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5527,spam,This is the 2nd time we have tried 2 contact u...
5528,ham,Will ü b going to esplanade fr home?
5529,ham,"Pity, * was in mood for that. So...any other s..."
5530,ham,The guy did some bitching but I acted like i'd...


In [28]:
import numpy as np

In [29]:
def avg_word2vec_pretrained(tokens):
  """
  Calculate average word2vec using pre-trained model with better OOV handling
  """
  vectors = []
  for word in tokens:
    if word in wv:
      vectors.append(wv[word])
  if vectors:
    return np.mean(vectors, axis=0)
  else:
    # Return zero vector for documents with no known words
    return np.zeros(300)

In [30]:
# Additional feature extraction
def extract_additional_features(text):
  """
  Extract additional features from raw text
  """
  features = {}
  features['length'] = len(text)
  features['num_words'] = len(text.split())
  features['num_uppercase'] = sum(1 for c in text if c.isupper())
  features['num_digits'] = sum(1 for c in text if c.isdigit())
  features['num_special_chars'] = len(re.findall(r'[!@#$%^&*(),.?":{}|<>]', text))
  features['has_url'] = 1 if 'http' in text.lower() or 'www' in text.lower() else 0
  features['has_money_symbol'] = 1 if any(symbol in text for symbol in ['$', '£', '€', '₹']) else 0
  features['exclamation_ratio'] = text.count('!') / len(text) if len(text) > 0 else 0

  return list(features.values())

In [31]:
from tqdm import tqdm

In [34]:
# Create word2vec features

In [32]:
X_w2v = []
for tokens in tqdm(processed_texts, desc = "Processing word2vec"):
  X_w2v.append(avg_word2vec_pretrained(tokens=tokens))

Processing word2vec: 100%|██████████| 5532/5532 [00:00<00:00, 16354.56it/s]


In [33]:
X_w2v = np.array(X_w2v)

In [35]:
# Create additional features
X_additional = []
for i in tqdm(range(len(df_filtered)), desc="Processing additional features"):
    additional_features = extract_additional_features(df_filtered["message"].iloc[i])
    X_additional.append(additional_features)

Processing additional features: 100%|██████████| 5532/5532 [00:00<00:00, 41016.10it/s]


In [36]:
X_additional = np.array(X_additional)

In [37]:
# Combine all features
X_combined = np.hstack([X_w2v, X_additional])

In [38]:
X_combined.shape

(5532, 308)

In [41]:
# Prepare target variable
y = (df_filtered["label"] == "spam").astype(int)

In [42]:
# checking the target distribution
np.bincount(y)

array([4786,  746])

In [43]:
# Checking for NaN values
nan_count = np.isnan(X_combined).sum()
if nan_count > 0:
  print(f"Found {nan_count} NaN values, replacing with zeros...")
  X_combined = np.nan_to_num(X_combined)

In [44]:
# Split the data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_combined, y, test_size=0.2, random_state=42, stratify=y # use stratify=y to maintain class distribution in train/test split
)

In [45]:
# Scale the features
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

In [46]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [47]:
# Try multiple classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [48]:
classifiers = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'SVM': SVC(kernel='rbf', random_state=42, probability=True)
}

In [49]:
results = {}

In [50]:
for name, classifier in classifiers.items():
  print(f"\n--- {name} ---")

  # Train
  classifier.fit(X_train_scaled, y_train)

  # Predict
  y_pred = classifier.predict(X_test_scaled)

  # Evaluate
  accuracy = accuracy_score(y_test, y_pred)
  results[name] = accuracy

  print(f"Accuracy: {accuracy:.4f}")
  print("\nClassification Report:")
  print(classification_report(y_test, y_pred, target_names=['Ham', 'Spam']))


--- Random Forest ---
Accuracy: 0.9828

Classification Report:
              precision    recall  f1-score   support

         Ham       0.98      1.00      0.99       958
        Spam       0.99      0.88      0.93       149

    accuracy                           0.98      1107
   macro avg       0.99      0.94      0.96      1107
weighted avg       0.98      0.98      0.98      1107


--- Logistic Regression ---
Accuracy: 0.9756

Classification Report:
              precision    recall  f1-score   support

         Ham       0.99      0.98      0.99       958
        Spam       0.89      0.94      0.91       149

    accuracy                           0.98      1107
   macro avg       0.94      0.96      0.95      1107
weighted avg       0.98      0.98      0.98      1107


--- SVM ---
Accuracy: 0.9901

Classification Report:
              precision    recall  f1-score   support

         Ham       0.99      1.00      0.99       958
        Spam       0.99      0.94      0.96      

In [51]:
print("\n" + "="*50)
print("FINAL RESULTS SUMMARY")
print("="*50)
for name, accuracy in results.items():
    print(f"{name}: {accuracy:.4f}")


FINAL RESULTS SUMMARY
Random Forest: 0.9828
Logistic Regression: 0.9756
SVM: 0.9901


In [52]:
# Select best model and show confusion matrix
best_model_name = max(results, key=results.get)
best_model = classifiers[best_model_name]

In [53]:
print(f"\nBest Model: {best_model_name} (Accuracy: {results[best_model_name]:.4f})")


Best Model: SVM (Accuracy: 0.9901)


In [54]:
# Retrain best model and show confusion matrix
best_model.fit(X_train_scaled, y_train)
y_pred_best = best_model.predict(X_test_scaled)

print("\nConfusion Matrix for Best Model:")
cm = confusion_matrix(y_test, y_pred_best)
print(cm)


Confusion Matrix for Best Model:
[[956   2]
 [  9 140]]


In [55]:
# Feature importance (if available)
if hasattr(best_model, 'feature_importances_'):
    print(f"\nTop 10 Most Important Features ({best_model_name}):")
    feature_names = [f'w2v_{i}' for i in range(300)] + [
        'length', 'num_words', 'num_uppercase', 'num_digits',
        'num_special_chars', 'has_url', 'has_money_symbol', 'exclamation_ratio'
    ]

    feature_importance = list(zip(feature_names, best_model.feature_importances_))
    feature_importance.sort(key=lambda x: x[1], reverse=True)

    for feature, importance in feature_importance[:10]:
        print(f"{feature}: {importance:.4f}")

In [56]:
# Test on some sample messages
def predict_message(text, model=best_model, scaler=scaler):
  """
  Predict if a message is spam or ham
  """
  tokens = preprocess_text(text)
  w2v_features = avg_word2vec_pretrained(tokens=tokens)
  additional_features = extract_additional_features(text)
  combined_features = np.hstack([w2v_features, additional_features])
  combined_features_scaled = scaler.transform(combined_features.reshape(1, -1))

  # Predict
  prediction = model.predict(combined_features_scaled)[0]
  probability = model.predict_proba(combined_features_scaled)[0]

  return "Spam" if prediction == 1 else "Ham", probability

In [57]:
test_messages = [
    "Free! call 08712460324 now! claim your £150 worth of discount vouchers!",
    "Hey, are you free for lunch today?",
    "URGENT! Win £1000 cash! Call now!",
    "Can you pick up milk on your way home?"
]

In [58]:
print("\n" + "="*50)
print("TESTING ON SAMPLE MESSAGES")
print("="*50)

for msg in test_messages:
    prediction, prob = predict_message(msg)
    print(f"\nMessage: '{msg[:50]}...' " if len(msg) > 50 else f"\nMessage: '{msg}'")
    print(f"Prediction: {prediction}")
    print(f"Probabilities: Ham={prob[0]:.3f}, Spam={prob[1]:.3f}")


TESTING ON SAMPLE MESSAGES

Message: 'Free! call 08712460324 now! claim your £150 worth ...' 
Prediction: Spam
Probabilities: Ham=0.000, Spam=1.000

Message: 'Hey, are you free for lunch today?'
Prediction: Ham
Probabilities: Ham=0.985, Spam=0.015

Message: 'URGENT! Win £1000 cash! Call now!'
Prediction: Spam
Probabilities: Ham=0.149, Spam=0.851

Message: 'Can you pick up milk on your way home?'
Prediction: Ham
Probabilities: Ham=0.999, Spam=0.001
