# SMS Spam Detection: NLP Text Classification

**Author:** Tharun Ponnam  
**Email:** tharunponnam007@gmail.com  
**Dataset:** UCI SMS Spam Collection

This notebook demonstrates a complete text classification pipeline using:
- **NLTK** for natural language processing
- **Scikit-Learn** for machine learning models
- **Pandas** for data manipulation

---

## 1. Environment Setup

In [None]:
import sys
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import auc, roc_curve

sys.path.insert(0, '..')
from src.classifier import WordsClassifier, compare_models
from src.data_loader import DataLoader
from src.preprocessing import TextPreprocessor
from src.feature_extraction import FeatureExtractor
from src.model_trainer import ModelTrainer
from src.visualization import (
    plot_confusion_matrix, plot_roc_curve, plot_class_distribution,
    plot_text_length_distribution, plot_wordcloud, plot_model_comparison
)

# Notebook-wide display settings: show up to 100 characters per DataFrame cell
# and use the seaborn whitegrid look for every matplotlib figure.
pd.options.display.max_colwidth = 100
plt.style.use('seaborn-v0_8-whitegrid')

print("Environment ready!")

## 2. Data Loading & Exploration

In [None]:
# Load the SMS spam corpus through the project loader, then report its
# shape, column names and per-column dtypes.
df = DataLoader.load_sms_spam()

column_names = df.columns.tolist()
print(
    f"Dataset Shape: {df.shape}",
    f"\nColumns: {column_names}",
    f"\nData Types:\n{df.dtypes}",
    sep="\n",
)

In [None]:
# Preview the first ten rows of the loaded frame (rendered as the cell output).
df.head(10)

In [None]:
# Summarise the corpus: sample count, number of classes, per-class share and
# character-length statistics, all taken from the loader's stats dictionary.
stats = DataLoader.get_dataset_stats(df)
total_samples = stats['total_samples']
length_stats = stats['text_length']

print("Dataset Statistics:")
print(f"  Total Samples: {total_samples:,}")
print(f"  Classes: {stats['n_classes']}")

print("\nLabel Distribution:")
for label, count in stats['label_distribution'].items():
    # Label 0 is reported as Ham; any other label value is reported as Spam.
    label_name = 'Ham' if label == 0 else 'Spam'
    pct = count / total_samples * 100
    print(f"  {label_name}: {count:,} ({pct:.1f}%)")

print("\nText Length (characters):")
# Mean/std are shown with one decimal; min/max use their default formatting.
for caption, key, spec in (
    ("Mean:", 'mean', '.1f'),
    ("Std: ", 'std', '.1f'),
    ("Min: ", 'min', ''),
    ("Max: ", 'max', ''),
):
    print(f"  {caption} {length_stats[key]:{spec}}")

In [None]:
# Two-panel overview of the corpus:
#   left  - number of messages per class
#   right - message length (characters) per class
#
# Both panels are drawn on the axes created here, and the figure is saved via
# `fig`, so the output does not depend on pyplot's "current figure" state.
# The imported `plot_class_distribution` helper is intentionally not called:
# the left panel below draws the same chart, and calling both either wastes
# work or (if the helper opens its own figure) redirects the later drawing.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_counts, ax_lengths = axes

class_names = ['Ham', 'Spam']
colors = ['#3b82f6', '#ef4444']

# Left panel: class balance. sort_index() orders the counts by label value so
# they line up with class_names (label 0 -> Ham, label 1 -> Spam).
counts = df['label'].value_counts().sort_index()
ax_counts.bar(class_names, counts.values, color=colors)
ax_counts.set_title('Class Distribution', fontsize=14, fontweight='bold')
ax_counts.set_ylabel('Count')
for i, v in enumerate(counts.values):
    # Place the formatted count slightly above the top of each bar.
    ax_counts.text(i, v + 50, f'{v:,}', ha='center', fontweight='bold')

# Right panel: overlaid length histograms, one per class.
# Adds a 'text_length' column to df (kept for any later cell that reads it).
df['text_length'] = df['text'].str.len()
for label_value, class_name, color in zip((0, 1), class_names, colors):
    df.loc[df['label'] == label_value, 'text_length'].hist(
        bins=50, alpha=0.6, label=class_name, color=color, ax=ax_lengths
    )
ax_lengths.set_xlabel('Text Length (characters)')
ax_lengths.set_ylabel('Frequency')
ax_lengths.set_title('Text Length Distribution by Class', fontsize=14, fontweight='bold')
ax_lengths.legend()

fig.tight_layout()
fig.savefig('../assets/screenshots/data_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

## 3. Text Preprocessing

In [None]:
# Build the text preprocessor (stopword removal + lemmatization enabled) and
# show what each stage produces for one spam-like example message.
preprocessor = TextPreprocessor(remove_stopwords=True, use_lemmatization=True)

sample_text = "URGENT! You have WON a $1000 prize. Call NOW at 1-800-555-1234 to claim!!!"

# (heading, transform) pairs; each transform is applied to the raw sample only
# after its heading has been printed.
demo_stages = [
    ("Original Text:", lambda text: text),
    ("\nCleaned Text:", preprocessor.clean_text),
    ("\nTokens:", preprocessor.tokenize),
    ("\nProcessed:", preprocessor.preprocess),
]

for heading, transform in demo_stages:
    print(heading)
    print(f"  {transform(sample_text)}")

In [None]:
# Print the part-of-speech tag assigned to each token of the sample message,
# with the token left-aligned in a 15-character column.
print("\nPOS Tags:")
pos_tags = preprocessor.get_pos_tags(sample_text)
for pair in pos_tags:
    token, tag = pair
    print("  {:15} -> {}".format(token, tag))

In [None]:
# Run the preprocessor over every raw message and store the result in a new
# 'processed_text' column (this adds a column to df in place).
df['processed_text'] = preprocessor.batch_preprocess(df['text'].tolist(), show_progress=True)

In [None]:
# Show raw text, processed text and label side by side for the first five rows.
df[['text', 'processed_text', 'label']].head()

## 4. Feature Extraction

In [None]:
# Vectorise the processed messages with TF-IDF (unigrams + bigrams, at most
# 5000 features, terms must appear in at least 2 documents) and report the
# size and sparsity of the resulting matrix.
extractor = FeatureExtractor(
    vectorizer_type='tfidf', max_features=5000, ngram_range=(1, 2), min_df=2
)

processed_corpus = df['processed_text'].tolist()
features = extractor.fit_transform(processed_corpus)

n_documents, n_terms = features.shape
# Fraction of matrix cells that hold a stored (non-zero) value.
density = features.nnz / (n_documents * n_terms)

print(f"Feature Matrix Shape: {features.shape}")
print(f"Vocabulary Size: {extractor.vocabulary_size:,}")
print(f"Sparsity: {1 - density:.2%}")

In [None]:
# List the highest-scoring features for each class.
# NOTE(review): 15 features per class are requested but only the first 10 are
# printed — confirm which number is intended.
top_features = extractor.get_top_features(features, df['label'].values, n_top=15)

print("Top Features by Class:\n")
for label, feats in top_features.items():
    label_name = 'Ham' if label == 0 else 'Spam'
    rows = [f"  {feat:20} {score:.4f}" for feat, score in feats[:10]]
    # Heading, one row per feature, then a blank separator line.
    print("\n".join([f"{label_name}:", *rows, ""]))

## 5. Model Training & Evaluation

In [None]:
# Hold out 20% of the data for testing (fixed seed so the split is repeatable)
# and report the size of each partition.
splits = DataLoader.split_data(df, test_size=0.2, random_state=42)
(X_train, y_train), (X_test, y_test) = splits['train'], splits['test']

for caption, subset in (("Training", X_train), ("Test", X_test)):
    print(f"{caption} samples: {len(subset):,}")

### 5.1 Naive Bayes Classifier

In [None]:
# Naive Bayes on TF-IDF features (vocabulary capped at 5000 terms):
# fit on the training split, then evaluate on the held-out test split.
nb_config = dict(model_type='naive_bayes', vectorizer_type='tfidf', max_features=5000)
nb_classifier = WordsClassifier(**nb_config)

nb_classifier.fit(X_train.tolist(), y_train.values)

nb_metrics = nb_classifier.evaluate(X_test.tolist(), y_test.values)

### 5.2 Support Vector Machine

In [None]:
# Support vector machine on TF-IDF features (vocabulary capped at 5000 terms):
# fit on the training split, then evaluate on the held-out test split.
svm_config = dict(model_type='svm', vectorizer_type='tfidf', max_features=5000)
svm_classifier = WordsClassifier(**svm_config)

svm_classifier.fit(X_train.tolist(), y_train.values)

svm_metrics = svm_classifier.evaluate(X_test.tolist(), y_test.values)

### 5.3 Random Forest

In [None]:
# Random forest (100 trees) on TF-IDF features (vocabulary capped at 5000
# terms): fit on the training split, then evaluate on the held-out test split.
rf_config = dict(
    model_type='random_forest',
    vectorizer_type='tfidf',
    max_features=5000,
    n_estimators=100,
)
rf_classifier = WordsClassifier(**rf_config)

rf_classifier.fit(X_train.tolist(), y_train.values)

rf_metrics = rf_classifier.evaluate(X_test.tolist(), y_test.values)

### 5.4 Model Comparison

In [None]:
# Compare candidate models with 5-fold cross-validation on the TRAINING split
# only. The winner of this comparison is later refit and scored on
# X_test / y_test, so the test rows must not take part in model selection;
# running the comparison on the full dataset would leak them into that choice
# and make the final test metrics optimistic.
# Uses the same inputs (X_train.tolist(), y_train.values) that the individual
# classifiers above are fitted on.
results = compare_models(
    X_train.tolist(),
    y_train.values,
    models=['naive_bayes', 'complement_nb', 'svm', 'logistic', 'random_forest'],
    cv=5
)

In [None]:
# Chart the cross-validation results for all compared models and save the
# figure that is current after the helper has drawn.
comparison_title = 'Model Performance Comparison (5-Fold CV)'
fig = plot_model_comparison(results, title=comparison_title)

plt.savefig('../assets/screenshots/model_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

## 6. Best Model Analysis

In [None]:
# Refit the top-ranked model on the training split and collect its test-set
# predictions for the diagnostics that follow.
# NOTE(review): takes the FIRST row of `results` as the best model — assumes
# compare_models returns rows sorted best-first; confirm.
best_model = results['model'].iloc[0]
print(f"Best Model: {best_model}")

classifier = WordsClassifier(model_type=best_model)
classifier.fit(X_train.tolist(), y_train.values)

# Hard class predictions, and the probability of the positive (spam) class
# taken from column 1 of predict_proba.
# NOTE(review): presumably every selectable model type supports predict_proba
# through WordsClassifier — verify for 'svm'.
y_pred = classifier.predict(X_test.tolist())
spam_column = 1
y_proba = classifier.predict_proba(X_test.tolist())[:, spam_column]

In [None]:
# Two-panel diagnostics for the best model on the held-out test split:
#   left  - confusion matrix of hard predictions
#   right - ROC curve built from the predicted spam probabilities
#
# Each panel is drawn on an explicit Axes and the figure is saved via `fig`,
# so nothing depends on pyplot's implicit "current axes". `roc_curve` and
# `auc` are imported in the notebook's import cell.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_cm, ax_roc = axes
class_names = ['Ham', 'Spam']

# Left panel: rows are actual classes, columns are predicted classes.
cm = confusion_matrix(y_test.values, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, ax=ax_cm)
ax_cm.set_xlabel('Predicted')
ax_cm.set_ylabel('Actual')
ax_cm.set_title('Confusion Matrix', fontsize=14, fontweight='bold')

# Right panel: ROC curve with its area, plus the diagonal chance line.
# The thresholds returned by roc_curve are not needed; a named throwaway is
# used instead of `_`, which IPython reserves for the previous cell output.
fpr, tpr, _thresholds = roc_curve(y_test.values, y_proba)
roc_auc = auc(fpr, tpr)
ax_roc.plot(fpr, tpr, color='#2563eb', lw=2, label=f'ROC (AUC = {roc_auc:.4f})')
ax_roc.plot([0, 1], [0, 1], '--', color='gray')
ax_roc.fill_between(fpr, tpr, alpha=0.2, color='#2563eb')
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')
ax_roc.set_title('ROC Curve', fontsize=14, fontweight='bold')
ax_roc.legend(loc='lower right')

fig.tight_layout()
fig.savefig('../assets/screenshots/model_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

## 7. Word Clouds

In [None]:
# Side-by-side word clouds of the processed ham and spam messages.
# `WordCloud` is imported in the notebook's import cell.
ham_texts = df[df['label'] == 0]['processed_text'].tolist()
spam_texts = df[df['label'] == 1]['processed_text'].tolist()


def build_wordcloud(texts, colormap):
    """Return a 800x400 white-background word cloud of at most 100 words.

    Args:
        texts: list of message strings; they are joined with single spaces.
        colormap: matplotlib colormap name used to colour the words.

    The layout is seeded (random_state=42) so re-running the notebook
    reproduces the same image instead of a new random arrangement.
    """
    return WordCloud(
        width=800, height=400, background_color='white',
        colormap=colormap, max_words=100, random_state=42,
    ).generate(' '.join(texts))


# Only the first 1000 ham messages are used; all spam messages are used.
wc_ham = build_wordcloud(ham_texts[:1000], 'Greens')
wc_spam = build_wordcloud(spam_texts, 'Reds')

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
for ax, cloud, title in zip(axes, (wc_ham, wc_spam), ('Ham Messages', 'Spam Messages')):
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(title, fontsize=16, fontweight='bold')

fig.tight_layout()
fig.savefig('../assets/screenshots/word_clouds.png', dpi=150, bbox_inches='tight')
plt.show()

## 8. Real-time Prediction Demo

In [None]:
# Classify a handful of hand-written messages with the best model and print
# the predicted label with its confidence.
test_messages = [
    "Congratulations! You've won a free vacation. Call now to claim!",
    "Hey, can we meet for coffee tomorrow at 3pm?",
    "URGENT: Your bank account has been compromised. Click here immediately.",
    "Thanks for your help with the project. Really appreciate it!",
    "FREE ENTRY! Win £1000 cash. Text WIN to 80808 now!"
]

# Longest message prefix shown in the printed preview.
preview_width = 60

print("Real-time Classification Demo\n" + "="*50)

for msg in test_messages:
    # The classifier takes a list of messages; take the single result back out.
    result = classifier.predict_with_confidence([msg])[0]
    label = "SPAM 🚨" if result['label'] == 1 else "HAM ✅"
    conf = result['confidence'] * 100

    # Add an ellipsis only when the message was actually truncated.
    preview = msg if len(msg) <= preview_width else f"{msg[:preview_width]}..."

    print(f"\nMessage: {preview}")
    print(f"  → {label} (Confidence: {conf:.1f}%)")

## 9. Save Best Model

In [None]:
# Persist the fitted best classifier to disk so it can be reloaded later.
# NOTE(review): presumably the '../models' directory must already exist — confirm.
classifier.save('../models/best_classifier.pkl')
print("Model saved successfully!")

In [None]:
# Round-trip check: reload the saved classifier and classify one message.
# NOTE(review): the .pkl extension suggests pickle-based loading — only load
# model files from a trusted source; confirm against WordsClassifier.load.
loaded_classifier = WordsClassifier.load('../models/best_classifier.pkl')

test_msg = "Free prize waiting for you!"
pred = loaded_classifier.predict([test_msg])[0]
verdict = 'SPAM' if pred == 1 else 'HAM'
print(f"Test prediction: {verdict}")

---

## Summary

This project demonstrates a complete text classification pipeline:

1. **Data Loading**: UCI SMS Spam Collection (5,574 messages)
2. **Preprocessing**: Tokenization, lemmatization, stopword removal
3. **Feature Extraction**: TF-IDF with n-grams
4. **Model Training**: Multiple algorithms compared
5. **Evaluation**: Cross-validation, confusion matrix, ROC curves

**Best Model Performance:**
- Accuracy: ~97%
- F1-Score: ~96%
- ROC-AUC: ~99%

---

**Author:** Tharun Ponnam  
**GitHub:** [@tharun-ship-it](https://github.com/tharun-ship-it)