# Install required packages:

In [1]:
!pip install plotly nltk scikit-learn pandas numpy matplotlib seaborn



# Setup
First, we import all necessary libraries and set up the environment.

In [24]:
# Standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import re
import joblib

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# Text Processing Libraries
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Machine Learning Libraries
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, auc

# Visualization Libraries
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Google Colab specific library for file handling
from google.colab import files

# Download required NLTK data packs for text processing.
# Each resource is checked on its own, so a single missing pack (including
# 'omw-1.4') is fetched without depending on the state of the others.
nltk_resources = {
    'punkt': 'tokenizers/punkt',
    'stopwords': 'corpora/stopwords',
    'wordnet': 'corpora/wordnet',
    'omw-1.4': 'corpora/omw-1.4',
}

failed_downloads = []
for package_name, resource_path in nltk_resources.items():
    try:
        nltk.data.find(resource_path)
    except LookupError:
        # nltk.download reports failure through its return value rather than raising
        if not nltk.download(package_name, quiet=True):
            failed_downloads.append(package_name)

if failed_downloads:
    print(f"⚠️ Could not download NLTK data: {', '.join(failed_downloads)}")
else:
    print("✅ All libraries imported and NLTK data is ready.")
print("=" * 60)




✅ All libraries imported and NLTK data is ready.


# Data Upload
This cell opens Colab's file-upload dialog so that the dataset CSV file can be uploaded directly into the Colab environment.

In [30]:

# Upload your dataset
print("Please upload your 'spam_dataset.csv' file:")
uploaded = files.upload()

# files.upload() returns a dict keyed by file name; it is empty when the
# dialog is cancelled, so fail with a clear message instead of an IndexError.
if not uploaded:
    raise RuntimeError("No file was uploaded. Re-run this cell and select the dataset CSV file.")

# If several files were selected, the first one is used as the dataset.
file_path = next(iter(uploaded))
print(f"\n📄 '{file_path}' uploaded successfully!")

Please upload your 'spam_dataset.csv' file:


Saving Dataset.csv to Dataset.csv

📄 'Dataset.csv' uploaded successfully!


# **Block 2: Data Loading and Exploratory Data Analysis (EDA)**

Here, I load the uploaded CSV file into a pandas DataFrame. Then I perform a detailed exploration to understand the dataset's structure, class distribution (spam vs. ham), and basic text statistics.

In [31]:
def load_and_explore_data(file_path):
    """Load the spam dataset from a CSV file and print a short exploratory summary.

    Only the first two columns of the file are kept; they are renamed to
    ``label`` and ``message``. Labels are converted to integers
    (``'spam'`` -> 1, ``'ham'`` -> 0) after trimming surrounding whitespace and
    lower-casing, so values such as ``' Spam'`` or ``'HAM'`` are accepted too.

    Args:
        file_path: Path of the CSV file. It is read with ``latin-1`` encoding.

    Returns:
        pandas.DataFrame with the columns ``label`` (0 or 1) and ``message``.

    Raises:
        ValueError: If the file has fewer than two columns, or if any label is
            neither ``'spam'`` nor ``'ham'``. Without this check such labels
            would silently become NaN and only fail later during training.
    """
    print("\n DATA LOADING & EXPLORATION")
    print("-" * 40)

    # Load the dataset
    raw_df = pd.read_csv(file_path, encoding='latin-1')
    if raw_df.shape[1] < 2:
        raise ValueError(
            f"Expected at least 2 columns (label, message) but found {raw_df.shape[1]}."
        )

    # Standardize the column names and format.
    # .copy() makes the two-column selection an independent frame, so the
    # assignments below never write into a view of the raw data.
    df = raw_df.iloc[:, [0, 1]].copy()
    df.columns = ['label', 'message']

    raw_labels = df['label']
    normalized_labels = raw_labels.map(
        lambda value: value.strip().lower() if isinstance(value, str) else value
    )
    df['label'] = normalized_labels.map({'spam': 1, 'ham': 0})

    # Anything that did not map to 0/1 is reported immediately.
    unmapped = df['label'].isna()
    if unmapped.any():
        examples = raw_labels[unmapped].unique()[:5].tolist()
        raise ValueError(
            f"{int(unmapped.sum())} row(s) have a label other than 'spam' or 'ham'; "
            f"examples: {examples}"
        )
    df['label'] = df['label'].astype(int)

    # Display basic dataset information and a sample
    print(" First 5 rows of the dataset:")
    display(df.head())
    print(f"\n Dataset Shape: {df.shape}")
    print(f" Missing Values: {df.isnull().sum().sum()}")
    print("\n Class Distribution:")
    print(df['label'].value_counts(normalize=True))

    return df

# Execute the function: load the uploaded file into `df` and print the exploration summary
df = load_and_explore_data(file_path)


 DATA LOADING & EXPLORATION
----------------------------------------
 First 5 rows of the dataset:


Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."



 Dataset Shape: (5572, 2)
 Missing Values: 340

 Class Distribution:
label
0    0.865937
1    0.134063
Name: proportion, dtype: float64


# **Advanced Text Preprocessing and Feature Engineering**
This is a critical step where I clean the raw email text and engineer new, informative features. The AdvancedTextPreprocessor class handles tasks like removing URLs, converting text to lowercase, and lemmatization. It also extracts numerical features like message length and the ratio of capital letters, which are strong indicators of spam.

In [32]:
class AdvancedTextPreprocessor:
    """Text cleaning and hand-crafted numerical feature extraction for messages.

    ``extract_features`` computes simple counts and ratios from the raw text,
    while ``clean_text`` produces a lower-cased, lemmatized, stop-word-free
    version of the text. Both methods treat missing values (None / NaN) as an
    empty message and convert any other non-string value with ``str()``.
    """

    # Patterns are compiled once for the class instead of on every call.
    # `http\S+` already covers `https...`, so no separate https alternative is needed.
    _URL_PATTERN = re.compile(r'http\S+|www\S+')
    _NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z\s]')
    # The leading \b requires a keyword to start a word, so "win" is no longer
    # counted inside words such as "knowing", "following" or "twin".
    # Inflections ("winner", "prizes") still match; so do other words that
    # merely start with a keyword ("window", "freedom").
    _MONEY_PATTERN = re.compile(r'[$£€¥₹]|\b(?:money|cash|prize|win|free)')

    def __init__(self):
        """Create the lemmatizer, load English stop words, and ensure the punkt_tab tokenizer data."""
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
        # Download the missing punkt_tab tokenizer
        try:
            nltk.data.find('tokenizers/punkt_tab')
        except LookupError:
            nltk.download('punkt_tab', quiet=True)

    @staticmethod
    def _as_text(text):
        """Return ``text`` as a string; missing values become the empty string."""
        return "" if pd.isna(text) else str(text)

    def extract_features(self, text):
        """Extract custom numerical features from a single text value.

        Args:
            text: The raw message. None/NaN is treated as an empty message.

        Returns:
            dict with the keys ``length`` (characters), ``word_count``
            (whitespace-separated tokens), ``capital_ratio`` (upper-case
            characters divided by length, 0 for empty text), ``digit_count``
            and ``currency_mentions`` (currency symbols plus money-related
            keywords found at the start of a word, case-insensitive).
        """
        text = self._as_text(text)
        length = len(text)
        uppercase_count = sum(1 for char in text if char.isupper())
        return {
            'length': length,
            'word_count': len(text.split()),
            'capital_ratio': uppercase_count / length if text else 0,
            'digit_count': sum(1 for char in text if char.isdigit()),
            'currency_mentions': len(self._MONEY_PATTERN.findall(text.lower()))
        }

    def clean_text(self, text):
        """Apply the full cleaning pipeline to a single text value.

        Steps: lower-case, strip URLs, replace every non-letter character with
        a space, tokenize, drop stop words and tokens of two characters or
        fewer, lemmatize the remaining tokens, and join them with single spaces.

        Args:
            text: The raw message. None/NaN yields an empty string.

        Returns:
            The cleaned text as a single space-separated string.
        """
        text = self._as_text(text)
        if not text:
            return ""
        text = text.lower()
        text = self._URL_PATTERN.sub('', text)
        text = self._NON_LETTER_PATTERN.sub(' ', text)
        tokens = word_tokenize(text)
        kept_tokens = [
            self.lemmatizer.lemmatize(word)
            for word in tokens
            if word not in self.stop_words and len(word) > 2
        ]
        return ' '.join(kept_tokens)

def preprocess_data(df):
    """Apply text cleaning and feature engineering to every row of a DataFrame.

    The input frame is not modified. The returned frame contains the input
    columns, a ``clean_message`` column, and one column per custom feature
    returned by ``AdvancedTextPreprocessor.extract_features``.

    Args:
        df: DataFrame with a ``message`` column holding the raw text.

    Returns:
        A new pandas.DataFrame with the same index as ``df`` and the added
        ``clean_message`` and feature columns.
    """
    print("\n ADVANCED TEXT PREPROCESSING & FEATURE ENGINEERING")
    print("-" * 40)

    preprocessor = AdvancedTextPreprocessor()

    # 1. Extract numerical features.
    #    The feature frame reuses df's index so that the column-wise concat
    #    below lines rows up correctly even when df has a non-default index
    #    (e.g. after filtering or a train/test split).
    print("Extracting custom numerical features...")
    feature_rows = df['message'].apply(preprocessor.extract_features).tolist()
    feature_df = pd.DataFrame(feature_rows, index=df.index)

    # 2. Clean the text message.
    #    assign() returns a new frame, leaving the caller's DataFrame untouched.
    #    Feature columns left over from a previous run are dropped first so
    #    that re-running on processed data does not create duplicate columns.
    print("Cleaning text data...")
    base_df = df.drop(columns=list(feature_df.columns), errors='ignore')
    base_df = base_df.assign(clean_message=df['message'].apply(preprocessor.clean_text))

    # 3. Combine original data with new features
    processed_df = pd.concat([base_df, feature_df], axis=1)

    print(f" Preprocessing complete! Added {len(feature_df.columns)} custom features.")
    print(" First 5 rows of the processed data:")
    display(processed_df.head())

    return processed_df

# Execute the function: build `processed_df` (cleaned text plus custom numerical features) from `df`
processed_df = preprocess_data(df)


 ADVANCED TEXT PREPROCESSING & FEATURE ENGINEERING
----------------------------------------
Extracting custom numerical features...
Cleaning text data...
 Preprocessing complete! Added 5 custom features.
 First 5 rows of the processed data:


Unnamed: 0,label,message,clean_message,length,word_count,capital_ratio,digit_count,currency_mentions
0,0,"Go until jurong point, crazy.. Available only ...",jurong point crazy available bugis great world...,111,20,0.027027,0,0
1,0,Ok lar... Joking wif u oni...,lar joking wif oni,29,6,0.068966,0,0
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win cup final tkts may te...,155,28,0.064516,25,2
3,0,U dun say so early hor... U c already then say...,dun say early hor already say,49,11,0.040816,0,0
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think go usf life around though,61,13,0.032787,0,0


# **Model Training with Integrated Pipeline and Hyperparameter Tuning**
This block is the core of my project. Here I define a ColumnTransformer to apply different processing steps to my text and numerical features simultaneously. This is then wrapped in a Pipeline with a classifier. Finally, I use GridSearchCV to test multiple models and their parameters to automatically find the best-performing combination.

In [11]:
print("\n MODEL TRAINING & HYPERPARAMETER TUNING")
print("-" * 40)

# 1. Define feature sets and split the data.
#    The target column is removed from X so it can never reach a transformer
#    or classifier as an input, regardless of the ColumnTransformer settings.
y = processed_df['label']
X = processed_df.drop(columns=['label'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Define which columns are numerical and which is text
numerical_features = ['length', 'word_count', 'capital_ratio', 'digit_count', 'currency_mentions']
text_feature = 'clean_message'

# 2. Create the preprocessing pipeline using ColumnTransformer
# This applies a TF-IDF vectorizer to the text and scales the numerical features
preprocessor = ColumnTransformer(
    transformers=[
        ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1, 2)), text_feature),
        ('scaler', MinMaxScaler(), numerical_features)
    ],
    remainder='drop' # Drop columns that are not specified (e.g. the raw 'message' text)
)

# 3. Define models and their hyperparameter grids for tuning.
#    Grid keys use the 'classifier__' prefix because the estimator is the
#    pipeline step named 'classifier'.
models_to_tune = {
    'Logistic Regression': (LogisticRegression(max_iter=1000, random_state=42), {
        'classifier__C': [0.1, 1.0, 10.0]
    }),
    'Random Forest': (RandomForestClassifier(random_state=42), {
        'classifier__n_estimators': [100, 200],
        'classifier__max_depth': [10, 20, None]
    }),
    # probability=True is required because later cells call predict_proba
    'SVM': (SVC(probability=True, random_state=42), {
        'classifier__C': [1.0, 10.0],
        'classifier__kernel': ['linear', 'rbf']
    })
}

best_estimators = {}
print("Starting hyperparameter tuning for multiple models...")

# 4. Loop through models, create a pipeline, and run GridSearchCV
for name, (model, params) in models_to_tune.items():
    print(f"\n-- Tuning {name} --")

    # Create the full pipeline
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])

    # Perform grid search with 3-fold cross-validation.
    # NOTE(review): accuracy is the selection metric; the class distribution
    # printed during data exploration is roughly 87% ham / 13% spam, so a
    # spam-focused metric such as F1 may be worth considering.
    grid_search = GridSearchCV(pipeline, params, cv=3, n_jobs=-1, scoring='accuracy', verbose=1)
    grid_search.fit(X_train, y_train)

    # Store the best performing model
    best_estimators[name] = {
        'estimator': grid_search.best_estimator_,
        'score': grid_search.best_score_,
        'params': grid_search.best_params_
    }

    print(f"Best Score for {name}: {grid_search.best_score_:.4f}")
    print(f"Best Parameters: {grid_search.best_params_}")

# 5. Identify and select the overall best model (highest cross-validated score)
best_model_name = max(best_estimators, key=lambda model_name: best_estimators[model_name]['score'])
best_model = best_estimators[best_model_name]['estimator']
print(f"\n Overall Best Model: {best_model_name} with accuracy of {best_estimators[best_model_name]['score']:.4f}")


 MODEL TRAINING & HYPERPARAMETER TUNING
----------------------------------------
Starting hyperparameter tuning for multiple models...

-- Tuning Logistic Regression --
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Best Score for Logistic Regression: 0.9744
Best Parameters: {'classifier__C': 10.0}

-- Tuning Random Forest --
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Best Score for Random Forest: 0.9735
Best Parameters: {'classifier__max_depth': None, 'classifier__n_estimators': 200}

-- Tuning SVM --
Fitting 3 folds for each of 4 candidates, totalling 12 fits
Best Score for SVM: 0.9769
Best Parameters: {'classifier__C': 1.0, 'classifier__kernel': 'linear'}

 Overall Best Model: SVM with accuracy of 0.9769


 # **Detailed Evaluation of the Best Model**
Now that I have my champion model, I evaluate its performance on the unseen test data. I'll look at the classification report (which includes precision, recall, and F1-score) and the confusion matrix to understand its strengths and weaknesses.

In [12]:
print("\n DETAILED EVALUATION OF THE BEST MODEL")
print("-" * 40)

# Make predictions on the test set
y_pred = best_model.predict(X_test)
# Pick the probability column that belongs to the spam class (label 1)
# from classes_ instead of assuming it is always column 1.
spam_column = list(best_model.classes_).index(1)
y_pred_proba = best_model.predict_proba(X_test)[:, spam_column]

# Display the classification report
print(" Classification Report:")
print(classification_report(y_test, y_pred, labels=[0, 1], target_names=['Ham', 'Spam']))

# Display the confusion matrix.
# labels=[0, 1] pins the matrix to 2x2 (rows = actual, columns = predicted),
# so the four-way unpacking below works even if one class is absent from
# both y_test and y_pred.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
print(" Confusion Matrix:")
tn, fp, fn, tp = cm.ravel()
print(f"True Negatives (Ham correctly identified): {tn}")
print(f"False Positives (Ham incorrectly marked as Spam): {fp}")
print(f"False Negatives (Spam incorrectly marked as Ham): {fn}")
print(f"True Positives (Spam correctly identified): {tp}")


 DETAILED EVALUATION OF THE BEST MODEL
----------------------------------------
 Classification Report:
              precision    recall  f1-score   support

         Ham       0.98      1.00      0.99       966
        Spam       0.99      0.87      0.92       149

    accuracy                           0.98      1115
   macro avg       0.99      0.93      0.96      1115
weighted avg       0.98      0.98      0.98      1115

 Confusion Matrix:
True Negatives (Ham correctly identified): 965
False Positives (Ham incorrectly marked as Spam): 1
False Negatives (Spam incorrectly marked as Ham): 20
True Positives (Spam correctly identified): 129


 # **Model Persistence (Saving the Model)**
To use the model in the future without retraining, I save the entire pipeline to a file using joblib. This saved file contains the vectorizer, scaler, and the trained classifier, ready for instant use.

In [13]:
# Section banner
print("\n SAVING THE TRAINED MODEL", "-" * 40, sep="\n")

# Persist the selected pipeline (preprocessing + classifier) to disk
model_filename = 'spam_classifier_pipeline.joblib'
joblib.dump(best_model, filename=model_filename)
print(f" Model saved as '{model_filename}'")

# Trigger a browser download of the saved file from Colab
files.download(model_filename)


 SAVING THE TRAINED MODEL
----------------------------------------
 Model saved as 'spam_classifier_pipeline.joblib'


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

 # **Interactive Visualization Dashboard**
Here, I create a series of interactive plots using Plotly to visualize my results. This dashboard provides deep insights into the data distribution, feature importance, and the final model's performance with charts like the confusion matrix and ROC curve.

In [14]:
print("\n CREATING VISUALIZATION DASHBOARD")
print("-" * 40)

# 1. Confusion Matrix Heatmap (rows = actual class, columns = predicted class)
fig_cm = px.imshow(
    cm,
    labels={"x": "Predicted", "y": "Actual", "color": "Count"},
    x=['Ham', 'Spam'],
    y=['Ham', 'Spam'],
    text_auto=True,
    color_continuous_scale='Blues',
    title=f"Confusion Matrix for {best_model_name}",
)
fig_cm.show()

# 2. ROC Curve, built from the spam-class probabilities on the test set
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

roc_trace = go.Scatter(
    x=fpr,
    y=tpr,
    mode='lines',
    name=f'ROC Curve (AUC = {roc_auc:.3f})',
)
# Diagonal reference line for a classifier that guesses at random
random_guess_trace = go.Scatter(
    x=[0, 1],
    y=[0, 1],
    mode='lines',
    name='Random Guess',
    line={"dash": 'dash'},
)

fig_roc = go.Figure()
for trace in (roc_trace, random_guess_trace):
    fig_roc.add_trace(trace)
fig_roc.update_layout(
    title=f'Receiver Operating Characteristic (ROC) Curve for {best_model_name}',
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate',
)
fig_roc.show()

print(" Interactive dashboard created successfully!")


 CREATING VISUALIZATION DASHBOARD
----------------------------------------


 Interactive dashboard created successfully!


 # **Real-Time Prediction Demo**
Finally, I demonstrate the power of the saved model. I load the joblib file and use it to classify a list of new, unseen emails in real time. This showcases the practical application of the project.

In [34]:
print("\n REAL-TIME SPAM PREDICTION DEMO")
print("-" * 40)

# Define the model filename (assuming it was saved with this name)
model_filename = 'spam_classifier_pipeline.joblib'

# Load the saved model pipeline.
# NOTE: joblib files are pickle-based and can execute code when loaded;
# only load model files that you created yourself or otherwise trust.
loaded_model = joblib.load(model_filename)
print(" Model loaded successfully from file.")

# Create a sample of test emails
test_emails = [
    "Hi John, let's meet for coffee tomorrow at 3pm",
    "CONGRATULATIONS!!! You've WON $1,000,000 in our weekly prize draw!!! Click here NOW!!!",
    "Your meeting has been rescheduled to next Friday",
    "URGENT: Your bank account has been compromised! Please verify your details immediately!",
    "Happy birthday! Hope you have a wonderful day full of joy and celebration."
]

# Create a DataFrame for prediction, as the pipeline expects it
demo_df = pd.DataFrame(test_emails, columns=['message'])

# Preprocess the demo data (this step is crucial): the pipeline selects the
# 'clean_message' and numerical feature columns, which only exist after preprocessing
demo_df_processed = preprocess_data(demo_df)

# Predict using the loaded model
predictions = loaded_model.predict(demo_df_processed)
probabilities = loaded_model.predict_proba(demo_df_processed)

# predict_proba columns follow the order of classes_, so look the column up
# by class label instead of using the predicted label itself as an index.
proba_column = {label: column for column, label in enumerate(loaded_model.classes_)}

# Display results
for i, email in enumerate(test_emails):
    pred_label = "Spam" if predictions[i] == 1 else "Ham"
    confidence = probabilities[i][proba_column[predictions[i]]] * 100
    print(f"\nEmail: '{email[:60]}...'")
    print(f"  Prediction: {pred_label} (Confidence: {confidence:.2f}%)")

print("\n\n PROJECT COMPLETE! ")


 REAL-TIME SPAM PREDICTION DEMO
----------------------------------------
 Model loaded successfully from file.

 ADVANCED TEXT PREPROCESSING & FEATURE ENGINEERING
----------------------------------------
Extracting custom numerical features...
Cleaning text data...
 Preprocessing complete! Added 5 custom features.
 First 5 rows of the processed data:


Unnamed: 0,message,clean_message,length,word_count,capital_ratio,digit_count,currency_mentions
0,"Hi John, let's meet for coffee tomorrow at 3pm",john let meet coffee tomorrow,46,9,0.043478,1,0
1,"CONGRATULATIONS!!! You've WON $1,000,000 in ou...",congratulation weekly prize draw click,86,12,0.267442,7,2
2,Your meeting has been rescheduled to next Friday,meeting rescheduled next friday,48,8,0.041667,0,0
3,URGENT: Your bank account has been compromised...,urgent bank account compromised please verify ...,87,12,0.091954,0,0
4,Happy birthday! Hope you have a wonderful day ...,happy birthday hope wonderful day full joy cel...,74,13,0.027027,0,0



Email: 'Hi John, let's meet for coffee tomorrow at 3pm...'
  Prediction: Ham (Confidence: 98.37%)

Email: 'CONGRATULATIONS!!! You've WON $1,000,000 in our weekly prize...'
  Prediction: Spam (Confidence: 98.09%)

Email: 'Your meeting has been rescheduled to next Friday...'
  Prediction: Ham (Confidence: 97.70%)

Email: 'URGENT: Your bank account has been compromised! Please verif...'
  Prediction: Ham (Confidence: 92.19%)

Email: 'Happy birthday! Hope you have a wonderful day full of joy an...'
  Prediction: Ham (Confidence: 99.57%)


 PROJECT COMPLETE! 
