<a href="https://colab.research.google.com/github/zaintariq967/skills-introduction-to-github/blob/main/EMAIL_SPAM_PROJECT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Basic Data Handling
import pandas as pd
import numpy as np

# Regular Expressions (for text cleaning)
import re

# Natural Language Processing
import nltk
from nltk.corpus import stopwords

# Machine Learning Models
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

# Model Evaluation
from sklearn.metrics import classification_report, confusion_matrix

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns


In [4]:
import pandas as pd
data=pd.read_csv('spam_Emails_data.csv')
print(data.head())
print(data.columns)
print(data.isnull().sum())
print(data['label'].value_counts())

  label                                               text
0  Spam  viiiiiiagraaaa\nonly for the ones that want to...
1   Ham  got ice thought look az original message ice o...
2  Spam  yo ur wom an ne eds an escapenumber in ch ma n...
3  Spam  start increasing your odds of success & live s...
4   Ham  author jra date escapenumber escapenumber esca...
Index(['label', 'text'], dtype='object')
label    0
text     0
dtype: int64
label
Ham     3668
Spam    3259
Name: count, dtype: int64


In [5]:
# Cleaning the data (full cleaning)
import re
import pandas as pd
import numpy as np

def clean_text(text):
    # Check if the value is NaN or not a string
    if isinstance(text, float) and np.isnan(text):
        return ""

    # Convert to string if it's not already
    text = str(text)

    # Lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+|www.\S+', '', text)
    # Remove punctuations and numbers
    text = re.sub(r'[^a-z\s]', '', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply the function and handle NaN values
data['clean_text'] = data['text'].apply(clean_text)

In [6]:
# Removing stop keywords
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    return ' '.join([word for word in text.split() if word not in stop_words])

data['clean_text'] = data['clean_text'].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [8]:
# Text feature extraction TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=5000)  # Take top 5000 most important words
X_text = tfidf.fit_transform(data['clean_text']).toarray()

In [9]:
#  now extract structural patterns like: these are features extracted

# 1) Email Length (more words = might be spam)
# Convert non-string entries to string or empty string
data['email_length'] = data['text'].fillna('').astype(str).apply(len)

# 2) Number of Links (spams often have lots of links)
import re

def count_links(text):
    if isinstance(text, str):
        return len(re.findall(r'http\S+', text))
    return 0

data['num_links'] = data['text'].apply(count_links)

# 3) Capital Letter Ratio (spams often SHOUT)
def capital_ratio(text):
    if not isinstance(text, str):
        text = str(text) if text is not None else ''
    capitals = sum(1 for c in text if c.isupper())
    return capitals / max(1, len(text))

data['capital_ratio'] = data['text'].apply(capital_ratio)

# 4) Presence of Spammy Words ("FREE", "WIN", "URGENT", etc.)
def has_spammy_words(text):
    spammy = ['free', 'win', 'guarantee', 'winner', 'urgent']
    text = str(text).lower()
    return int(any(word in text for word in spammy))

data['spammy_words'] = data['text'].apply(has_spammy_words)

In [10]:
# Now Merging all 4 features
import numpy as np

# Ensure no NaNs in structural features : safe handling data
data[['email_length', 'num_links', 'capital_ratio', 'spammy_words']] = \
    data[['email_length', 'num_links', 'capital_ratio', 'spammy_words']].fillna(0)

# Extract structural features as a NumPy array
X_structural = data[['email_length', 'num_links', 'capital_ratio', 'spammy_words']].values

# Merge TF-IDF text features with structural features
X_final = np.hstack((X_text, X_structural))   # complete feature set for machine learning features + all data

In [11]:
# prepare labels
y_final = data['label'].map({'ham': 0, 'spam': 1})  # Map ham -> 0, spam -> 1

In [12]:
#print(y_final)
print(data['label'].isna().sum())  # This will show the number of NaN values in your labels column

0


In [13]:
# print(X_final.shape)
# print(data['label'].shape)  # Check the shape of the label column
# Remove rows where 'label' column has NaN values

# Remove rows where 'label' is NaN
data = data.dropna(subset=['label'])

# Reset the index to keep things clean (optional but recommended)
data = data.reset_index(drop=True)

# Now extract labels
y_final = data['label'].values

In [14]:
# check if any NaNs remain
print(data['label'].isnull().sum())  # Correct way for object/string types

0


In [15]:
# check sahpe of X and y final
print("X_final shape:", X_final.shape)
print("y_final shape:", y_final.shape)

X_final shape: (6927, 5004)
y_final shape: (6927,)


In [16]:
# cheching features of first mail
print("First email features (X_final):", X_final[0])

First email features (X_final): [0. 0. 0. ... 0. 0. 0.]


In [17]:
print(data['label'].unique())
data['label'] = data['label'].str.strip().str.lower()
y_final = data['label'].map({'ham': 0, 'spam': 1})

['Spam' 'Ham']


In [18]:
# check label values
print("First 10 labels:", y_final[:10].values)

First 10 labels: [1 0 1 1 0 1 0 0 0 0]


In [19]:
print(y_final.value_counts())

label
0    3668
1    3259
Name: count, dtype: int64


In [20]:
print(X_final[10])

[0. 0. 0. ... 0. 0. 1.]


In [21]:
# saving cleaned data after data preprocessing
data.to_csv('cleaned_dataset.csv', index=False)

In [22]:
# Train-Test Split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.2, random_state=42)
# 80% training 20% testing

In [42]:
# Now Training ML Models

# 1) Naive Bayes
from sklearn.naive_bayes import MultinomialNB

nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

In [24]:
# 2) Logistic Regression
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [37]:
# 3 ) SVM
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Scaling the features
scaler = StandardScaler(with_mean=False)  # with_mean=False for sparse matrices
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Choose SVM model (LinearSVC for linear kernel or SVC for other kernels)
svm_model = LinearSVC(class_weight='balanced', max_iter=5000)  # For linear kernel
# If you want to use the SVC with RBF or other kernels, uncomment the following:
# svm_model = SVC(class_weight='balanced', max_iter=5000)

# Fit the model using the scaled data
svm_model.fit(X_train_scaled, y_train)

In [32]:
# 4)  Random Forest
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)


In [61]:
# Comparing model results
from sklearn.metrics import classification_report, confusion_matrix

models = [nb_model, lr_model,svm_model, rf_model]
names = ['Naive Bayes', 'Logistic Regression','SVM', 'Random Forest']


for model, name in zip(models, names):
    y_pred = model.predict(X_test)
    print(f"Results for {name}:")
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    print("\n")


Results for Naive Bayes:
              precision    recall  f1-score   support

           0       0.80      0.43      0.56       747
           1       0.57      0.87      0.69       639

    accuracy                           0.63      1386
   macro avg       0.68      0.65      0.62      1386
weighted avg       0.69      0.63      0.62      1386

[[320 427]
 [ 81 558]]


Results for Logistic Regression:
              precision    recall  f1-score   support

           0       0.95      0.96      0.96       747
           1       0.95      0.95      0.95       639

    accuracy                           0.95      1386
   macro avg       0.95      0.95      0.95      1386
weighted avg       0.95      0.95      0.95      1386

[[715  32]
 [ 35 604]]


Results for SVM:
              precision    recall  f1-score   support

           0       0.54      1.00      0.70       747
           1       0.67      0.00      0.01       639

    accuracy                           0.54      1386
   

In [62]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Predict using best model (Logistic Regression)
best_model = lr_model
y_pred = best_model.predict(X_test)

# Display results
print("Final Results for Best Model: Logistic Regression")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Final Results for Best Model: Logistic Regression
Accuracy: 0.9517
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.96      0.96       747
           1       0.95      0.95      0.95       639

    accuracy                           0.95      1386
   macro avg       0.95      0.95      0.95      1386
weighted avg       0.95      0.95      0.95      1386

Confusion Matrix:
[[715  32]
 [ 35 604]]


In [63]:
# Now deploy Best Model to Gradio
!pip install gradio --quiet

[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m54.1/54.1 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m322.9/322.9 kB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m95.2/95.2 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m11.5/11.5 MB[0m [31m102.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m72.0/72.0 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K 

In [69]:
import gradio as gr

# Assuming you already trained:
# - vectorizer (e.g., TfidfVectorizer or CountVectorizer)
# - Logistic Regression model (lr_model)

# Sample preprocessing + prediction function
def predict_spam(email_text):
    # Vectorize input text
    vectorized_text = vectorizer.transform([email_text])
    prediction = lr_model.predict(vectorized_text)[0]

    # Convert prediction to label
    label = "Spam" if prediction == 1 else "Not Spam"
    return label

In [71]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import joblib

# Example dataset
emails = ["Win a FREE vacation now", "Meeting scheduled at 10 AM", "Claim your prize!!!", "Let's review the document"]
labels = [1, 0, 1, 0]  # 1 = spam, 0 = not spam

# Vectorize
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(emails)

# Train
model = LogisticRegression()
model.fit(X, labels)

# Save model and vectorizer
joblib.dump(vectorizer, "vectorizer.pkl")
joblib.dump(model, "best_model.pkl")

['best_model.pkl']

In [72]:
import gradio as gr
import joblib  # or pickle
import numpy as np

# Load your trained vectorizer and model
vectorizer = joblib.load("vectorizer.pkl")
model = joblib.load("best_model.pkl")  # e.g., your Logistic Regression model

def predict_email(email_text):
    try:
        # Preprocess and vectorize the input
        X = vectorizer.transform([email_text])

        # Predict
        pred = model.predict(X)[0]

        return "Spam" if pred == 1 else "Not Spam"

    except Exception as e:
        return f"Error: {str(e)}"

# Gradio UI
iface = gr.Interface(
    fn=predict_email,
    inputs=gr.Textbox(lines=10, label="Enter an email message"),
    outputs="text",
    title="üìß Spam Email Classifier",
    description="Enter an email message to detect if it's Spam or Not Spam."
)

iface.launch()

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://d4928f6a1a43a6a66c.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


