In [1]:
# Colab-only setup: mount Google Drive at /content/drive so files stored there
# can be read through ordinary filesystem paths.
# NOTE(review): presumably prompts for authorization interactively on first run — confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Path to the spam dataset CSV, located under the Drive mount point (/content/drive).
file_path = '/content/drive/My Drive/training/spam.csv'

In [3]:
# Import required libraries
import pandas as pd
import numpy as np
import string
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [4]:
# Load the CSV, keep only the two columns used below (v1, v2), and give them
# descriptive names: v1 -> label, v2 -> text.
df = (
    pd.read_csv(file_path, encoding='latin-1')[['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'text'})
)

In [5]:
# Encode labels as integers: ham -> 0, spam -> 1.
label_to_id = {'ham': 0, 'spam': 1}

# Re-run guard: if the column already holds only encoded values, mapping it
# again would look up 0/1 in the dict, find nothing, and turn every label
# into NaN. Skip the mapping in that case so the cell is safe to re-execute.
if not df['label'].isin(list(label_to_id.values())).all():
    encoded_labels = df['label'].map(label_to_id)

    # Series.map yields NaN for values missing from the dict; fail loudly here
    # instead of letting NaN labels reach the model.
    unmapped = encoded_labels.isna()
    if unmapped.any():
        unknown = sorted(df.loc[unmapped, 'label'].astype(str).unique())
        raise ValueError(f"Unexpected label values (expected 'ham' or 'spam'): {unknown}")

    df['label'] = encoded_labels.astype('int64')

In [6]:
# Preprocessing function
def preprocess(text):
    """Normalize a raw message before vectorization.

    Steps, applied in order:
      1. lowercase the text,
      2. remove every run of digits,
      3. remove every character that is neither a word character nor
         whitespace (punctuation, symbols); underscores count as word
         characters and are therefore kept,
      4. strip leading/trailing whitespace.

    Args:
        text: The message to clean. ``None`` and float NaN are treated as an
            empty message; any other non-string value is converted with
            ``str()`` first.

    Returns:
        str: The cleaned message (possibly empty).
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text.strip()

# Clean every message in place of the raw text (element-wise call to preprocess).
df['text'] = df['text'].map(preprocess)

In [7]:
# Hold out 20% of the messages for evaluation; the fixed random_state makes
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    random_state=42,
    test_size=0.2,
)

In [8]:
# TF-IDF features with English stop words removed.
# The vectorizer is fitted on the training messages only; the test messages
# are transformed with that already-fitted vectorizer.
feature_extraction = TfidfVectorizer(stop_words='english')
feature_extraction.fit(X_train)

X_train_features = feature_extraction.transform(X_train)
X_test_features = feature_extraction.transform(X_test)

In [9]:
# Fit a multinomial Naive Bayes classifier on the training TF-IDF features.
model = MultinomialNB()
model.fit(X=X_train_features, y=y_train)

In [10]:
# Evaluate on the held-out test set: compute all metrics first, then report them.
y_pred = model.predict(X_test_features)

test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
test_accuracy_pct = accuracy_score(y_test, y_pred) * 100

print("Confusion Matrix:")
print(test_confusion)
print("\nClassification Report:")
print(test_report)
print(f"Accuracy: {test_accuracy_pct:.2f}%")


Confusion Matrix:
[[965   0]
 [ 35 115]]

Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.77      0.87       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.93      1115
weighted avg       0.97      0.97      0.97      1115

Accuracy: 96.86%


In [11]:
# Classify a message typed in by the user.
# The text goes through the same preprocessing and the already-fitted TF-IDF
# vectorizer as the training data before being passed to the model.
print("\n--- Test Your Own Email Message ---")
input_your_mail = input("Enter your email text:\n")
input_your_mail = preprocess(input_your_mail)
input_data_features = feature_extraction.transform([input_your_mail])
prediction = model.predict(input_data_features)

if input_data_features.nnz == 0:
    # No stored entries in the feature row: the message was empty or contained
    # no term known to the vectorizer, so the model had no evidence from the
    # text and the prediction above is not a meaningful verdict.
    print("No recognizable words in the message - unable to classify it reliably.")
elif prediction[0] == 1:
    print("Email is Spam BEWARE!!")
else:
    print("Email is Not Spam")


--- Test Your Own Email Message ---
Enter your email text:
Dear John, I hope this email finds you well. Attached is the report you requested for our upcoming meeting. Please review it at your convenience
Email is Not Spam
