In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the spam dataset into a DataFrame, decoding the file as ISO-8859-1 (Latin-1).
# NOTE(review): presumably the file contains bytes that are not valid UTF-8 — confirm.
df = pd.read_csv('spam.csv', encoding='ISO-8859-1')

In [4]:
# Peek at five randomly chosen rows; no random_state is given, so the rows differ on every run.
df.sample(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
3990,ham,I am on the way to ur home,,,
4597,ham,I'm stuck in da middle of da row on da right h...,,,
3406,ham,Beautiful Truth against Gravity.. Read careful...,,,
3021,ham,How dare you change my ring,,,
1493,ham,How are you with moneY...as in to you...money ...,,,


In [5]:
# (number of rows, number of columns) of the freshly loaded frame.
df.shape

(5572, 5)

In [6]:
# 1. Data cleaning
# 2. EDA
# 3. Text Preprocessing
# 4. Model building
# 5. Evaluation
# 6. Improvement
# 7. Website
# 8. Deploy

1. Data Cleaning
Data cleaning is the process of identifying and correcting errors, inconsistencies, and inaccuracies in a dataset.


In [7]:
# Summarise each column's dtype and non-null count to spot columns that need cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [8]:
# Discard the three mostly-empty columns (only 50, 12 and 6 non-null values out of 5572 rows).
# Reassigning the result instead of using inplace=True, together with errors='ignore',
# keeps this cell safe to re-run: a second execution no longer raises KeyError because
# the columns are already gone.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [9]:
# Re-inspect the frame now that the 'Unnamed: *' columns have been dropped.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [10]:
# Give the two remaining columns descriptive names: v1 holds the label, v2 the message text.
df.rename(
    columns={'v1': 'target', 'v2': 'message'},
    inplace=True,
)

# Show five random rows to confirm the new column names.
df.sample(n=5)

Unnamed: 0,target,message
2111,ham,Yar he quite clever but aft many guesses lor. ...
2313,ham,So what do you guys do.
2423,ham,Lmao but its so fun...
1788,ham,Arun can u transfr me d amt
4817,ham,How's ur paper?


In [14]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

# Download the NLTK corpora used by preprocess_text (NLTK skips packages that are already up to date)
nltk.download('stopwords')
nltk.download('wordnet')

# The dataframe 'df' is already loaded by an earlier cell and must contain the
# message text and its ham/spam label.

# Verify columns
print(df.head())

# Normalise the column names. rename() ignores names that are absent, so this is a
# no-op when an earlier cell already renamed them. Reassigning avoids in-place mutation.
df = df.rename(columns={'v1': 'target', 'v2': 'message'})

# Keep only the required columns. The explicit copy makes the column assignment below
# act on an independent frame rather than on a slice (no SettingWithCopyWarning).
df = df[['message', 'target']].copy()

# Encode the labels as integers (ham -> 0, spam -> 1). The mapping is skipped when the
# column is already integer-typed, so re-running this cell does not turn every label
# into NaN and then crash inside astype(int).
if not pd.api.types.is_integer_dtype(df['target']):
    label_codes = df['target'].map({'ham': 0, 'spam': 1})
    unmapped = label_codes.isna()
    if unmapped.any():
        # Fail with the offending values instead of an opaque NaN-to-int cast error
        bad_labels = sorted(df.loc[unmapped, 'target'].astype(str).unique())
        raise ValueError(f"Unexpected target labels (expected 'ham' or 'spam'): {bad_labels}")
    df['target'] = label_codes.astype(int)

# Text Preprocessing Function
# NLTK-backed defaults are created lazily, once, and reused by every call.
_NLTK_RESOURCES = {}


def _get_nltk_resources():
    """Return ``(stop_words, lemmatize)`` built from NLTK, creating them on first use."""
    if not _NLTK_RESOURCES:
        # stopwords.words() re-reads the corpus on every call and returns a list;
        # read it once and keep a set so membership tests are O(1).
        _NLTK_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _NLTK_RESOURCES['lemmatize'] = WordNetLemmatizer().lemmatize
    return _NLTK_RESOURCES['stop_words'], _NLTK_RESOURCES['lemmatize']


def preprocess_text(text, stop_words=None, lemmatize=None):
    """Normalise a message for vectorisation.

    Steps: lowercase, replace every non-word character with a space, split on
    whitespace, drop stopwords, lemmatize the remaining tokens, and join them
    with single spaces.

    Args:
        text: The raw message. ``None`` and float NaN are treated as an empty
            message; any other non-string value is converted with ``str()``.
        stop_words: Optional container of lowercase words to remove. Defaults to
            NLTK's English stopword list.
        lemmatize: Optional callable mapping one token to its lemma. Defaults to
            ``WordNetLemmatizer().lemmatize``.

    Returns:
        The cleaned message as a single space-separated string (possibly empty).
    """
    # Missing values (None / NaN) carry no text
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Only touch NLTK when the caller did not supply both collaborators
    if stop_words is None or lemmatize is None:
        default_stop_words, default_lemmatize = _get_nltk_resources()
        if stop_words is None:
            stop_words = default_stop_words
        if lemmatize is None:
            lemmatize = default_lemmatize

    text = str(text).lower()  # Convert to lowercase
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    # str.split() with no argument already collapses runs of whitespace
    words = [word for word in text.split() if word not in stop_words]  # Remove stopwords
    return ' '.join(lemmatize(word) for word in words)  # Lemmatization

# Clean every message with preprocess_text (overwrites the 'message' column)
df['message'] = df['message'].apply(preprocess_text)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    df['message'],
    df['target'],
    test_size=0.2,
    random_state=42,
)

# Text classification pipeline: word counts -> TF-IDF weighting -> multinomial Naive Bayes,
# fitted on the training split (fit() returns the pipeline itself)
model = Pipeline(
    steps=[
        ('vectorizer', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('classifier', MultinomialNB()),
    ]
).fit(X_train, y_train)

# Evaluate on the held-out split
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Function for Real-time Spam Detection
def predict_spam(message):
    """Classify a single raw message with the fitted ``model``.

    The message is cleaned with ``preprocess_text`` before being passed to the
    pipeline, mirroring how the training data was prepared.

    Args:
        message: The raw message text.

    Returns:
        ``"Spam"`` when the model predicts label 1, otherwise ``"Not Spam"``.
    """
    cleaned = preprocess_text(message)
    # predict() expects an iterable of documents, so wrap the single message in a list
    label = model.predict([cleaned])[0]
    if label == 1:
        return "Spam"
    return "Not Spam"

# Try the classifier on one hand-written message
user_message = "You have won a free iPhone! Click here to claim."
verdict = predict_spam(user_message)
print(f"Message: {user_message} → Prediction: {verdict}")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vansh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vansh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                             message target
0  Go until jurong point, crazy.. Available only ...    ham
1                      Ok lar... Joking wif u oni...    ham
2  Free entry in 2 a wkly comp to win FA Cup fina...   spam
3  U dun say so early hor... U c already then say...    ham
4  Nah I don't think he goes to usf, he lives aro...    ham
Accuracy: 0.9659192825112107

Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.75      0.85       150

    accuracy                           0.97      1115
   macro avg       0.98      0.87      0.92      1115
weighted avg       0.97      0.97      0.96      1115

Message: You have won a free iPhone! Click here to claim. → Prediction: Spam
