<a href="https://colab.research.google.com/github/tushack/Spam-SMS-Detection/blob/main/Spam_SMS_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [24]:
# Read the raw SMS corpus from the working directory.
# The file is decoded as Latin-1 instead of pandas' UTF-8 default.
df = pd.read_csv(
    'spam.csv',
    encoding='latin-1',
)

In [25]:
# Preview the first rows of the raw frame to inspect its column layout.
print(df.head())

     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


In [26]:
# List the raw column names so the useful ones can be picked out below.
print(df.columns)

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')


In [27]:
# Keep only the class label and the message text, under descriptive names.
# rename() ignores keys that are not present, so re-running this cell after the
# columns have already been renamed is a no-op instead of a KeyError on 'v1'/'v2'.
# The explicit copy() makes the result an independent frame, so the column
# assignments in later cells never write into a slice of the raw data.
df = df.rename(columns={'v1': 'label', 'v2': 'message'})[['label', 'message']].copy()

In [28]:
# Encode the class label as an integer: ham -> 0, spam -> 1.
# The identity entries (0 -> 0, 1 -> 1) make this cell safe to re-run: without
# them a second execution would map the already-encoded values to NaN.
df['label'] = df['label'].map({'ham': 0, 'spam': 1, 0: 0, 1: 1})

# Series.map yields NaN for any value missing from the mapping; fail loudly here
# rather than letting NaN labels reach the stratified split / model fit.
assert df['label'].notna().all(), "unexpected value in 'label' column (expected 'ham' or 'spam')"

# Data Cleaning and Preprocessing
def clean_text(text):
    """Normalise a raw SMS message for vectorisation.

    Steps, in order:
      1. lower-case the text;
      2. drop URL-like tokens (anything starting with ``http`` or ``www``
         up to the next whitespace);
      3. drop every character that is not a lower-case ASCII letter, a digit,
         an apostrophe or whitespace;
      4. collapse each run of whitespace to one space and trim both ends.

    Parameters
    ----------
    text : str
        The raw message. ``None`` and float NaN (pandas' missing-value marker)
        are treated as an empty message; any other non-string value is
        converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned message, possibly empty.
    """
    # Missing values would otherwise fail on .lower(); NaN is the only float
    # that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+", "", text)
    # Keep *all* whitespace here (not just ' '): tabs and newlines must survive
    # this step so the next one can turn them into a single space. Deleting
    # them outright would glue adjacent words together ("hello\nworld" ->
    # "helloworld"). Upper-case letters cannot occur after lower().
    text = re.sub(r"[^a-z0-9'\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [29]:
# Normalise every message element-wise with clean_text before vectorisation.
df['message'] = df['message'].map(clean_text)

In [30]:
# Hold out 30% of the messages for evaluation. The split is stratified on the
# label so both parts keep the same ham/spam ratio, and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    df['message'],
    df['label'],
    test_size=0.3,
    random_state=42,
    stratify=df['label'],
)

In [31]:
# TF-IDF over unigrams and bigrams, English stop words removed, vocabulary
# capped at 5000 terms.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
    ngram_range=(1, 2),
)

# Learn the vocabulary / IDF weights from the training messages only, then
# reuse that fitted vectorizer unchanged for the test messages.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [32]:
# Fit a multinomial Naive Bayes classifier on the TF-IDF training features.
# alpha is the additive smoothing strength; class_prior pins both class priors
# to 0.5 instead of estimating them from the training labels.
model = MultinomialNB(alpha=0.1, class_prior=[0.5, 0.5])
model.fit(X_train_tfidf, y_train)

# Score the held-out messages.
y_pred = model.predict(X_test_tfidf)

# Summarise performance on the test split.
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", test_report)
print("\nConfusion Matrix:\n", test_confusion)




Accuracy: 0.9706937799043063

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.98      1448
           1       0.86      0.94      0.90       224

    accuracy                           0.97      1672
   macro avg       0.92      0.96      0.94      1672
weighted avg       0.97      0.97      0.97      1672


Confusion Matrix:
 [[1413   35]
 [  14  210]]


In [33]:
# Test a model
def predict_message(message):
    """Classify a single raw SMS string.

    The message is normalised with ``clean_text``, converted to features with
    the already-fitted ``vectorizer`` and passed to ``model.predict``.

    Parameters
    ----------
    message : str
        The raw message text.

    Returns
    -------
    str
        ``'Spam'`` when the predicted label equals 1, otherwise ``'Not Spam'``.
    """
    # The vectorizer expects an iterable of documents, hence the one-item list.
    features = vectorizer.transform([clean_text(message)])
    predicted_label = model.predict(features)[0]
    if predicted_label == 1:
        return 'Spam'
    return 'Not Spam'

# Smoke-test the classifier on a sample promotional message.
sample_sms = "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entr..."
print(predict_message(sample_sms))

Spam
