In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

nltk.download('stopwords')
nltk.download('wordnet')

!pip install zipfile36

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Collecting zipfile36
  Downloading zipfile36-0.1.3-py3-none-any.whl.metadata (736 bytes)
Downloading zipfile36-0.1.3-py3-none-any.whl (20 kB)
Installing collected packages: zipfile36
Successfully installed zipfile36-0.1.3


In [3]:
!unzip /content/spam.zip

unzip:  cannot find or open /content/spam.zip, /content/spam.zip.zip or /content/spam.zip.ZIP.


In [4]:
data = pd.read_csv('spam.csv',encoding='Windows-1252')
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)
data['v1'] = data['v1'].astype(str)
data.dropna(subset=['v1'], inplace=True)
data['v1'] = data['v1'].apply(preprocess_text)


In [5]:
label_encoder = LabelEncoder()
data['v1'] = label_encoder.fit_transform(data['v1'])
X = data['v2']
y = data['v1']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [6]:
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)
nb_predictions = nb_model.predict(X_test_tfidf)

In [7]:
lr_model = LogisticRegression()
lr_model.fit(X_train_tfidf, y_train)
lr_predictions = lr_model.predict(X_test_tfidf)

In [8]:
svm_model = SVC(kernel='linear')
svm_model.fit(X_train_tfidf, y_train)
svm_predictions = svm_model.predict(X_test_tfidf)

In [9]:
def evaluate_model(predictions, model_name):
    print(f"{model_name} Model Performance:")
    print(f"Accuracy: {accuracy_score(y_test, predictions):.2f}")
    print(f"Precision: {precision_score(y_test, predictions):.2f}")
    print(f"Recall: {recall_score(y_test, predictions):.2f}")
    print(f"F1 Score: {f1_score(y_test, predictions):.2f}")
    print()
evaluate_model(nb_predictions, "Naive Bayes")
evaluate_model(lr_predictions, "Logistic Regression")
evaluate_model(svm_predictions, "SVM")

Naive Bayes Model Performance:
Accuracy: 0.97
Precision: 1.00
Recall: 0.75
F1 Score: 0.86

Logistic Regression Model Performance:
Accuracy: 0.97
Precision: 0.99
Recall: 0.77
F1 Score: 0.87

SVM Model Performance:
Accuracy: 0.98
Precision: 0.99
Recall: 0.89
F1 Score: 0.93



In [10]:
import joblib

# Assuming you have a trained model (e.g., svm_model) and TF-IDF vectorizer (tfidf)
joblib.dump(svm_model, 'sms_spam_classifier_model.pkl')
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

In [11]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import joblib

# Load the trained model and TF-IDF vectorizer
model = joblib.load('sms_spam_classifier_model.pkl')
tfidf = joblib.load('tfidf_vectorizer.pkl')

# Preprocess function (same as used for training)
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)
def predict_sms(text):
    preprocessed_text = preprocess_text(text)
    text_tfidf = tfidf.transform([preprocessed_text])
    prediction = model.predict(text_tfidf)
    if prediction[0] == 1:
        return "Spam"
    else:
        return "Ham"


In [15]:
user_sms = input("Enter the SMS text: ")
result = predict_sms(user_sms)
print(f"The SMS is classified as: {result}")

Enter the SMS text: Even my brother is not like to speak with me. They treat me like aids patent.
The SMS is classified as: Ham


DATA PREPROCESSING