Importing neccessary libraries

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import string


Step 1: Load the Dataset

In [11]:
df = pd.read_csv('/content/drive/MyDrive/spam.csv', encoding='latin-1')


Step 2: Preprocess the Dataset

In [12]:
df = df.rename(columns={'v1': 'label', 'v2': 'message'})
df = df[['label', 'message']]


Step 3: Encode Labels

In [13]:
le = LabelEncoder()
df['label'] = le.fit_transform(df['label'])


Step 4: Clean the Text Data

In [14]:
def clean_text(text):
    text = str(text).lower()
    return ''.join([char for char in text if char not in string.punctuation])

df['message'] = df['message'].apply(clean_text)


Step 5: Feature Extraction (TF-IDF)

In [15]:
tfidf = TfidfVectorizer(stop_words='english')
X = tfidf.fit_transform(df['message'])
y = df['label']


Step 6: Split Data for Training and Testing

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Step 7: Train Multiple Models

In [17]:
models = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=42)
}


Step 8: Evaluate Each Model

In [18]:
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"{name}: Accuracy = {accuracy_score(y_test, y_pred):.4f}")
    #print(classification_report(y_test, y_pred, target_names=['Ham', 'Spam']))


Naive Bayes: Accuracy = 0.9677
Logistic Regression: Accuracy = 0.9399
Random Forest: Accuracy = 0.9722


Step 9: Predict New SMS Message

In [19]:
sample = ["Congratulations! You've won a free vacation to the Bahamas! Reply now!"]
sample_clean = [clean_text(msg) for msg in sample]
sample_vect = tfidf.transform(sample_clean)
prediction = models["Naive Bayes"].predict(sample_vect)
