In [None]:
# Mount Google Drive at /content/drive; the dataset path read later in this
# script (file_path) lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Import NLTK and download the stopwords corpus that stopwords.words('english')
# reads during the data-cleaning setup.
import nltk
nltk.download("stopwords")

# Other imports for data cleaning
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string

# Additional setup for data cleaning
# Characters removed from every token (ASCII punctuation).
tokill = string.punctuation
# English stopwords, held as a set: data_cleaning tests membership once per
# word, which is a constant-time lookup on a set instead of a linear scan of
# the list that stopwords.words() returns.
sw = set(stopwords.words('english'))
# Porter stemmer applied to each surviving token.
ps = PorterStemmer()

# Step 1: Reading the data
file_path = '/content/drive/MyDrive/SPAM SMS/spam.csv'

# Load only the two columns the pipeline uses (v1 = label, v2 = message text).
# Selecting them by name, instead of dropping "Unnamed: 2/3/4" afterwards,
# also works on copies of the CSV that do not carry those extra columns.
df = pd.read_csv(file_path, encoding='latin1', usecols=['v1', 'v2'])

# Renaming Target columns
df = df.rename(columns={'v1': "Target", 'v2': 'mail'})

# Data cleaning function
def data_cleaning(x):
    """Normalise one message so it can be fed to the vectorizer.

    The text is lower-cased and split on whitespace. For every word the
    characters in ``tokill`` are removed, stopwords (``sw``) are discarded,
    and the remainder is stemmed with ``ps``.

    Args:
        x: The raw message text (a ``str``).

    Returns:
        The surviving stems joined by single spaces; an empty string when
        nothing survives.
    """
    # One C-level pass per word instead of a Python loop over its letters.
    strip_punctuation = str.maketrans("", "", tokill)

    stems = []
    for raw_word in x.lower().split():
        token = raw_word.translate(strip_punctuation)
        # Tokens that were pure punctuation ("...", "!!") carry no signal and
        # would otherwise end up as empty strings in the output.
        if not token:
            continue
        # Check the raw form so contractions such as "don't" still match the
        # stopword list, and the stripped form so that stopwords with
        # punctuation attached ("the,", "you!") are removed as well.
        if raw_word in sw or token in sw:
            continue
        stems.append(ps.stem(token))
    return " ".join(stems)

# Normalise every message: lower-casing, stopword removal, punctuation
# stripping and stemming (see data_cleaning).
df["mail"] = df["mail"].apply(data_cleaning)

# Changing targets from categorical to numerical columns
# Series.map turns every label missing from the mapping into NaN, which would
# only surface later as an opaque estimator error, so fail here with the
# offending values instead.
target_codes = df["Target"].map({"ham": 0, "spam": 1})
unexpected_labels = df.loc[target_codes.isna(), "Target"].unique()
if len(unexpected_labels):
    raise ValueError(
        f"Unexpected values in Target column: {unexpected_labels.tolist()}"
    )
df["Target"] = target_codes

# Training and testing models
# 30% of the rows are held out for evaluation; random_state pins the shuffle.
xtrain, xtest, ytrain, ytest = train_test_split(df["mail"], df["Target"], test_size=0.3, random_state=42)

# TF-IDF Vectorizer
# The vocabulary and IDF weights are learned from the training messages only;
# the test messages are transformed with that same fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer()
# NOTE(review): .toarray() converts the sparse TF-IDF matrices to dense arrays.
# It is kept so these variables stay plain ndarrays for any later code; confirm
# whether the dense form is actually required before removing it.
xtrain_tfidf = tfidf_vectorizer.fit_transform(xtrain).toarray()
xtest_tfidf = tfidf_vectorizer.transform(xtest).toarray()

def _fit_and_report(title, model, x_train, y_train, x_test, y_test):
    """Fit ``model``, predict on the test features and print its metrics.

    Args:
        title: Heading printed above the metrics (a trailing colon is added).
        model: Estimator exposing ``fit`` and ``predict``.
        x_train: Training feature matrix.
        y_train: Training labels.
        x_test: Test feature matrix.
        y_test: True labels for the test rows.

    Returns:
        The predictions produced for ``x_test``.
    """
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)
    print(f"{title}:")
    print("Accuracy:", accuracy_score(y_test, predictions))
    print("Classification Report:")
    print(classification_report(y_test, predictions))
    return predictions


# Every model is trained and scored on the same TF-IDF split.
_eval_data = (xtrain_tfidf, ytrain, xtest_tfidf, ytest)

# Logistic Regression
lr = LogisticRegression()
pred_lr = _fit_and_report("Logistic Regression", lr, *_eval_data)

# Random Forest
# Seeded with the same value as the train/test split so that repeated runs
# report the same metrics; an unseeded forest differs from run to run.
rfc = RandomForestClassifier(random_state=42)
print()  # blank line between model reports
pred_rfc = _fit_and_report("Random Forest", rfc, *_eval_data)

# Naive Bayes
nb = MultinomialNB()
print()
pred_nb = _fit_and_report("Naive Bayes", nb, *_eval_data)

# Support Vector Machine
svm = SVC()
print()
pred_svm = _fit_and_report("Support Vector Machine", svm, *_eval_data)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Logistic Regression:
Accuracy: 0.9521531100478469
Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      1453
           1       0.97      0.65      0.78       219

    accuracy                           0.95      1672
   macro avg       0.96      0.83      0.88      1672
weighted avg       0.95      0.95      0.95      1672


Random Forest:
Accuracy: 0.9730861244019139
Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1453
           1       0.99      0.80      0.89       219

    accuracy                           0.97      1672
   macro avg       0.98      0.90      0.94      1672
weighted avg       0.97      0.97      0.97      1672


Naive Bayes:
Accuracy: 0.9599282296650717
Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1453
           1       1.00      0