In [1]:
import numpy as np
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

In [3]:
# Read the raw SMS dataset and keep only its first two columns,
# which are renamed to "label" and "message".
df = pd.read_csv("spam.csv", encoding="latin-1").iloc[:, :2]
df.columns = ["label", "message"]

# Encode the target numerically: ham -> 0, spam -> 1.
df["label"] = df["label"].map({"ham": 0, "spam": 1})

# Quick sanity check: first rows and class balance.
print(df.head())
print(df["label"].value_counts())

   label                                            message
0      0  Go until jurong point, crazy.. Available only ...
1      0                      Ok lar... Joking wif u oni...
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor... U c already then say...
4      0  Nah I don't think he goes to usf, he lives aro...
label
0    4825
1     747
Name: count, dtype: int64


#### Text Preprocessing

In [4]:
# Fetch the NLTK resources this notebook needs; when a package is already
# present the call only reports that it is up to date.
# "stopwords" backs the stop-word filtering done in preprocess_text.
nltk.download("stopwords")
# NOTE(review): "wordnet" is not referenced by any cell visible here —
# confirm a later cell (e.g. a lemmatizer) actually needs it.
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sunny\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sunny\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
# Translation table that deletes every ASCII punctuation character.
# A regex class built by interpolating string.punctuation into "[...]" parses
# the "\]" pair as an escaped bracket, so a literal backslash was never
# removed; str.translate has no such escaping pitfalls.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)

# Lazily filled cache so the NLTK stop-word list is loaded only once instead of
# once per token.
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them on first use."""
    if "english" not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOP_WORD_CACHE["english"]


def preprocess_text(text, stop_words=None):
    """Normalize a message for vectorization.

    The text is lower-cased, every character in ``string.punctuation`` is
    deleted, the result is split on whitespace, and tokens found in
    ``stop_words`` are dropped.

    Parameters
    ----------
    text : str
        Raw message text.
    stop_words : collection of str, optional
        Tokens to discard. When omitted, NLTK's English stop-word list is
        used (loaded once and cached).

    Returns
    -------
    str
        The surviving tokens joined by single spaces ("" if none survive).
    """
    if stop_words is None:
        stop_words = _english_stop_words()

    stripped = text.lower().translate(_PUNCTUATION_TABLE)

    # NOTE(review): punctuation is removed before the stop-word lookup, so a
    # contraction such as "don't" becomes "dont" and is only filtered if that
    # exact spelling is in stop_words. Kept as in the original pipeline.
    return " ".join(token for token in stripped.split() if token not in stop_words)

# Store a cleaned version of every message next to the original text.
df["cleaned_text"] = df["message"].map(preprocess_text)
print(df.head())

   label                                            message  \
0      0  Go until jurong point, crazy.. Available only ...   
1      0                      Ok lar... Joking wif u oni...   
2      1  Free entry in 2 a wkly comp to win FA Cup fina...   
3      0  U dun say so early hor... U c already then say...   
4      0  Nah I don't think he goes to usf, he lives aro...   

                                        cleaned_text  
0  go jurong point crazy available bugis n great ...  
1                            ok lar joking wif u oni  
2  free entry 2 wkly comp win fa cup final tkts 2...  
3                u dun say early hor u c already say  
4        nah dont think goes usf lives around though  


#### Text Classification using Bag of Words (BoW)

In [6]:
# Hold out 20% of the cleaned messages for testing; the fixed random_state
# keeps the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df["cleaned_text"],
    df["label"],
    test_size=0.2,
    random_state=42,
)

# Bag-of-words counts feeding a multinomial Naive Bayes classifier.
# Pipeline.fit returns the fitted pipeline, so build and train in one step.
pipeline_bow = Pipeline(
    steps=[
        ("vectorizer", CountVectorizer()),
        ("classifier", MultinomialNB()),
    ]
).fit(X_train, y_train)

# Score the held-out messages.
y_pred = pipeline_bow.predict(X_test)
print("Accuracy (BoW):", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy (BoW): 0.9802690582959641
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.97      0.88      0.92       150

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115



#### Text Classification using TF-IDF

In [7]:
# TF-IDF weighted features feeding a multinomial Naive Bayes classifier.
# Uses the X_train / X_test / y_train / y_test split created in an earlier cell.
# Pipeline.fit returns the fitted pipeline, so build and train in one step.
pipeline_tfidf = Pipeline(
    steps=[
        ("vectorizer", TfidfVectorizer()),
        ("classifier", MultinomialNB()),
    ]
).fit(X_train, y_train)

# Score the held-out messages.
y_pred_tfidf = pipeline_tfidf.predict(X_test)
print("Accuracy (TF-IDF):", accuracy_score(y_test, y_pred_tfidf))
print(classification_report(y_test, y_pred_tfidf))


Accuracy (TF-IDF): 0.967713004484305
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.76      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115



#### Text Classification using Word Embeddings (Word2Vec)

In [8]:
from gensim.models import Word2Vec

# Word2Vec expects an iterable of token lists; the cleaned text is already
# whitespace-delimited, so a plain split is enough.
sentences = [text.split() for text in df["cleaned_text"]]

# Train the embedding model.
# Training is stochastic: a fixed seed plus a single worker thread removes the
# run-to-run variation caused by random initialisation and thread scheduling,
# so the neighbours printed below are repeatable on Restart & Run All.
# NOTE(review): the gensim docs say reproducibility across interpreter launches
# additionally needs PYTHONHASHSEED to be set — confirm if that matters here.
word2vec_model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=1,  # keep every token, even those seen only once
    workers=1,
    seed=42,
)
print(word2vec_model.wv.most_similar("free"))


[('mobile', 0.9997726082801819), ('txt', 0.9997410178184509), ('ur', 0.99972003698349), ('2', 0.9997028708457947), ('reply', 0.9996951222419739), ('phone', 0.9996748566627502), ('get', 0.9996717572212219), ('text', 0.9996702075004578), ('n', 0.9996155500411987), ('call', 0.9996111989021301)]
