# Load Dataset

In [30]:
import pandas as pd

df = pd.read_csv(
    "SMSSpamCollection",
    sep="\t",
    header=None,
    names=["label", "text"]
)

print(df.head())
print(df.shape)

  label                                               text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
(5572, 2)


# Clean Data

In [31]:
# remove duplicates
df = df.drop_duplicates()

# give numeric labels
df["label"] = df["label"].map({"ham": 0, "spam": 1})
print(df["label"].value_counts())

label
0    4516
1     653
Name: count, dtype: int64


In [32]:
import re

def clean_text(text):
    text = text.lower()                      # lowercase
    text = re.sub(r'\d+', ' ', text)         # remove numbers
    text = re.sub(r'[^\w\s]', ' ', text)     # remove punctuation
    text = re.sub(r'\s+', ' ', text)         # remove extra spaces
    return text.strip()

df["clean_text"] = df["text"].apply(clean_text)
print(df[["text", "clean_text"]].head())

                                                text  \
0  Go until jurong point, crazy.. Available only ...   
1                      Ok lar... Joking wif u oni...   
2  Free entry in 2 a wkly comp to win FA Cup fina...   
3  U dun say so early hor... U c already then say...   
4  Nah I don't think he goes to usf, he lives aro...   

                                          clean_text  
0  go until jurong point crazy available only in ...  
1                            ok lar joking wif u oni  
2  free entry in a wkly comp to win fa cup final ...  
3        u dun say so early hor u c already then say  
4  nah i don t think he goes to usf he lives arou...  


In [33]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# removing stopwords improves Naive Bayes performance dramatically
def remove_stopwords(text):
    words = text.split()
    words = [word for word in words if word not in ENGLISH_STOP_WORDS]
    return " ".join(words)

df["clean_text"] = df["clean_text"].apply(remove_stopwords)
df_final = df[["clean_text", "label"]]

# Split data (train/test)

In [34]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# split data
X_train, X_test, y_train, y_test = train_test_split(
    df_final["clean_text"],
    df_final["label"],
    test_size=0.2,
    random_state=42,
    stratify=df_final["label"]
)

# convert text to numerical features (for bag of words)
vectorizer = CountVectorizer()

X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

# Implement Naive Bayes and Bag of Words