In [1]:
# step 1: Imports and Setup
import os
import re
import string
import pandas as pd
import nltk

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy import sparse
import joblib

# Ensure NLTK resources are available
resources = ['punkt', 'stopwords', 'wordnet', 'averaged_perceptron_tagger', 'omw-1.4']
for res in resources:
    try:
        nltk.data.find(res)
    except LookupError:
        print(f"Downloading NLTK resource: {res} ...")
        nltk.download(res, quiet=True)


Downloading NLTK resource: punkt ...
Downloading NLTK resource: stopwords ...
Downloading NLTK resource: wordnet ...
Downloading NLTK resource: averaged_perceptron_tagger ...
Downloading NLTK resource: omw-1.4 ...


In [3]:
# step 2: Load dataset
# Change path if needed
df = pd.read_csv("C:/Users/Nagababu/Downloads/spamdata.csv.csv", encoding="latin-1")

# Rename columns if necessary
if 'v1' in df.columns and 'v2' in df.columns:
    df = df.rename(columns={'v1': 'label', 'v2': 'text'})

df = df[['label','text']].dropna().reset_index(drop=True)
df.head()


Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# step 3: Cleaning function
URL_PATTERN = re.compile(r"http\\S+|www\\.\\S+")
HTML_PATTERN = re.compile(r"<.*?>")

def clean_text(text):
    text = str(text).lower()
    text = URL_PATTERN.sub("", text)
    text = HTML_PATTERN.sub("", text)
    text = re.sub(r'\\S+@\\S+', '', text)  # remove emails
    text = re.sub(r'\\d+', '', text)       # remove numbers
    text = text.translate(str.maketrans("", "", string.punctuation))  # remove punctuation
    text = re.sub(r'\\s+', ' ', text).strip()
    return text

df['clean_text'] = df['text'].apply(clean_text)
df[['text','clean_text']].head()


Unnamed: 0,text,clean_text
0,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...
1,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...
3,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
4,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...


In [None]:
# step 4: Tokenization
df['tokens'] = df['clean_text'].apply(word_tokenize)
df[['clean_text','tokens']].head()


Unnamed: 0,clean_text,tokens
0,go until jurong point crazy available only in ...,"[go, until, jurong, point, crazy, available, o..."
1,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]"
2,free entry in 2 a wkly comp to win fa cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,u dun say so early hor u c already then say,"[u, dun, say, so, early, hor, u, c, already, t..."
4,nah i dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l..."


In [None]:
# Cell 5: Stopword Removal
stop_words = set(stopwords.words('english')) | set(['u','im','ur'])
df['no_stopwords'] = df['text'].apply(lambda x: [w for w in x if w not in stop_words])
df[['text','no_stopwords']].head()


Unnamed: 0,text,no_stopwords
0,"Go until jurong point, crazy.. Available only ...","[G, , n, l, , j, r, n, g, , p, n, ,, , c, ..."
1,Ok lar... Joking wif u oni...,"[O, k, , l, r, ., ., ., , J, k, n, g, , w, ..."
2,Free entry in 2 a wkly comp to win FA Cup fina...,"[F, r, e, e, , e, n, r, , n, , 2, , , w, ..."
3,U dun say so early hor... U c already then say...,"[U, , n, , , , e, r, l, , h, r, ., ., ., ..."
4,"Nah I don't think he goes to usf, he lives aro...","[N, h, , I, , n, ', , h, n, k, , h, e, , ..."


In [8]:

df['no_stopwords'] = df['text'].apply(lambda x: [w for w in x.split() if w not in stop_words])
df[['text', 'no_stopwords']].head()

Unnamed: 0,text,no_stopwords
0,"Go until jurong point, crazy.. Available only ...","[Go, jurong, point,, crazy.., Available, bugis..."
1,Ok lar... Joking wif u oni...,"[Ok, lar..., Joking, wif, oni...]"
2,Free entry in 2 a wkly comp to win FA Cup fina...,"[Free, entry, 2, wkly, comp, win, FA, Cup, fin..."
3,U dun say so early hor... U c already then say...,"[U, dun, say, early, hor..., U, c, already, sa..."
4,"Nah I don't think he goes to usf, he lives aro...","[Nah, I, think, goes, usf,, lives, around, tho..."


In [None]:
# step 6: Stemming
stemmer = PorterStemmer()
df['stemmed'] = df['no_stopwords'].apply(lambda x: [stemmer.stem(w) for w in x])
df[['text','stemmed']].head()


Unnamed: 0,text,stemmed
0,"Go until jurong point, crazy.. Available only ...","[go, jurong, point,, crazy.., avail, bugi, n, ..."
1,Ok lar... Joking wif u oni...,"[ok, lar..., joke, wif, oni...]"
2,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entri, 2, wkli, comp, win, fa, cup, fin..."
3,U dun say so early hor... U c already then say...,"[u, dun, say, earli, hor..., u, c, alreadi, sa..."
4,"Nah I don't think he goes to usf, he lives aro...","[nah, i, think, goe, usf,, live, around, though]"


In [None]:

stemmer = PorterStemmer()
df['stemmed'] = df['no_stopwords'].apply(lambda x: [stemmer.stem(w) for w in x])
df[['text','stemmed']].head()


Unnamed: 0,text,stemmed
0,"Go until jurong point, crazy.. Available only ...","[go, jurong, point,, crazy.., avail, bugi, n, ..."
1,Ok lar... Joking wif u oni...,"[ok, lar..., joke, wif, oni...]"
2,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entri, 2, wkli, comp, win, fa, cup, fin..."
3,U dun say so early hor... U c already then say...,"[u, dun, say, earli, hor..., u, c, alreadi, sa..."
4,"Nah I don't think he goes to usf, he lives aro...","[nah, i, think, goe, usf,, live, around, though]"


In [None]:
# step 7: Lemmatization
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {
        'J': wordnet.ADJ,
        'N': wordnet.NOUN,
        'V': wordnet.VERB,
        'R': wordnet.ADV
    }
    return tag_dict.get(tag, wordnet.NOUN)

df['lemmatized'] = df['no_stopwords'].apply(lambda x: [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in x])
df[['no_stopwords','lemmatized']].head()


Unnamed: 0,no_stopwords,lemmatized
0,"[Go, jurong, point,, crazy.., Available, bugis...","[Go, jurong, point,, crazy.., Available, bugis..."
1,"[Ok, lar..., Joking, wif, oni...]","[Ok, lar..., Joking, wif, oni...]"
2,"[Free, entry, 2, wkly, comp, win, FA, Cup, fin...","[Free, entry, 2, wkly, comp, win, FA, Cup, fin..."
3,"[U, dun, say, early, hor..., U, c, already, sa...","[U, dun, say, early, hor..., U, c, already, sa..."
4,"[Nah, I, think, goes, usf,, lives, around, tho...","[Nah, I, think, go, usf,, life, around, though]"


In [None]:
#  Lemmatization
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {
        'J': wordnet.ADJ,
        'N': wordnet.NOUN,
        'V': wordnet.VERB,
        'R': wordnet.ADV
    }
    return tag_dict.get(tag, wordnet.NOUN)

df['lemmatized'] = df['no_stopwords'].apply(lambda x: [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in x])
df['lemmatized'].head()


0    [Go, jurong, point,, crazy.., Available, bugis...
1                    [Ok, lar..., Joking, wif, oni...]
2    [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3    [U, dun, say, early, hor..., U, c, already, sa...
4      [Nah, I, think, go, usf,, life, around, though]
Name: lemmatized, dtype: object

In [None]:
# step 8: Preprocessed text column
df['preprocessed_text'] = df['lemmatized'].apply(lambda x: " ".join(x))
df[['text','preprocessed_text']].head()


Unnamed: 0,text,preprocessed_text
0,"Go until jurong point, crazy.. Available only ...","Go jurong point, crazy.. Available bugis n gre..."
1,Ok lar... Joking wif u oni...,Ok lar... Joking wif oni...
2,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry 2 wkly comp win FA Cup final tkts 2...
3,U dun say so early hor... U c already then say...,U dun say early hor... U c already say...
4,"Nah I don't think he goes to usf, he lives aro...","Nah I think go usf, life around though"


In [None]:
# step 9: Bag of Words (BoW)
from sklearn.feature_extraction.text import CountVectorizer

# Join lemmatized tokens back into text for vectorization
df['preprocessed_text'] = df['lemmatized'].apply(lambda x: " ".join(x))

vectorizer_bow = CountVectorizer()
X_bow = vectorizer_bow.fit_transform(df['preprocessed_text'])

print("Bag of Words shape:", X_bow.shape)
print("Sample feature names:", vectorizer_bow.get_feature_names_out()[:20])


Bag of Words shape: (5572, 8172)
Sample feature names: ['00' '000' '000pes' '008704050406' '0089' '0121' '01223585236'
 '01223585334' '0125698789' '02' '0207' '02072069400' '02073162414'
 '02085076972' '021' '03' '04' '0430' '05' '050703']


In [None]:
# step 10: TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer_tfidf = TfidfVectorizer()
X_tfidf = vectorizer_tfidf.fit_transform(df['preprocessed_text'])

print("TF-IDF shape:", X_tfidf.shape)
print("Sample feature names:", vectorizer_tfidf.get_feature_names_out()[:20])


TF-IDF shape: (5572, 8172)
Sample feature names: ['00' '000' '000pes' '008704050406' '0089' '0121' '01223585236'
 '01223585334' '0125698789' '02' '0207' '02072069400' '02073162414'
 '02085076972' '021' '03' '04' '0430' '05' '050703']


In [32]:

# Step 11: Feature Extraction using TF-IDF Vectorizer
# This is the missing piece of code.
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(df['text']) # Assuming 'text' is your text column

# Step 2: Prepare data for classification
X = X_tfidf
y = df['label']

# Step 3: Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 4: Train Naive Bayes classifier
nb = MultinomialNB()
nb.fit(X_train, y_train)

# Step 5: Predictions and Evaluation
y_pred = nb.predict(X_test)
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

✅ Accuracy: 0.9614349775784753

Classification Report:
               precision    recall  f1-score   support

         ham       0.96      1.00      0.98       966
        spam       1.00      0.71      0.83       149

    accuracy                           0.96      1115
   macro avg       0.98      0.86      0.90      1115
weighted avg       0.96      0.96      0.96      1115


Confusion Matrix:
 [[966   0]
 [ 43 106]]
