In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

## 1. Load the data

In [2]:
# The CSV is looked up in ../data relative to the directory the notebook runs from.
batch_path = os.getcwd()
path = os.path.join(batch_path, '..', 'data', 'spam.csv')

# Comma-separated file, decoded as latin-1.
df = pd.read_csv(path, encoding="latin-1", sep=',')
df.head()
 

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
# Keep only the two informative columns and give them readable names.
# Selecting by name also discards the empty 'Unnamed: N' columns.
# rename() + label-based selection (instead of `df[['v1', 'v2']]` followed by
# `df.columns = [...]`) makes this cell safe to re-run: once the columns are
# already called 'label'/'target', the rename is a no-op instead of a KeyError.
df = df.rename(columns={'v1': 'label', 'v2': 'target'}).loc[:, ['label', 'target']]
df.head()


Unnamed: 0,label,target
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Number of missing values per column.
df.isna().sum()

label     0
target    0
dtype: int64

## 2. Clean the data

In [5]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


class TextPreprocessor:
    """Normalise a raw text message into a space-separated string of tokens.

    ``preprocess`` applies, in order: lower-casing, removal of ASCII
    punctuation, NLTK word tokenization, verb lemmatization and English
    stop-word removal.
    """

    # NLTK resources fetched on construction. 'punkt_tab' is what
    # ``word_tokenize`` loads on recent NLTK releases; 'punkt' covers older ones.
    _NLTK_RESOURCES = ('punkt', 'punkt_tab', 'stopwords', 'wordnet')

    # Translation table that deletes every character in ``string.punctuation``.
    # Kept on the class so it is built once and is not part of the pickled
    # instance state.
    _PUNCT_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self):
        """Fetch the required NLTK data and build the stop-word set and lemmatizer."""
        for resource in self._NLTK_RESOURCES:
            # quiet=True keeps the download log out of the notebook output.
            nltk.download(resource, quiet=True)

        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def preprocess(self, text):
        """Return the cleaned form of ``text``.

        Args:
            text: The message to clean. Non-string values are converted with
                ``str()``; ``None`` and float NaN are treated as empty input.

        Returns:
            str: The remaining tokens joined by single spaces (``""`` when
            nothing is left or the input is missing).
        """
        # Missing values would otherwise become the literal tokens "none"/"nan".
        if text is None or (isinstance(text, float) and text != text):
            return ""

        text = str(text).lower()
        text = text.translate(self._PUNCT_TABLE)

        tokens = word_tokenize(text)

        # Verb lemmatization (win, won, winning -> win)
        tokens = [self.lemmatizer.lemmatize(word, pos='v') for word in tokens]

        # NOTE(review): punctuation is stripped before this filter, so stop
        # words that contain an apostrophe (e.g. "don't") presumably never
        # match their stripped form ("dont") - confirm whether that is intended.
        tokens = [word for word in tokens if word not in self.stop_words]

        return " ".join(tokens)


## 3. Split the data and vectorize the text

In [6]:
# Show the first rows of the frame again before building features from it.
df.head()

Unnamed: 0,label,target
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# One preprocessor instance is reused for training data, ad-hoc samples and the saved artefact.
text_convert = TextPreprocessor()

# Features: the cleaned message text (the 'target' column holds the message body).
X = df['target'].apply(text_convert.preprocess)

# Labels: ham -> 0, spam -> 1.
y = df['label'].map({'ham': 0, 'spam': 1})

# Series.map turns any label missing from the dict into NaN; fail here with a
# clear message instead of later inside the resampler or the classifier.
assert y.notna().all(), "unexpected value in the 'label' column"

len(X), len(y)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sumit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sumit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sumit\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


(5572, 5572)

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression

# Train/test split: 80/20 with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# TF-IDF vectorization: the vocabulary is fitted on the training split only
# and then reused, unchanged, for the test split.
tfidf = TfidfVectorizer()

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Oversample the training split only. SMOTE accepts scipy sparse matrices, so
# the TF-IDF matrix is passed as-is; calling .toarray() would materialise a
# dense (n_samples x vocabulary) array for no benefit.
smote = SMOTE(random_state=42)

X_train_smote, y_train_smote = smote.fit_resample(X_train_tfidf, y_train)

# Train the classifier on the resampled (still sparse) training data.
model = LogisticRegression()
model.fit(X_train_smote, y_train_smote)

# Predictions for the untouched test split.
y_pred = model.predict(X_test_tfidf)


In [9]:
# Mean accuracy of the classifier on the held-out test split.
model.score(X_test_tfidf, y_test)

0.9766816143497757

In [10]:
# Side-by-side look at the first 20 test predictions and their true labels.
head_pred = model.predict(X_test_tfidf[:20].toarray())
head_true = y_test.iloc[:20]
head_pred, head_true

(array([0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0]),
 3245    0
 944     0
 1044    1
 2484    0
 812     1
 2973    0
 2991    0
 2942    0
 230     0
 1181    0
 1912    0
 1992    1
 5435    0
 4805    0
 401     0
 1859    0
 1344    0
 2952    1
 501     0
 3337    0
 Name: label, dtype: int64)

## 4. Evaluation


In [11]:
from sklearn.metrics import confusion_matrix, f1_score

In [12]:
# Confusion matrix (rows = true class, columns = predicted class) and the
# F1 score on the test split.
y_pred = model.predict(X_test_tfidf.toarray())
test_confusion = confusion_matrix(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred)
test_confusion, test_f1

(array([[954,  11],
        [ 15, 135]]),
 0.9121621621621622)

In [13]:
import pandas as pd

# Four hand-written messages (two spam-like, two ham-like) for a quick smoke test.
# NOTE: this rebinds `df`, replacing the dataset frame loaded at the top of the
# notebook; earlier cells that read df['target'] will fail if re-run after this one.
sample_texts = [
    "Congratulations! You won a free ticket!",
    "Hey, are we meeting today?",
    "Claim your prize now!",
    "Let's have lunch tomorrow.",
]
sample_labels = ["spam", "ham", "spam", "ham"]

df = pd.DataFrame({"TEXT": sample_texts, "LABEL": sample_labels})

# Run the samples through the same cleaning step used for the training data.
df['TEXT'] = df['TEXT'].apply(text_convert.preprocess)


In [14]:
# Cleaned form of the four demo messages.
df['TEXT']

0    congratulations win free ticket
1                     hey meet today
2                        claim prize
3                 let lunch tomorrow
Name: TEXT, dtype: object

In [15]:
# Vectorize the demo messages with the already-fitted TF-IDF vocabulary (transform only, no refit).
test_tfd = tfidf.transform(df['TEXT'])

# Hard class predictions for the demo messages.
Y_preds = model.predict(test_tfd.toarray())

In [16]:
# Predicted classes for the demo messages (1 = spam, 0 = ham, per the label mapping used for training).
Y_preds

array([1, 0, 1, 0])

In [17]:
# Class probabilities for the demo messages; column 1 is the probability of
# class 1, which is 'spam' under the label mapping used for training.
pred_prob = model.predict_proba(test_tfd.toarray())
spam_probability = pred_prob[:, 1]

# Flag a message as spam when its spam probability exceeds 0.5.
spam_pred = spam_probability > 0.5

In [18]:
# Boolean spam flag per demo message (True where the spam probability is above 0.5).
spam_pred

array([ True, False,  True, False])

## 5. Save the model

In [19]:
import joblib
import os


# All three artefacts are written one level above the current working directory.
batch_model = batch_tefd = batch_text = os.getcwd()
path_model, path_vector, path_text = (
    os.path.join(batch_model, '..', filename)
    for filename in ('model.pkl', 'vector.pkl', 'text_convert.pkl')
)

# Persist the classifier, the fitted TF-IDF vectorizer and the text preprocessor.
# NOTE(review): text_convert is an instance of a class defined inside this
# notebook; loading its pickle elsewhere presumably needs the same class to be
# importable under the same module name - confirm in the consuming code.
joblib.dump(model, path_model)
joblib.dump(tfidf, path_vector)
joblib.dump(text_convert, path_text)


['c:\\Users\\sumit\\OneDrive\\Desktop\\project work\\Spam-SMS-Classifier-using-NLP\\models\\..\\text_convert.pkl']