In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('../sms+spam+collection/SMSSpamCollection', sep = '\t')
df.head()

Unnamed: 0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
0,ham,Ok lar... Joking wif u oni...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,U dun say so early hor... U c already then say...
3,ham,"Nah I don't think he goes to usf, he lives aro..."
4,spam,FreeMsg Hey there darling it's been 3 week's n...


In [3]:
df = pd.read_csv('../sms+spam+collection/SMSSpamCollection', sep = '\t', header = None, names = ['label','message'])
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
print("Total messages:", len(df))
print(df['label'].value_counts())

Total messages: 5572
label
ham     4825
spam     747
Name: count, dtype: int64


In [5]:
print("\nSpam example:")
print(df[df['label'] == 'spam']['message'].iloc[0])


Spam example:
Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's


In [6]:

print("\nHam example:")
print(df[df['label'] == 'ham']['message'].iloc[0])


Ham example:
Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...


In [7]:
print(df.isnull().sum())

label      0
message    0
dtype: int64


In [8]:
# !pip install nltk

In [9]:
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /home/unique/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/unique/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/unique/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/unique/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [10]:
lemmatizer = WordNetLemmatizer()

In [11]:
def preprocess(message):
    message = message.lower()
    tokens = word_tokenize(message)
    stop_words = set(stopwords.words('english'))

    clean_tokens = [
        lemmatizer.lemmatize(token) 
        for token in tokens 
        if token not in stop_words and token not in string.punctuation
    ]
    return ' '.join(clean_tokens)

In [12]:
uncleaned_test = 'WINNER!!! Claim your reward @ www.abc.com'


print('Text before cleaning:', uncleaned_test)
clean_test = preprocess(uncleaned_test)
print('Text after cleaning:', clean_test)

Text before cleaning: WINNER!!! Claim your reward @ www.abc.com


Text after cleaning: winner claim reward www.abc.com


In [13]:
uncleaned = df['message'].iloc[0]

In [14]:
print('Text before cleaning:', uncleaned)
clean = preprocess(uncleaned)
print('Text after cleaning:', clean)

Text before cleaning: Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
Text after cleaning: go jurong point crazy .. available bugis n great world la e buffet ... cine got amore wat ...


In [15]:
df['cleaned_message'] = df['message'].apply(preprocess)

In [16]:
df.tail()

Unnamed: 0,label,message,cleaned_message
5567,spam,This is the 2nd time we have tried 2 contact u...,2nd time tried 2 contact u. u £750 pound prize...
5568,ham,Will ü b going to esplanade fr home?,ü b going esplanade fr home
5569,ham,"Pity, * was in mood for that. So...any other s...",pity mood ... suggestion
5570,ham,The guy did some bitching but I acted like i'd...,guy bitching acted like 'd interested buying s...
5571,ham,Rofl. Its true to its name,rofl true name


In [17]:
sentences = ["win a prize now",
             "claim your prize",
             "hi how are you"]

In [18]:
tokens = [sentence.split() for sentence in sentences]
print("Tokens:", tokens)

Tokens: [['win', 'a', 'prize', 'now'], ['claim', 'your', 'prize'], ['hi', 'how', 'are', 'you']]


In [19]:
vocab = []

for row in tokens:
    for word in row:
        vocab.append(word)

vocab = sorted(set(vocab))
print(vocab)

['a', 'are', 'claim', 'hi', 'how', 'now', 'prize', 'win', 'you', 'your']


In [20]:
word2idx = {word: idx for idx, word in enumerate(vocab)}
print(word2idx)

{'a': 0, 'are': 1, 'claim': 2, 'hi': 3, 'how': 4, 'now': 5, 'prize': 6, 'win': 7, 'you': 8, 'your': 9}


In [21]:
bow_vector = []
for row in tokens:
    vector = [0] * len(vocab)
    for word in row:
        index = word2idx[word]
        vector[index] += 1
    
    bow_vector.append(vector)

print("Bag of Words Vector")
print(bow_vector)

Bag of Words Vector
[[1, 0, 0, 0, 0, 1, 1, 1, 0, 0], [0, 0, 1, 0, 0, 0, 1, 0, 0, 1], [0, 1, 0, 1, 1, 0, 0, 0, 1, 0]]


In [22]:
for i, vec in enumerate(bow_vector):
    print(f"Sentence {i+1}: {sentences[i]}")
    print("BoW Vector :", vec)
    print()

Sentence 1: win a prize now
BoW Vector : [1, 0, 0, 0, 0, 1, 1, 1, 0, 0]

Sentence 2: claim your prize
BoW Vector : [0, 0, 1, 0, 0, 0, 1, 0, 0, 1]

Sentence 3: hi how are you
BoW Vector : [0, 1, 0, 1, 1, 0, 0, 0, 1, 0]



In [23]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the vectorizer
cv = CountVectorizer()

In [24]:
X = cv.fit_transform(df['cleaned_message'])  # Bag of words matrix

In [25]:
print("Shape of matrix:", X.shape)

Shape of matrix: (5572, 8152)


In [26]:
print(cv.get_feature_names_out()[5000:5010]) # Some sample Vocab

['necessary' 'necessity' 'neck' 'necklace' 'ned' 'need' 'needa' 'needed'
 'needing' 'needle']


In [27]:
print(X.toarray()[11])

[0 1 0 ... 0 0 0]


In [28]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [29]:
# Convert 'ham' to 0 and 'spam' to 1
df['label_num'] = df['label'].map({'ham': 0, 'spam': 1})
df.head()

Unnamed: 0,label,message,cleaned_message,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy .. available bugis n gre...,0
1,ham,Ok lar... Joking wif u oni...,ok lar ... joking wif u oni ...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...,1
3,ham,U dun say so early hor... U c already then say...,u dun say early hor ... u c already say ...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah n't think go usf life around though,0


In [30]:
X_train, X_test, y_train, y_test = train_test_split(
    X, df['label_num'], test_size=0.2, random_state=42
)

In [31]:
model = MultinomialNB()
model.fit(X_train, y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [32]:
y_pred = model.predict(X_test)

In [33]:
from sklearn.metrics import accuracy_score

In [34]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 98.03%


In [35]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

Confusion Matrix:
 [[950  16]
 [  6 143]]


# Custom SMS Test

In [36]:
sample = ["you have won a free lottery claim now", "Hi! Where are you?"]

cleaned = preprocess(sample[0])

sample_vector = cv.transform([cleaned])

prediction = model.predict(sample_vector)

if prediction[0] == 1:
    print("Spam")
else:
    print("Ham")

Spam


In [37]:
cleaned = preprocess(sample[1])

sample_vector = cv.transform([cleaned])

prediction = model.predict(sample_vector)

if prediction[0] == 1:
    print("Spam")
else:
    print("Ham")

Ham
