# Spam filter in Python: Naive Bayes from Scratch

[Spam Filter in Python: Naive Bayes from Scratch - KDnuggets](https://www.kdnuggets.com/2020/07/spam-filter-python-naive-bayes-scratch.html)


In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pyhere import here
from ucimlrepo import fetch_ucirepo, list_available_datasets
  
# fetch dataset 
# heart_disease = fetch_ucirepo(id=45) 

# https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip


In [None]:
# availalble_datasets_df = list_available_datasets(search="spam")
# availalble_datasets_df

In [16]:
# Load the tab-separated SMS Spam Collection file. It has no header row, so the
# two column names are handed straight to the reader.
sms_spam = pd.read_csv(
    here("data", "ch02", "spam", "SMSSpamCollection"),
    sep="\t",
    header=None,
    names=["label", "sms"],
)
sms_spam.head()

Unnamed: 0,label,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [15]:
# Summarise the frame: number of rows, column names, non-null counts and dtypes
sms_spam.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [None]:
# Class balance: number of messages per label plus each label's share in percent
(
    sms_spam['label']
    .value_counts()
    .to_frame()
    .assign(
        percentage=lambda counts: counts['count']
        .div(counts['count'].sum())
        .mul(100)
        .round(1)
    )
)

Unnamed: 0_level_0,count,percentage
label,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,4825,86.6
spam,747,13.4


The majority of messages are _ham_ (86.6%); the remaining 13.4% are _spam_.

In [24]:
# Training and test set
# Shuffle all rows (frac=1) with a fixed random_state so the split below is reproducible
randomized_sms_spam = sms_spam.sample(frac=1, random_state=1)


In [42]:
# Row position separating the first 80% of the shuffled rows from the rest
training_test_index = round(len(randomized_sms_spam) * 0.8)

# Slice by position, then renumber each part from 0
training_set = randomized_sms_spam.iloc[:training_test_index].reset_index(drop=True)
test_set = randomized_sms_spam.iloc[training_test_index:].reset_index(drop=True)

# Report the (rows, columns) of both parts, one per line
print(training_set.shape, test_set.shape, sep="\n")

(4458, 2)
(1114, 2)


In [43]:
# Preview the first rows of the training split
training_set.head()

Unnamed: 0,label,sms
0,ham,"Yep, by the pretty sculpture"
1,ham,"Yes, princess. Are you going to make me moan?"
2,ham,Welp apparently he retired
3,ham,Havent.
4,ham,I forgot 2 ask ü all smth.. There's a card on ...


In [44]:
# check percentages of spam and ham in the training and test sets
# (normalize=True returns class proportions instead of raw counts)
training_set['label'].value_counts(normalize=True)


label
ham     0.86541
spam    0.13459
Name: proportion, dtype: float64

In [37]:
# Same class-proportion check for the test split
test_set['label'].value_counts(normalize=True)

label
ham     0.868043
spam    0.131957
Name: proportion, dtype: float64

In [None]:
# Cleaning the texts
def clean_text(text):
    """Normalise one message for bag-of-words counting.

    The text is lower-cased and then passed through a fixed sequence of regex
    substitutions: non-word characters become spaces, runs of whitespace are
    collapsed, digits are deleted, and whitespace is collapsed once more.
    A leading or trailing single space may remain in the result.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The cleaned message.
    """
    import re

    # (pattern, replacement) pairs, applied in this exact order
    substitutions = (
        (r'\W', ' '),
        (r'\s+', ' '),
        (r'\d', ''),
        (r'\s+', ' '),
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned


In [45]:
# Apply the same normalisation to both splits so that test messages are
# tokenised the same way as the training messages.
# Assign back to the 'sms' column (not to the frame itself) so the labels are kept.
training_set['sms'] = training_set['sms'].apply(clean_text)
test_set['sms'] = test_set['sms'].apply(clean_text)

In [46]:
# Preview the training messages after clean_text has been applied
training_set.head()

Unnamed: 0,label,sms
0,ham,yep by the pretty sculpture
1,ham,yes princess are you going to make me moan
2,ham,welp apparently he retired
3,ham,havent
4,ham,i forgot ask ü all smth there s a card on da p...


In [48]:
# create the vocabulary: every distinct word seen in the cleaned training messages
# Sorted on purpose: the order of list(set(...)) can change from one Python
# process to the next (string hashing is randomized), and everything indexed by
# word position (ham_spam_array and its cached .npy file) relies on a stable order.
vocabulary = sorted({word for sms in training_set['sms'] for word in sms.split()})
len(vocabulary)

6986

In [51]:
# Show the first ten vocabulary entries.
# NOTE(review): the recorded output is in alphabetical order, which requires the
# vocabulary to have been sorted before this cell ran — confirm that it is.
vocabulary[:10]

['_', '____', 'a', 'aa', 'aah', 'aaniye', 'aathi', 'ab', 'abbey', 'abdomen']

In [57]:
# create a dictionary mapping of the vocabulary to numbers:
# each word maps to its position in `vocabulary`
vocabulary_mapping = {word: i for i, word in enumerate(vocabulary)}

# show the first ten (word, position) pairs
list(vocabulary_mapping.items())[:10]

[('_', 0),
 ('____', 1),
 ('a', 2),
 ('aa', 3),
 ('aah', 4),
 ('aaniye', 5),
 ('aathi', 6),
 ('ab', 7),
 ('abbey', 8),
 ('abdomen', 9)]

In [61]:
# Allocate a 2 x len(vocabulary) array of zeros to hold per-class word counts
# (the counting cell further down uses row 0 for ham and row 1 for spam)
ham_spam_array = np.zeros((2, len(vocabulary)))

ham_spam_array

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(2, 6986))

In [64]:
# Re-display the first training rows for reference
training_set.head()

Unnamed: 0,label,sms
0,ham,yep by the pretty sculpture
1,ham,yes princess are you going to make me moan
2,ham,welp apparently he retired
3,ham,havent
4,ham,i forgot ask ü all smth there s a card on da p...


In [79]:
# Sanity check: is the word 'yep' among the tokens of the first training message?
'yep' in training_set.iloc[0, :]['sms'].split()


True

In [None]:
# For every vocabulary word, count how many training messages of each class
# contain it (a message counts once per word, however often the word repeats in it).
# The first row of ham_spam_array holds the counts for the ham class,
# the second row the counts for the spam class; columns follow `vocabulary` order.
# The result is cached on disk and only reused when its shape still matches the
# current vocabulary. NOTE: a shape match cannot detect a reordered vocabulary,
# so delete the .npy file if the vocabulary order may have changed.
ham_spam_path = here('data', 'ch02', 'spam', 'ham_spam_array.npy')
expected_shape = (2, len(vocabulary))

ham_spam_array = None
if os.path.exists(ham_spam_path):
    cached_array = np.load(ham_spam_path)
    if cached_array.shape == expected_shape:
        ham_spam_array = cached_array

if ham_spam_array is None:
    # Start from zeros so re-running the cell never adds to earlier counts.
    ham_spam_array = np.zeros(expected_shape)
    word_position = {word: i for i, word in enumerate(vocabulary)}
    # One pass over the messages instead of scanning every message for every word.
    for label, sms in zip(training_set['label'], training_set['sms']):
        class_row = 0 if label == 'ham' else 1
        # set(): each message contributes at most 1 per word
        for word in set(sms.split()):
            position = word_position.get(word)
            if position is not None:
                ham_spam_array[class_row, position] += 1
    np.save(ham_spam_path, ham_spam_array)
    

In [82]:
# Inspect the counts array (row 0 = ham, row 1 = spam, per the counting cell above)
ham_spam_array

array([[  2.,   2., 725., ..., 105.,   1.,   1.],
       [  0.,   0., 237., ...,   0.,   0.,   0.]], shape=(2, 6986))

In [84]:
# Tokenise each cleaned message into a list of words (split on whitespace).
# The isinstance guard makes this cell safe to re-run: entries that are already
# token lists are passed through unchanged instead of being split again.
training_set['sms'] = training_set['sms'].apply(
    lambda sms: sms.split() if isinstance(sms, str) else sms
)
training_set.head()

Unnamed: 0,label,sms
0,ham,"[yep, by, the, pretty, sculpture]"
1,ham,"[yes, princess, are, you, going, to, make, me,..."
2,ham,"[welp, apparently, he, retired]"
3,ham,[havent]
4,ham,"[i, forgot, ask, ü, all, smth, there, s, a, ca..."


In [86]:
# One zero-filled list per vocabulary word, with one slot per training message.
# Each word gets its own list object so the counts can be updated independently.
word_counts_per_sms = dict(
    (unique_word, [0] * len(training_set['sms']))
    for unique_word in vocabulary
)

In [92]:
# Walk through every tokenised message and bump, for each token, the slot that
# belongs to this message's row position.
for row_number, tokens in enumerate(training_set['sms']):
    for token in tokens:
        counts_for_token = word_counts_per_sms[token]
        counts_for_token[row_number] += 1

In [93]:
# Persist the per-message count dictionary next to the dataset.
# np.save is handed a dict rather than an array here; the commented-out loader
# below shows that it has to be read back with allow_pickle=True and unwrapped
# with .item().
np.save(here('data', 'ch02', 'spam', 'word_counts_per_sms.npy'), word_counts_per_sms)
# word_counts_per_sms = np.load(here('data', 'ch02', 'spam', 'word_counts_per_sms.npy'), allow_pickle=True).item()

In [94]:
# Turn the dictionary into a frame: one column per vocabulary word,
# one row per training message
word_counts = pd.DataFrame(word_counts_per_sms)
word_counts.head()

Unnamed: 0,_,____,a,aa,aah,aaniye,aathi,ab,abbey,abdomen,...,zoe,zogtorius,zouk,zs,zyada,é,ú,ü,〨ud,鈥
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0


In [95]:
# Put the label/sms columns and the per-word count columns side by side
# NOTE(review): if the vocabulary contains the words 'label' or 'sms', the result
# has duplicate column names and training_set_clean['label'] / ['sms'] would
# return a frame instead of a single column — verify against the vocabulary.
training_set_clean = pd.concat([training_set, word_counts], axis=1)
training_set_clean.head()

Unnamed: 0,label,sms,_,____,a,aa,aah,aaniye,aathi,ab,...,zoe,zogtorius,zouk,zs,zyada,é,ú,ü,〨ud,鈥
0,ham,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, ask, ü, all, smth, there, s, a, ca...",0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0


In [97]:
# Isolating spam and ham messages first
# The masks are built from training_set, which has exactly one 'label' column and
# the same row index as training_set_clean. Selecting 'label' from the combined
# frame would be ambiguous if 'label' also occurs as a vocabulary word column.
spam_messages = training_set_clean[training_set['label'] == 'spam']
ham_messages = training_set_clean[training_set['label'] == 'ham']

In [101]:
# Class priors: the share of training messages that are spam / ham
p_spam = spam_messages.shape[0] / training_set_clean.shape[0]
p_ham = ham_messages.shape[0] / training_set_clean.shape[0]

In [103]:
# Token lists are read from training_set['sms']: the column is named 'sms'
# (lower case), and training_set has exactly one column of that name even if
# 'sms' also occurs as a word column in the combined frame.

# N_Spam: total number of words over all spam messages
n_words_per_spam_message = training_set.loc[training_set['label'] == 'spam', 'sms'].apply(len)
n_spam = n_words_per_spam_message.sum()

# N_Ham: total number of words over all ham messages
n_words_per_ham_message = training_set.loc[training_set['label'] == 'ham', 'sms'].apply(len)
n_ham = n_words_per_ham_message.sum()

# N_Vocabulary: number of distinct words
n_vocabulary = len(vocabulary)

# Laplace smoothing
alpha = 1

0.13458950201884254


The code below came from asking Gemini Pro 2.5 the following prompt:

> You are a data scientist and expert python programmer with knowledge in packages dealing with data analysis and machine learning. I have a data wrangling and reshaping problem that I want to find a more efficient way of dealing with. Here is an screenshot of a subset of data from a pandas data frame. It has two columns, label and sms. The sms column is a list of words. I want to convert the unique list of words into columns and for each row, column cell to have the count of the number of times that word appears in the entire dataset. I would prefer a functional programming approach in solving this.


In [None]:
import pandas as pd
from collections import Counter
import itertools

# --- Sample Data Setup (mimicking the screenshot) ---
# Six short, already tokenised messages; the last one repeats words from earlier
# rows so that some totals are greater than 1.
data = {
    'label': ['ham', 'ham', 'ham', 'ham', 'ham', 'ham'],
    'sms': [
        ['yep', 'by', 'the', 'pretty', 'sculpture'],
        ['yes', 'princess', 'are', 'you', 'going', 'to', 'make', 'me', '...'],
        ['welp', 'apparently', 'he', 'retired'],
        ['havent'],
        ['i', 'forgot', 'ask', 'ü', 'all', 'smth', 'there', 's', 'a', 'ca...'],
        ['yes', 'you', 'are', 'pretty']
    ]
}

df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)
print("-" * 30)

# --- Functional Approach ---

# 1. Flatten the list of lists into a single iterator of words.
#    The words are taken from the sample frame `df`, the same frame that supplies
#    the row count and index below, so this demo is self-contained.
all_words_iterator = itertools.chain.from_iterable(df['sms'])

# 2. Count the frequency of each word across the entire (sample) dataset.
total_word_counts = Counter(all_words_iterator)

# 3. Build the data for the new columns: each key is a word (column name) and
#    each value is that word's total count repeated once per row.
num_rows = len(df)
word_count_data = {
    word: [count] * num_rows
    for word, count in total_word_counts.items()
}

# 4. Create the DataFrame containing the word counts.
#    Reusing the original DataFrame's index keeps the rows aligned for concat.
word_counts_df = pd.DataFrame(word_count_data, index=df.index)

# 5. Combine the original 'label' column with the new word count columns.
final_df = pd.concat([df[['label']], word_counts_df], axis=1)


# --- Display Result ---
print("\nTransformed DataFrame:")
print(final_df)

# --- Display specific columns to verify counts (e.g., 'yes', 'are', 'pretty', 'you') ---
print("\nVerification (columns with count > 1):")
print(final_df[['label', 'yes', 'are', 'pretty', 'you']])

In [104]:
from collections import Counter
import itertools

# 1. Flatten the list of lists into a single iterator of words
#    itertools.chain.from_iterable is efficient for this.
#    The result is a one-shot iterator: once it has been consumed (for example by
#    Counter) this cell must be re-run to iterate over the words again.
all_words_iterator = itertools.chain.from_iterable(training_set['sms'])

True