# Naive Bayes Classifier

In [9]:
# PSL imports,
import collections
import re

# Third-party imports,
import pandas as pd
import numpy as np

# Loading Dataset

Dataset taken from: https://www.kaggle.com/code/zabihullah18/email-spam-detection

In [10]:
# Reading data, 
df = pd.read_csv("spam.csv", encoding="latin1")

# Dropping columns not needed,
df = df.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)

# Chaning columns names,
df = df.rename(columns={"v2":"sms_text", "v1":"spam"})

# Remapping,
df["spam"] = df["spam"].map({"ham": 0, "spam": 1})

# Switching columns,
df = df.iloc[:, [1, 0]]

# Display dataframe,
df.head(n=10)

Unnamed: 0,sms_text,spam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0
5,FreeMsg Hey there darling it's been 3 week's n...,1
6,Even my brother is not like to speak with me. ...,0
7,As per your request 'Melle Melle (Oru Minnamin...,0
8,WINNER!! As a valued network customer you have...,1
9,Had your mobile 11 months or more? U R entitle...,1


# Word Analysis

We want to find the most common words that are in spam SMS texts. First, we define the function for counting the frequency of words,

In [11]:
def count_words(string_obj):
    """Counts the normalised frequency of words in a string and returns them as a dictionary."""

    # Cleaning text,
    string_obj = re.sub(r"[^\w\s']", "", string_obj.lower().strip())

    # Creating list of words,
    word_list = string_obj.split()
    word_dict = {}

    # Counting total words,
    n_words = len(word_list)

    for word in word_list:
        if word in word_dict:
            # Incrementing (normalised) count if word exists,
            word_dict[word] += 1/n_words
        else:
            # Initialising (normalised) count if word does not exist in dictionary,
            word_dict[word] = 1/n_words

    # Sorting dictionary in descending order,
    word_dict = dict(sorted(word_dict.items(), key=lambda item: item[1], reverse=True))

    return word_dict

We want to consider words which are much more likely to show up in spam SMS texts compared to regular texts, but are also not rare.

In [15]:
# Construcing spam and ham dataframes,
df_spam = df[df["spam"] == 1]
df_ham = df[df["spam"] == 0]

# Extracting all spam and ham texts into a string,
spam_string = ""
ham_string = ""

for text in df_spam["sms_text"]:
    spam_string += text

for text in df_ham["sms_text"]:
    ham_string += text

# Counting words,
spam_words = count_words(spam_string)
ham_words = count_words(ham_string)

# PARAMETERS,
THRESHOLD = 1.5

"""Finding the top words which have the biggest difference in probability of appearing in spam texts compared
to those of regular ones."""

spammy_words = []
for spam_word in spam_words:
    if spam_words[spam_word] > THRESHOLD * ham_words.get(spam_word, 0):
        prob_diff = spam_words[spam_word] - ham_words.get(spam_word, 0)
        spammy_words.append([spam_word, prob_diff])

spammy_words = sorted(spammy_words, key=lambda x: x[1], reverse=True)
print(spammy_words[:50])

[['call', 0.016528331946966957], ['to', 0.016033022403738204], ['free', 0.009975193739621474], ['txt', 0.008634706474211041], ['your', 0.00855729722424854], ['or', 0.007631647755458866], ['now', 0.00702437964389313], ['mobile', 0.006944628926538471], ['claim', 0.006633499170812593], ['text', 0.0057966940252954825], ['stop', 0.005522304840908529], ['2', 0.005463448950449189], ['reply', 0.00534462182740462], ['from', 0.005148385454397471], ['prize', 0.005093579720445386], ['4', 0.004448466961497021], ['won', 0.004323619995261782], ['our', 0.0038463587155233322], ['ur', 0.0038360389843392468], ['nokia', 0.0038180958709486184], ['cash', 0.0035294532150517855], ['contact', 0.003126533055589858], ['guaranteed', 0.002961383558398482], ['service', 0.002957205719627638], ['new', 0.0029103895155496795], ['win', 0.0028620974547194176], ['tone', 0.0027837005448945735], ['customer', 0.002451681595314475], ['per', 0.0023999496240895203], ['chat', 0.002329048414207692], ['awarded', 0.0022506515043828

The top 50 words which appear in SMS spam texts are considered. Now we construct a vector for each text in the dataset. Each vector has a feature length of the number of words.  

In [16]:
def create_feature_vector(string_obj, features):
    """Creates a binary feature vector given a SMS text and a list of feature names."""

    feature_vector = []
    for feature in features:

        # Spammy word not found,
        if string_obj.count(feature) == 0:
            feature_vector.append(0)
        
        # Spammy word found at least once,
        else:
            feature_vector.append(1)

    # Returns a list,
    return feature_vector

Creating our new dataset,

In [17]:
FEATURES = ['call', 'to', 'free', 'txt', 'your', 'or', 'now', 'mobile', 'claim', 'text', 'stop', '2',
 'reply', 'from', 'prize', '4', 'won', 'our', 'ur', 'nokia', 'cash', 'contact', 'guaranteed',
 'service', 'new', 'win', 'tone', 'customer', 'per', 'chat', 'awarded', 'with', 'draw', 'å1000',
 'week', 'who', 'latest', 'line', 'send', 'receive', '18', 'å2000', 'mins', 'landline', 'shows',
 'camera', '16', 'box', 'only', 'holiday']

# Creating feature vectors,
feature_vectors = []
for text in df["sms_text"]:
    feature_vector = create_feature_vector(text, FEATURES)
    feature_vectors.append(feature_vector)

feature_vectors = np.array(feature_vectors)

# Creating target vector,
targets = df["spam"].to_numpy()

# Saving,
np.savez("dataset.npz", features=feature_vectors, targets=targets, feature_labels=FEATURES)