In [1]:
import pandas as pd
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
import numpy as np
import re

In [2]:
def bag_of_words(data, operation="train"):
    data = data.sample(frac=1, random_state=43)
    train_last_index = int(data.shape[0] * 0.8)

    vocab = set()
    for row in data['SMS']:
        vocab.update(row.split())
    col_index = list(vocab)
    col_index.sort()
    
    if operation == 'train':
        data = data.iloc[0:train_last_index]
    elif operation == 'test':
        data = data.iloc[train_last_index:]
    else:
        raise ValueError
        
    data.reset_index(drop=True, inplace=True)

    bag = pd.DataFrame(0, index=range(len(data)), columns=col_index)
    for i, row in data['SMS'].iteritems():
        for word in row.split():
            if word in vocab:
                bag.loc[i, word] += 1

    return pd.concat([data, bag], axis=1)

In [3]:
# loading data
df = pd.read_csv("data/spam.csv", encoding='iso-8859-1', header=0, usecols=[0, 1], dtype=str)
df.columns = ['Target', 'SMS']
en_sm_model = spacy.load("en_core_web_sm")
df['Target'] = df['Target'].str.strip()

In [4]:
# preprocessing
for i, row in df.iterrows():
        text = row['SMS'].lower()
        doc = en_sm_model(text)
        new_text = []
        for token in doc:
            if re.search(r"[0-9]+", token.lemma_):
                new_text.append("aanumbers")
            else:
                word = "".join([ch for ch in token.lemma_.lower() if ch not in punctuation])
                if len(word) > 1 and word not in STOP_WORDS:
                    new_text.append(word)

        df['SMS'][i] = " ".join(new_text)

In [5]:
# training
train_bag_of_words = bag_of_words(df)
test_bag_of_words = bag_of_words(df, 'test')

In [6]:
# counting probabilities
laplace_smoothing = 1

spam_word_count = train_bag_of_words.loc[train_bag_of_words['Target'] == 'spam', 'aa':].aggregate(np.sum, axis=0)
vocab_length = train_bag_of_words.iloc[:, 2:].shape[1]
spam_length = train_bag_of_words.loc[train_bag_of_words['Target'] == 'spam'].SMS.str.split().apply(len).sum()
spam_probability = (spam_word_count + laplace_smoothing) / (laplace_smoothing * vocab_length + spam_length)

ham_word_count = train_bag_of_words.loc[train_bag_of_words['Target'] == 'ham', 'aa':].aggregate(np.sum, axis=0)
ham_length = train_bag_of_words.loc[train_bag_of_words['Target'] == 'ham'].SMS.str.split().apply(len).sum()
ham_probability = (ham_word_count + laplace_smoothing) / (laplace_smoothing * vocab_length + ham_length)

In [7]:
probabilities = pd.DataFrame(data={'Spam Probability':spam_probability, 'Ham Probability':ham_probability}, 
                             index=spam_probability.index)

print(probabilities.iloc[:200])



             Spam Probability  Ham Probability
aa                   0.000062         0.000062
aah                  0.000062         0.000123
aaniye               0.000062         0.000031
aanumbers            0.148248         0.030110
aaooooright          0.000062         0.000031
...                       ...              ...
amp                  0.000062         0.001970
amplikater           0.000062         0.000062
amrca                0.000062         0.000062
amrita               0.000062         0.000031
ams                  0.000062         0.000031

[200 rows x 2 columns]


In [8]:
P_SPAM = spam_length / (ham_length + spam_length)
P_HAM = ham_length / (ham_length + spam_length)

print(f'{P_SPAM=}, {P_HAM=}')



P_SPAM=0.2663527542973847, P_HAM=0.7336472457026153


In [9]:
def spam_classifier(text, spam_prob, prob_table):
        p_spam = spam_prob
        p_ham = 1 - spam_prob
    #     print(text)
        for _word in text.split(' '):
            try:
                p_spam *= prob_table.loc[_word, 'Spam Probability']
                p_ham *= prob_table.loc[_word, 'Ham Probability']
            except KeyError:
                pass
        if p_ham > p_spam:
            return 'ham'
        elif p_spam > p_ham:
            return 'spam'
        return 'unknown'

In [10]:
sms_prediction = test_bag_of_words.SMS.map(lambda x: spam_classifier(x, P_SPAM, probabilities))


In [11]:
result = pd.DataFrame({'Predicted': sms_prediction, 'Actual': test_bag_of_words['Target']}, index=sms_prediction.index)
result.head(200)

Unnamed: 0,Predicted,Actual
0,ham,ham
1,ham,ham
2,spam,spam
3,ham,ham
4,ham,ham
...,...,...
195,spam,spam
196,ham,ham
197,ham,ham
198,ham,ham


In [12]:
TP = result.loc[(result['Predicted'] == 'ham') & (result['Actual'] == 'ham')].shape[0]
FP = result.loc[(result['Predicted'] == 'ham') & (result['Actual'] == 'spam')].shape[0]
FN = result.loc[(result['Predicted'] == 'spam') & (result['Actual'] == 'ham')].shape[0]
TN = result.loc[(result['Predicted'] == 'spam') & (result['Actual'] == 'spam')].shape[0]

print(f'{TN=}, {FN=}, {FP=}, {TP=}')

TN=143, FN=38, FP=5, TP=929


In [13]:
Accuracy = (TP + TN) / (TP + TN + FP + FN)
Recall = TP / (TP + FN)
Precision = TP / (TP + FP)
F1 = 2 * Precision * Recall / (Precision + Recall)
metrics = {'Accuracy': Accuracy, 'Recall': Recall, 'Precision': Precision, 'F1': F1}



In [14]:
from sklearn.naive_bayes import MultinomialNB

In [15]:
test_bag_of_words['TargetBinary'] = test_bag_of_words['Target'].apply(lambda x: 0 if x == 'ham' else 1)
train_bag_of_words['TargetBinary'] = train_bag_of_words['Target'].apply(lambda x: 0 if x == 'ham' else 1)

In [16]:
classifier = MultinomialNB()
classifier.fit(X=train_bag_of_words.loc[:, 'aa':], y=train_bag_of_words['TargetBinary'])
prediction = classifier.predict(X=test_bag_of_words.loc[:, 'aa':])

In [17]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

classifier_metrics = {'Accuracy': accuracy_score(test_bag_of_words['TargetBinary'], prediction),
                      'Recall': recall_score(test_bag_of_words['TargetBinary'], prediction),
                      'Precision': precision_score(test_bag_of_words['TargetBinary'], prediction),
                      'F1': f1_score(test_bag_of_words['TargetBinary'], prediction)}

print(metrics)
print(classifier_metrics)

{'Accuracy': 0.9614349775784753, 'Recall': 0.9607032057911065, 'Precision': 0.9946466809421841, 'F1': 0.9773803261441346}
{'Accuracy': 0.9856502242152466, 'Recall': 1.0, 'Precision': 0.9024390243902439, 'F1': 0.9487179487179488}
