In [1]:
import string
from sklearn.model_selection import train_test_split, cross_val_score, KFold
import numpy as np 


## Load sms


In [2]:
spam_list = list()
ham_list = list()
def read_stopwords():
    """
    Read all the stopwords from the file "stopwords.txt".
    Returns the collection of stopwords.
    """
    with open("stopwords.txt") as f:
        stopwords = [s.strip() for s in f.readlines()]

    return set(stopwords)

def convert_and_tokenze_string(s, tokens, stopwords):
    """
    Strips a word from all puncuation, whitespace. Then converts
    the word into all lower case.
    """
    #remove new lines and spaces behind and infront of the string
    for word in s.split():
        word = word.strip(string.punctuation + \
            string.whitespace + \
            "”" + "“" + "‘" + "’" + "―" + "—" + \
            string.digits).lower()
        if len(word)>0 and word not in stopwords:
            if word in tokens:
                tokens[word]+=1
            else:
                tokens[word]=1
    return tokens



with open('SmsCollection.csv', encoding='utf-8',  errors='ignore') as email_file:
    for line in email_file:
        if line.startswith("spam;"):
            subject = line.lstrip("spam;")
            spam_list.append(subject)
        elif line.startswith("ham;"):
            subject = line.lstrip("ham;")
            ham_list.append(subject)



### Number of records

In [3]:
stopwords = read_stopwords()

number_of_spam = len(spam_list)
number_of_ham = len(ham_list)
number_total = len(ham_list)+len(spam_list)
print(f'number of spam {number_of_spam}, number of ham {number_of_ham}, total {number_total}')

number of spam 747, number of ham 4827, total 5574


### Building bag of words for train

In [4]:
spam_train, spam_test = train_test_split(spam_list, test_size=0.2)
ham_train, ham_test = train_test_split(ham_list, test_size=0.03)

spam_train_bow = dict()
ham_train_bow = dict()

for subject in spam_train:
    spam_train_bow = convert_and_tokenze_string(subject, spam_train_bow, stopwords)
for subject in ham_train:
    ham_train_bow = convert_and_tokenze_string(subject, ham_train_bow, stopwords)


## Classifier

In [5]:
def calculate_likelihood(vocabulary, subject):
    total_words_in_vocabulary = np.sum(np.array(list(vocabulary.values())))
    likelihood = len(vocabulary) # chance of ham/spam
    tokens = convert_and_tokenze_string(subject, dict(), stopwords)

    for s in tokens:
        likelihood *= (vocabulary.get(s, 0)/total_words_in_vocabulary)**tokens[s]

    return likelihood

def bag_of_words_classifier(spam_bow, ham_bow, string):
    spam_likelihood = calculate_likelihood(spam_bow, string)
    ham_likelihood = calculate_likelihood(ham_bow, string)
    if spam_likelihood>ham_likelihood:
        return 'spam'
    else:
        return 'ham'

bag_of_words_classifier(spam_train_bow, ham_train_bow, spam_test[2])


'spam'

### check performance

In [6]:
out_spam=0
for test_spam_str in spam_test:
    out_spam+= int('spam' == bag_of_words_classifier(spam_train_bow, ham_train_bow, test_spam_str))
print(out_spam, len(spam_test))

out_ham = 0
for test_ham_str in ham_test:
    #print(bag_of_words_classifier(spam_train_bow, ham_train_bow, test_ham_str))
    out_ham+= int('ham' == bag_of_words_classifier(spam_train_bow, ham_train_bow, test_ham_str))

print(out_ham, len(ham_test))
print((out_spam+out_ham)/(len(spam_test) + len(ham_test)))

75 150
143 145
0.7389830508474576
