In [11]:
import string
from sklearn.model_selection import train_test_split, cross_val_score, KFold
import numpy as np 


## Load sms


In [12]:
spam_list = list()
ham_list = list()
def read_stopwords():
    """
    Read all the stopwords from the file "stopwords.txt".
    Returns the collection of stopwords.
    """
    with open("common-english-words.txt") as f:
        stopwords = [s.strip() for s in f.readlines()]

    return set(stopwords)

def convert_and_tokenze_string(s, tokens, stopwords):
    """
    Strips a word from all puncuation, whitespace. Then converts
    the word into all lower case.
    """
    #remove new lines and spaces behind and infront of the string
    for word in s.split():
        word = word.strip(string.punctuation \
            + string.whitespace  \
            + string.digits \
            ).lower()
        if len(word)>0 and word not in stopwords:
            if word in tokens:
                tokens[word]+=1
            else:
                tokens[word]=1
    return tokens

class spam_or_ham:
    def __init__(self, spam_list, ham_list):
        self.stopwords = read_stopwords()
        
        self.spam_bow = dict()
        self.ham_bow = dict()
        for subject in spam_list:
            self.spam_bow = convert_and_tokenze_string(subject, self.spam_bow, self.stopwords)
            
        for subject in ham_list:
            self.ham_bow = convert_and_tokenze_string(subject, self.ham_bow, self.stopwords)
  
        self.total_words_in_spam = np.sum(np.array(list(self.spam_bow.values())))
        self.total_words_in_ham = np.sum(np.array(list(self.ham_bow.values())))
        total_words = self.total_words_in_spam + self.total_words_in_ham
        self.probability_spam = self.total_words_in_spam / total_words
        self.probability_ham = self.total_words_in_ham / total_words
   
    def calculate_likelihood(self, bow, bow_size, sms):
        tokens = convert_and_tokenze_string(sms, dict(), self.stopwords)
        likelihood = 1
        for s in tokens:
            likelihood *= (bow.get(s, 0)/bow_size)**tokens[s]

        return likelihood

    def check_sms(self, sms):
        spam_likelihood = self.calculate_likelihood(self.spam_bow, self.total_words_in_spam, sms)
        ham_likelihood = self.calculate_likelihood(self.ham_bow, self.total_words_in_ham, sms)
        if spam_likelihood>ham_likelihood:
            return 'spam'
        else:
            return 'ham'


            

### Number of records

In [13]:
with open('SmsCollection.csv', encoding='utf-8',  errors='ignore') as email_file:
    for line in email_file:
        if line.startswith("spam;"):
            subject = line.lstrip("spam;")
            spam_list.append(subject)
        elif line.startswith("ham;"):
            subject = line.lstrip("ham;")
            ham_list.append(subject)

number_of_spam = len(spam_list)
number_of_ham = len(ham_list)
number_total = len(ham_list)+len(spam_list)
print(f'number of spam {number_of_spam}, number of ham {number_of_ham}, total {number_total}')

number of spam 747, number of ham 4827, total 5574


### Classifier

In [14]:
spam_train, spam_test = train_test_split(spam_list, random_state = 10, test_size=0.2)
ham_train, ham_test = train_test_split(ham_list, random_state = 10, test_size=0.2)

SH = spam_or_ham(spam_train, ham_train)

print(f'spam train {len(spam_train)}  spam test {len(spam_test)}  ')
print(f'ham train {len(ham_train)}  ham test {len(ham_test)}  ')

spam train 597  spam test 150  
ham train 598  ham test 150  


### check performance

In [15]:
out_spam=0

for test_spam_str in spam_test:
    out_spam += int('spam' == SH.check_sms(test_spam_str))
    
out_ham = 0
for test_ham_str in ham_test:
    out_ham += int('ham' == SH.check_sms(test_ham_str))

print(out_spam, len(spam_test), out_spam/len(spam_test))
print(out_ham, len(ham_test), out_ham/len(ham_test))

print((out_spam+out_ham)/(len(spam_test) + len(ham_test)))

79 150 0.5266666666666666
149 150 0.9933333333333333
0.76
