### Load data

In [1]:
!curl http://www.dt.fee.unicamp.br/~tiago/smsspamcollection/smsspamcollection.zip --output spam.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  205k  100  205k    0     0  76780      0  0:00:02  0:00:02 --:--:-- 76776


In [2]:
!unzip spam.zip

Archive:  spam.zip
  inflating: readme                  
  inflating: SMSSpamCollection.txt   


In [3]:
!head SMSSpamCollection.txt

ham	Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
ham	Ok lar... Joking wif u oni...
spam	Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
ham	U dun say so early hor... U c already then say...
ham	Nah I don't think he goes to usf, he lives around here though
spam	FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv
ham	Even my brother is not like to speak with me. They treat me like aids patent.
ham	As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune
spam	WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only

In [4]:
!cat SMSSpamCollection.txt | wc -l

    5574


### Parse data

In [18]:
from collections import namedtuple

CorpusItem = namedtuple('CorpusItem', 'text spam')

def parse_corpus(filename):
    items = []
    with open(filename) as corpus:
        for line in corpus:
            parts = line.split('\t')
            spam = False
            if parts[0] == 'spam':
                spam = True
            items.append(CorpusItem(text = parts[1], spam = spam))
    return items

In [19]:
corpus = parse_corpus('SMSSpamCollection.txt')

In [20]:
corpus[:10]

[CorpusItem(text='Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...\n', spam=False),
 CorpusItem(text='Ok lar... Joking wif u oni...\n', spam=False),
 CorpusItem(text="Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's\n", spam=True),
 CorpusItem(text='U dun say so early hor... U c already then say...\n', spam=False),
 CorpusItem(text="Nah I don't think he goes to usf, he lives around here though\n", spam=False),
 CorpusItem(text="FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv\n", spam=True),
 CorpusItem(text='Even my brother is not like to speak with me. They treat me like aids patent.\n', spam=False),
 CorpusItem(text="As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune 

### Split data

In [179]:
spam = [item for item in corpus if item.spam]
ham = [item for item in corpus if not item.spam]

In [180]:
spam_ratio = len(spam)/len(corpus)
ham_ratio = 1 - spam_ratio

spam_index = int(spam_ratio * 800)
ham_index = int(ham_ratio * 800)

print(spam_index, ham_index)

107 692


In [181]:
test_corpus = spam[:spam_index] + ham[:ham_index]
train_corpus = spam[spam_index + 1:] + ham[ham_index +1:]

### Extract features

In [35]:
import spacy

nlp = spacy.load('en_core_web_md')

In [254]:
import nltk
import re

def extract_features(text):
    features = []
    doc = nlp(text.strip())
    for token in doc:
        if token.pos_ != 'PUNCT' or token.text == '!':
            features.append(re.sub(r'\d', 'N', token.lemma_.lower()))
    return features

In [258]:
spam_dist = nltk.FreqDist([word for item in train_corpus if item.spam for word in extract_features(item.text)])


In [259]:
spam_dist

FreqDist({'-pron-': 710, 'to': 580, '!': 431, 'be': 331, 'a': 328, 'N': 326, 'call': 323, 'NNNNNNNNNNN': 317, '£': 228, 'NNN': 218, ...})

In [260]:
ham_dist = nltk.FreqDist([word for item in train_corpus if not item.spam for word in extract_features(item.text)])

In [261]:
ham_dist

FreqDist({'-pron-': 7019, 'be': 2509, 'to': 1398, 'the': 956, 'not': 900, 'a': 900, 'do': 871, 'and': 733, '!': 703, 'in': 700, ...})

In [211]:
spam_prob = len([item for item in train_corpus if item.spam])/len(train_corpus)
ham_prob = 1 - spam_prob

print (spam_prob, ham_prob)

0.13387806411062225 0.8661219358893777


In [252]:
import math
import sys

def predict(text):
    tokens = extract_features(text)
    maybe_spam = math.log(spam_prob) + sum([math.log((spam_dist.freq(token) + sys.float_info.epsilon)) for token in tokens])
    maybe_ham = math.log(ham_prob) + sum([math.log((ham_dist.freq(token) + sys.float_info.epsilon)) for token in tokens])

    if maybe_spam > maybe_ham:
        return True
    return False

In [262]:
tp = 0
tn = 0
fp = 0
fn = 0

for item in test_corpus:
    spam = predict(item.text)
    if spam and item.spam:
        tp += 1
    elif spam and not item.spam:
        fp += 1
    elif not spam and item.spam:
        fn += 1
    else:
        tn += 1

print(tp, fp, fn, tn)        

pr = tp/(tp + fp)
rc = tp/(tp + fn)
f1 = 2 * (pr * rc)/(pr + rc)

print("Accuracy: {}".format((tp + tn)/(tp+fp+fn+tn)))
print("Precicison: {}".format(pr))
print("Recall: {}".format(rc))
print("F1: {}".format(f1))

101 4 6 688
Accuracy: 0.9874843554443054
Precicison: 0.9619047619047619
Recall: 0.9439252336448598
F1: 0.9528301886792454
