In [None]:
Willian de Mattos

In [4]:
import re

In [5]:
def tokenize(message):
    """Return the set of distinct tokens found in *message*.

    The message is lowercased first; a token is then any maximal run of
    the characters a-z, 0-9 or apostrophe. Because the result is a set,
    repeated words appear only once.
    """
    lowered = message.lower()
    return {token for token in re.findall("[a-z0-9']+", lowered)}

In [6]:
# Show what the tokenize pattern extracts from a sample subject (as a list, before de-duplication)
re.findall("[a-z0-9']+", "Life Insurance - Why Pay More?".lower())

['life', 'insurance', 'why', 'pay', 'more']

In [7]:
def count_words(training_set):
    """Count, for every word, how many spam and non-spam messages contain it.

    Parameters:
        training_set: iterable of (message, is_spam) pairs.

    Returns:
        A defaultdict mapping word -> [spam_count, non_spam_count]. Each
        message is reduced to its set of tokens by tokenize(), so a word is
        counted at most once per message.
    """
    # Imported locally so this function does not rely on a later notebook
    # cell having already brought defaultdict into the namespace.
    from collections import defaultdict

    counts = defaultdict(lambda: [0, 0])
    for message, is_spam in training_set:
        # index 0 holds the spam count, index 1 the non-spam count
        bucket = 0 if is_spam else 1
        for word in tokenize(message):
            counts[word][bucket] += 1
    return counts

In [8]:
def word_probabilities(counts, total_spams, total_non_spams, k=0.5):
    """Convert per-word counts into smoothed conditional probabilities.

    Parameters:
        counts: mapping word -> (spam_count, non_spam_count).
        total_spams: number of spam messages in the training data.
        total_non_spams: number of non-spam messages in the training data.
        k: pseudocount added to each count (and 2 * k to each total).

    Returns:
        A list of triplets (word, p(word | spam), p(word | ~spam)), in the
        iteration order of *counts*.
    """
    triplets = []
    for word, pair in counts.items():
        spam_count, non_spam_count = pair
        p_given_spam = (spam_count + k) / (total_spams + 2 * k)
        p_given_non_spam = (non_spam_count + k) / (total_non_spams + 2 * k)
        triplets.append((word, p_given_spam, p_given_non_spam))
    return triplets


In [9]:
import math
def spam_probability(word_probs, message):
    """Estimate the probability that *message* is spam.

    Parameters:
        word_probs: iterable of triplets
            (word, p(word | spam), p(word | ~spam)) describing the vocabulary.
        message: the text to score; it is tokenized with tokenize().

    Returns:
        likelihood_spam / (likelihood_spam + likelihood_not_spam), a float in
        [0, 1]. No class prior is applied, so both classes are weighted equally.

    Raises:
        ValueError: from math.log if any probability in word_probs is
            exactly 0 or 1.
    """
    message_words = tokenize(message)
    log_prob_if_spam = log_prob_if_not_spam = 0.0

    # iterate through each word in our vocabulary
    for word, prob_if_spam, prob_if_not_spam in word_probs:

        # if *word* appears in the message, add the log probability of seeing it
        if word in message_words:
            log_prob_if_spam += math.log(prob_if_spam)
            log_prob_if_not_spam += math.log(prob_if_not_spam)

        # if *word* doesn't appear in the message add the log probability of _not_ seeing it
        # which is log(1 - probability of seeing it)
        else:
            log_prob_if_spam += math.log(1.0 - prob_if_spam)
            log_prob_if_not_spam += math.log(1.0 - prob_if_not_spam)

    prob_if_spam = math.exp(log_prob_if_spam)
    prob_if_not_spam = math.exp(log_prob_if_not_spam)
    total = prob_if_spam + prob_if_not_spam
    if total > 0.0:
        return prob_if_spam / total

    # Both likelihoods underflowed to 0.0, so the ratio above would divide by
    # zero. Evaluate the same quantity from the log-odds instead, choosing the
    # branch whose exponent is <= 0 so math.exp cannot overflow.
    log_odds = log_prob_if_spam - log_prob_if_not_spam
    if log_odds >= 0.0:
        return 1.0 / (1.0 + math.exp(-log_odds))
    odds = math.exp(log_odds)
    return odds / (1.0 + odds)

In [10]:
class NaiveBayesClassifier:
    """Naive Bayes spam classifier built on the word-count helpers.

    Attributes are set in __init__: `k` is the smoothing pseudocount and
    `word_probs` is the list of (word, p(word | spam), p(word | ~spam))
    triplets, empty until train() is called.
    """

    def __init__(self, k=0.5):
        self.word_probs = []
        self.k = k

    def train(self, training_set):
        """Fit the classifier on a sized collection of (message, is_spam) pairs."""
        # how many of the training messages are spam, and how many are not
        num_spams = sum(1 for _message, is_spam in training_set if is_spam)
        num_non_spams = len(training_set) - num_spams

        # per-word counts -> smoothed per-word probabilities
        word_counts = count_words(training_set)
        self.word_probs = word_probabilities(
            word_counts, num_spams, num_non_spams, self.k
        )

    def classify(self, message):
        """Return the estimated spam probability of *message*."""
        return spam_probability(self.word_probs, message)

In [12]:
import glob, re
path = r"./spamassassin/*/*"
data = []
# Walk every file matched by the wildcard and collect its subject lines.
for fn in glob.glob(path):
    # a file counts as spam unless "ham" occurs somewhere in its path
    is_spam = "ham" not in fn
    with open(fn, 'r', encoding="utf8", errors='ignore') as file:
        for line in file:
            if not line.startswith("Subject:"):
                continue
            # drop the leading "Subject: " prefix and surrounding whitespace;
            # every matching line in the file yields its own entry
            subject = re.sub(r"^Subject: ", "", line).strip()
            data.append((subject, is_spam))

In [13]:
# number of (subject, is_spam) pairs collected from the corpus
len(data)

3423

In [14]:
# peek at the first five (subject, is_spam) pairs
data[:5]

[('Re: New Sequences Window', False),
 ('[zzzzteana] RE: Alexander', False),
 ('[zzzzteana] Moscow bomber', False),
 ("[IRR] Klez: The Virus That  Won't Die", False),
 ('Re: Insert signature', False)]

In [15]:
import random
from collections import defaultdict

def split_data(data, prob):
    """Randomly split *data* into two lists in proportions [prob, 1 - prob].

    Each row is independently sent to the first list when a fresh
    random.random() draw is below *prob*, otherwise to the second.
    Returns the tuple (first_list, second_list).
    """
    selected, remainder = [], []
    for row in data:
        if random.random() < prob:
            selected.append(row)
        else:
            remainder.append(row)
    return selected, remainder

In [16]:
# seed the RNG so the split is reproducible, then send each row to the
# training set with probability 0.75
random.seed(0)
train_data, test_data = split_data(data, 0.75)

In [17]:
# report how many rows landed in each partition
print('Train data size=', len(train_data),
'Test data size=', len(test_data))

Train data size= 2547 Test data size= 876


In [18]:
# create and train a classifier (default smoothing k=0.5) on the training split
classifier = NaiveBayesClassifier()
classifier.train(train_data)

In [19]:
# estimated spam probability for three hand-written sample subjects
print(classifier.classify("Life Insurance - Why Pay More?"))
print(classifier.classify("This week: Deck, Tex-Edit Plus, Boom"))
print(classifier.classify("Data Science Class"))

0.8159892501280763
0.07834054533624166
0.005007833873548472


In [20]:
from collections import Counter
import math

# one triplet per test row: (subject, actual is_spam, predicted spam probability)
classified = [(text, label, classifier.classify(text))
              for text, label in test_data]
# a predicted probability above 0.5 is treated as a spam prediction;
# tally each (actual is_spam, predicted is_spam) combination
counts = Counter((actual, predicted > 0.5)
                 for _subject, actual, predicted in classified)


In [21]:
# tallies keyed by (actual is_spam, predicted is_spam)
print(counts)

Counter({(False, False): 704, (True, True): 101, (True, False): 38, (False, True): 33})


In [22]:
# first (subject, actual is_spam, predicted spam probability) triplet
classified[:1]

[('Re: New Sequences Window', False, 1.7755526669480555e-05)]

In [23]:
# order the triplets in place by predicted spam probability, ascending
classified.sort(key=lambda triplet: triplet[2])
# actual non-spams in that order: the last five have the highest predicted spam probability
spammiest_hams_f = (row for row in classified if not row[1])
spammiest_hams = list(spammiest_hams_f)[-5:]
# actual spams in that order: the first five have the lowest predicted spam probability
hammiest_spams_f = (row for row in classified if row[1])
hammiest_spams = list(hammiest_spams_f)[:5]

In [24]:
# the five non-spam subjects with the highest predicted spam probability
print(spammiest_hams)

[('Attn programmers: support offered [FLOSS-Sarai Initiative]', False, 0.9759651625533735), ('2000+ year old Greek computer reinterpreted', False, 0.9837751869158617), ('What to look for in your next smart phone (Tech Update)', False, 0.9900202987412372), ('[ILUG-Social] Re: Important - reenactor insurance needed', False, 0.9995417850188201), ('[ILUG-Social] Re: Important - reenactor insurance needed', False, 0.9995417850188201)]


In [25]:
# the five spam subjects with the lowest predicted spam probability
print(hammiest_spams)

[('Re: girls', True, 0.0009668117182064735), ('Introducing Chase Platinum for Students with a 0% Introductory APR', True, 0.0012755203538234656), ('.Message report from your contact page....//ytu855 rkq', True, 0.0015335954472965284), ('Testing a system, please delete', True, 0.0027323783458364755), ('Never pay for the goodz again (8SimUgQ)', True, 0.005999883704513356)]


In [26]:
def drop_final_s(word):
    """Return *word* with the "s" matched by the regex `s$` removed, if any."""
    without_s = re.sub("s$", "", word)
    return without_s

In [27]:
# quick check: the trailing "s" is removed
drop_final_s('hands')

'hand'