In [20]:
import re #used to search edit a text / string
import math
import glob # used to read and scan files
import random

In [18]:
path = r"spam_assassin\*\*" # r for treating this backslash as backslash

data = []
i = 0
for fn in glob.glob(path):
    i += 1
    print(i, fn)
    is_spam = "ham" not in fn #those 
    
    with open(fn, 'r') as file:
        for line in file:
            if line.startswith("Subject:"):
                subject = re.sub(r"^Subject: ","", line).strip()
                data.append((subject, is_spam))

In [19]:
data[3:5]

[]

In [6]:
def split_data(data, prob):
    results = [], []
    for item in data:
         results[0 if random.random() < prob else 1].append(item)
#         if random.random() < prob:
#             results[0].append(item)
#         else:
#             results[1].append(item)
    return results

In [7]:
train_data, test_data = split_data(data, 0.75)

In [8]:
len(train_data), len(test_data)

(0, 0)

In [7]:
def tokenize(message):
    message = message.lower()
    all_words = re.findall("[a-z0-9]+", message)
    return set(all_words)

In [8]:
tokenize("Hello, How are you? My name is Utkarsh.")

{'are', 'hello', 'how', 'is', 'my', 'name', 'utkarsh', 'you'}

In [9]:
from collections import defaultdict

def count_words(training_set):
    counts = defaultdict(lambda: [0,0])
    for message, is_spam in training_set:
        for word in tokenize(message):
            counts[word][0 if is_spam else 1] += 1
    return counts

In [10]:
#count_words(train_data)

In [11]:
def word_probabilities(counts, total_spams, total_non_spams, k = 0.5):
    return [(w,
             (spam + k) / (total_spams + 2 * k),
             (non_spam + k) / (total_non_spams + 2 * k))
            for w, (spam, non_spam) in counts.items()]
#     probs = []
#     for w, (spam, non_spam) in counts.items():
#         p_spam = (spam + k) / (total_spams + 2*k)
#         p_non_spam = (non_spam + k) / (total_non_spams + 2*k)
#         probs.append(w, p_spam, p_non_spam)
        
#     return probs

In [12]:
def spam_probability(word_probs, message):
    message_words = tokenize(message)
    log_prob_if_spam = log_prob_if_not_spam = 0.0
    
    for word, prob_if_spam, prob_if_not_spam in word_probs:
        if word in message_words:
            log_prob_if_spam += math.log(prob_if_spam)
            log_prob_if_not_spam += math.log(prob_if_not_spam)
        else:
            log_prob_if_spam += math.log(1.0 - prob_if_spam)
            log_prob_if_not_spam += math.log(1.0 - prob_if_not_spam)
            
    prob_if_spam = math.exp(log_prob_if_spam)
    prob_if_not_spam = math.exp(log_prob_if_not_spam)
    
    return prob_if_spam / (prob_if_spam + prob_if_not_spam)

In [13]:
class NaiveBayesClassifier:
    
    def __init__(self, k = 0.5):
        self.k = 0.5
        self.word_probs = []
        
    def train(self, training_set):
        num_spams = len([is_spam
                        for message, is_spam in training_set
                        if is_spam])
        num_non_spams = len([is_spam
                        for message, is_spam in training_set
                        if not is_spam])
        total_messages = num_spams + num_non_spams
        word_counts = count_words(training_set)
        self.word_probs = word_probabilities(word_counts, num_spams,
                                             num_non_spams, self.k)
    def classify(self, message):
        return spam_probability(self.word_probs, message)

In [14]:
classifier = NaiveBayesClassifier()
classifier.train(train_data)

In [15]:
classifier.classify("I am Utkarsh.")

0.00828670136430909

In [16]:
classified = [(subject, is_spam, classifier.classify(subject))
              for subject, is_spam in test_data]

In [17]:
from collections import Counter
counts = Counter((is_spam, spam_probability > 0.5)
                 for subject, is_spam, spam_probability in classified)

In [18]:
counts

Counter({(False, False): 678,
         (False, True): 24,
         (True, False): 51,
         (True, True): 79})