In [32]:
import re
import pandas as pd
import string
from collections import defaultdict
import math
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score


In [33]:
# Import dataset, spam ham data from Kaggle
data = pd.read_csv("emails.csv")

data.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [34]:
# Preprocessing
def process(text):
    processed = re.sub(r'\b\w{1,2}\b', '', text)  # removed all one and two lettered words
    processed = re.sub(r'\s+', ' ', processed)  # replaced all whitespaces with one space
    processed = processed.translate(str.maketrans('', '', string.punctuation))  # removing all the punctuations
    return processed.lower().strip()  # converted to lower case and removed starting and ending spaces

def tokenize(list, resultList):
    for sentences in list:
        resultList.append(sentences.split())
    return resultList

In [35]:
numberOfSpams = data[data['spam'] == 1]
totalSpams = numberOfSpams.shape[0]
totalRows = data.shape[0]

totalHams = totalRows - totalSpams

probSpam = totalSpams / totalRows
probHam = totalHams / totalRows

probHam, probSpam


(0.7611731843575419, 0.2388268156424581)

In [36]:
data['processedTexts'] = data['text'].apply(process)

# Splitting the data into training and test sets
trainData, testData = train_test_split(data, test_size=0.2, random_state=42)

# Extracting spam from ham, and tokenizing the words
spam = trainData[trainData['spam'] == 1]['processedTexts'].tolist()
ham = trainData[trainData['spam'] == 0]['processedTexts'].tolist()


spamSentences = []
hamSentences = []

# Separating each and every word, that is tokenizing it
tokenize(spam, spamSentences)
tokenize(ham, hamSentences)

[['subject',
  'request',
  'for',
  'payroll',
  'reclassification',
  'approved',
  'joann',
  'yes',
  'sorry',
  '413',
  'was',
  'the',
  'number',
  'the',
  'form',
  'received',
  'vince',
  'enron',
  'property',
  'services',
  'corp',
  'from',
  'joann',
  'holloway',
  '2000',
  'vince',
  'kaminski',
  'hou',
  'ect',
  'ect',
  'subject',
  'request',
  'for',
  'payroll',
  'reclassification',
  'approved',
  'vince',
  'your',
  'reclass',
  'information',
  'the',
  'company',
  'number',
  'indicated',
  'should',
  '0011',
  'not',
  '413',
  'ann',
  'holloway',
  '35957',
  'vince',
  'kaminski',
  '2000',
  'stella',
  'ely',
  'hou',
  'ect',
  'ect',
  'jeff',
  'kinneman',
  'hou',
  'ect',
  'ect',
  'carmen',
  'chavira',
  'hou',
  'ect',
  'ect',
  'michelle',
  'hargrave',
  'hou',
  'ect',
  'ect',
  'stephen',
  'wolfe',
  'hou',
  'ect',
  'ect',
  'michael',
  'galvan',
  'hou',
  'ect',
  'ect',
  'gary',
  'mccumber',
  'hou',
  'ect',
  'ect',
  '

In [37]:
# Define function to count what type of words are in spam and ham
def wordFrequencies(list):
    frequency = defaultdict(int)  # makes a dictionary with default value 0
    for sentences in list:
        for words in sentences:
            frequency[words] += 1  # if we find a word, increment it by 1
    return frequency

spamWordsFrequency = wordFrequencies(spamSentences)
hamWordsFrequency = wordFrequencies(hamSentences)

for key, value in spamWordsFrequency.items():    
    if value == max(spamWordsFrequency.values()):
        print(key, value)   # print the most occurring word
        break


the 7160


In [38]:
# Getting the probabilities
def calculateProbabilities(wordsAndFreq, totalWords, storeDictionary):
    for word, freq in wordsAndFreq.items():
        prob = freq / totalWords
        storeDictionary[word] = prob
    return storeDictionary
 

spamWordsProbs = {}
hamWordsProbs = {}

totalSpamWords = sum(spamWordsFrequency.values())
totalHamWords = sum(hamWordsFrequency.values())

calculateProbabilities(spamWordsFrequency, totalSpamWords, spamWordsProbs)
calculateProbabilities(hamWordsFrequency, totalHamWords, hamWordsProbs)


{'subject': 0.01022592578217136,
 'request': 0.0010061713835304374,
 'for': 0.016203240648129627,
 'payroll': 2.537820997035228e-05,
 'reclassification': 5.971343522435831e-06,
 'approved': 0.00011046985516506286,
 'joann': 2.0899702328525406e-05,
 'yes': 0.00023885374089743322,
 'sorry': 0.00033588807313701544,
 '413': 1.1942687044871661e-05,
 'was': 0.0022377609850328275,
 'the': 0.049660678404337584,
 'number': 0.0009061513795296373,
 'form': 0.0004299367336153798,
 'received': 0.0005896701728405383,
 'vince': 0.010021407266527933,
 'enron': 0.016053957060068732,
 'property': 5.075641994070456e-05,
 'services': 0.0006926758486025563,
 'corp': 0.0019765147059262597,
 'from': 0.006353509507871723,
 'holloway': 1.940686644791645e-05,
 '2000': 0.005780260529717884,
 'kaminski': 0.005562306491148976,
 'hou': 0.0064968217524101835,
 'ect': 0.013310124711509465,
 'your': 0.0066849190733669125,
 'reclass': 2.9856717612179153e-06,
 'information': 0.001900380076015203,
 'company': 0.001146497

In [43]:
# Preparing methods for the testing strings

def iterate(string):
    spamProb = math.log(probSpam)
    hamProb = math.log(probHam)

    tokens = string.split()

    for item in tokens: # this part is equivalent to P(message|spam/ham) formula
        spamProb += math.log(spamWordsProbs.get(item, 1.0 / (totalSpamWords + 1)))
        hamProb += math.log(hamWordsProbs.get(item, 1.0 / (totalHamWords + 1)))
        # print(item, spamProb, hamProb)

    return spamProb, hamProb

def checkHamOrSpam(string): # if probability of spam is higher, then the message is a spam
    processed = process(string)
    spamProb, hamProb = iterate(processed)
    return 'spam' if spamProb > hamProb else 'ham'

In [44]:
# Test against the the testing part of our data set
testData['prediction'] = testData['processedTexts'].apply(checkHamOrSpam)
accuracy = accuracy_score(testData['spam'], testData['prediction'].map({'spam': 1, 'ham': 0}))
precision = precision_score(testData['spam'], testData['prediction'].map({'spam': 1, 'ham': 0}))

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')

Accuracy: 0.9860383944153578
Precision: 0.9506578947368421


In [46]:
text = "Hey, I am free tomorrow. Drinks on me."
print(text, "\n This message is: ", checkHamOrSpam(text))
text = "Congratulations! You've won a $1,000 gift card. Click here to claim your prize now!"
print(text, "\n This message is: ", checkHamOrSpam(text))

Hey, I am free tomorrow. Drinks on me. 
 This message is:  ham
Congratulations! You've won a $1,000 gift card. Click here to claim your prize now! 
 This message is:  spam
