In [16]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import string

##**Read Data**

In [17]:

from google.colab import files

uploaded = files.upload()  # Opens a file chooser

Saving spam.csv to spam (1).csv


In [18]:
#read dataset
spam_df = pd.read_csv('spam.csv', encoding="ISO-8859-1")

#keep only 2 cols: v1, v2
spam_df = spam_df[['v1', 'v2']]
# rename cols
# inplace=True: the changes are applied directly
spam_df.rename(columns={'v1': 'spam', 'v2': 'text'}, inplace=True)

#convert spam column to bool (if spam, then = True)
spam_df.spam = spam_df.spam.apply(lambda s: True if s=='spam' else False)

#lowercase everything and remove punctuation
spam_df.text = spam_df.text.apply(lambda t: t.lower().translate(str.maketrans('', '', string.punctuation)))

#shuffle
# frac=1: take 100% rows to shuffle
spam_df = spam_df.sample(frac=1)
spam_df

Unnamed: 0,spam,text
4830,False,po de no need job aha
4831,False,rats hey did u ever vote for the next themes
1070,False,alright ill make sure the car is back tonight
4894,True,want the latest video handset 750 anytime any ...
1098,False,no gifts you trying to get me to throw myself ...
...,...,...
4752,True,cashbincouk get lots of cash this weekend wwwc...
1338,False,aight sorry i take ten years to shower whats t...
4916,True,this is the 2nd time we have tried 2 contact u...
1137,False,lol no just was busy


In [19]:
# in spam table, take rows in which spam = True => take the first 5 rows => take their text => print
for t in spam_df[spam_df.spam == True].iloc[:5].text:
    print(t)
    print('-------')

want the latest video handset 750 anytime any network mins half price line rental reply or call 08000930705 for delivery tomorrow
-------
449071512431 urgent this is the 2nd attempt to contact uu have won å£1250 call 09071512433 b4 050703 tcsbcm4235wc1n3xx callcost 150ppm mobilesvary maxå£7 50
-------
smsservices for yourinclusive text credits pls goto wwwcomuknet login 3qxj9 unsubscribe with stop no extra charge help 08702840625comuk 220cm2 9ae
-------
datingi have had two of these only started after i sent a text to talk sport radio last week any connection do you think or coincidence
-------
u have a secret admirer who is looking 2 make contact with ufind out who they rreveal who thinks ur so specialcall on 09065171142stopsms08718727870150ppm
-------


In [20]:
for t in spam_df[spam_df.spam == False].iloc[:5].text:
    print(t)
    print('-------')

po de  no need job aha
-------
rats hey did u ever vote for the next themes
-------
alright ill make sure the car is back tonight
-------
no gifts you trying to get me to throw myself off a cliff or something
-------
oh my god im almost home
-------


##**Train/Test Split**

In [21]:
#get training set
# len(spam_df)*0.7: take the length of the whole table => *0.7 => convert to int (we dont take decimal num of rows) => take the first 70% of
#                                                                                                                      the rows
train_spam_df = spam_df.iloc[:int(len(spam_df)*0.7)]

#get testing set
# iloc[n:]: take the rows from n to the end
test_spam_df = spam_df.iloc[int(len(spam_df)*0.7):]

In [22]:
# the % of spam text
FRAC_SPAM_TEXTS = train_spam_df.spam.mean()
print(FRAC_SPAM_TEXTS)

0.13230769230769232


##**Create a Spam BOW & Non-Spam BOW**

In [23]:
#get all words from spam and non-spam datasets
# join all the text from spams => split them into each word
train_spam_words = ' '.join(train_spam_df[train_spam_df.spam == True].text).split(' ')
train_non_spam_words = ' '.join(train_spam_df[train_spam_df.spam == False].text).split(' ')

# create common_words by intersecting these 2 subsets
common_words = set(train_spam_words).intersection(set(train_non_spam_words))

In [27]:
train_spam_bow = dict()
for w in common_words:
    train_spam_bow[w] = train_spam_words.count(w) / len(train_spam_words)

In [24]:
# create an empty dictionary to store the spam word & its frequency
train_non_spam_bow = dict()
for w in common_words:
  # count how many times the spam word appears => /the total of spam words => probability
  # key is the spam word, value is the prob
    train_non_spam_bow[w] = train_non_spam_words.count(w) / len(train_non_spam_words)

##**Predict**

In [28]:
# verbose = False: no output, just run quietly
def predict_text(t, verbose=False):
    #if some word doesnt appear in either spam or non-spam BOW, disregard it
    # keep the spam/non-spam words that are only in the training set
    valid_words = [w for w in t if w in train_spam_bow]

    #get the probabilities of each valid word showing up in spam and non-spam BOW
    spam_probs = [train_spam_bow[w] for w in valid_words]
    non_spam_probs = [train_non_spam_bow[w] for w in valid_words]

    #print probs if requested
    if verbose:
        data_df = pd.DataFrame()
        data_df['word'] = valid_words
        data_df['spam_prob'] = spam_probs
        data_df['non_spam_prob'] = non_spam_probs
        # zip: loop thru 2 lists at the same time
        # for each loop/row, cal spam/non-spam (if non-spam=0, return np.inf - infinity)
        data_df['ratio'] = [s/n if n > 0 else np.inf for s,n in zip(spam_probs, non_spam_probs)]
        print(data_df)

    #calculate spam score as sum of logs for all probabilities
    spam_score = sum([np.log(p) for p in spam_probs]) + np.log(FRAC_SPAM_TEXTS)

    #calculate non-spam score as sum of logs for all probabilities
    non_spam_score = sum([np.log(p) for p in non_spam_probs]) + np.log(1-FRAC_SPAM_TEXTS)

    #if verbose, report the two scores
    if verbose:
        print('Spam Score: %s'%spam_score)
        print('Non-Spam Score: %s'%non_spam_score)

    #if spam score is higher, mark this as spam
    return (spam_score >= non_spam_score)

In [29]:
predict_text('urgent call this number'.split(), verbose=True)

     word  spam_prob  non_spam_prob       ratio
0  urgent   0.003340       0.000020  163.265113
1    call   0.020613       0.003458    5.961333
2    this   0.004807       0.003621    1.327359
3  number   0.001548       0.000798    1.939986
Spam Score: -23.414641019928354
Non-Spam Score: -29.36047504751073


np.True_