In [1]:
import string
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Location of the spam.csv dataset downloaded from Kaggle.
# NOTE: this is an absolute, machine-specific path; edit this one constant to run elsewhere.
SPAM_CSV_PATH = "/Users/vanshika/Downloads/archive/spam.csv"

# Fixed seed so the shuffle below (and therefore the train/test split and every
# metric derived from it) is the same on each Restart & Run All.
SHUFFLE_SEED = 42

# Translation table that deletes every ASCII punctuation character.
punctuation_table = str.maketrans('', '', string.punctuation)

# Build the cleaned frame in a single chain. Every step returns a new object,
# so nothing is mutated in place on a column-subset of the raw frame (the
# pattern that triggers pandas' SettingWithCopy problems with inplace=True).
spam_df = (
    pd.read_csv(SPAM_CSV_PATH, encoding='ISO-8859-1')
    # keep only the label (v1) and message (v2) columns
    .loc[:, ['v1', 'v2']]
    .rename(columns={'v1': 'spam', 'v2': 'text'})
    .assign(
        # binary target for the two-class model: True for 'spam', False otherwise
        spam=lambda df: df['spam'].eq('spam'),
        # lowercase the message and strip punctuation
        text=lambda df: df['text'].str.lower().str.translate(punctuation_table),
    )
    # frac=1 returns every row in random order, i.e. a shuffle
    .sample(frac=1, random_state=SHUFFLE_SEED)
)

In [3]:
# Display the cleaned, shuffled frame (columns: spam, text); the rendering below is truncated
spam_df

Unnamed: 0,spam,text
1356,False,u ned to convince him tht its not possible wit...
454,True,loan for any purpose å£500 å£75000 homeowners...
1900,False,and miss vday the parachute and double coins u...
4223,False,double eviction this week spiral and michael ...
4847,False,sthis will increase the chance of winning
...,...,...
1059,True,eastenders tv quiz what flower does dot compar...
3309,False,oh ho is this the first time u use these type ...
1520,True,urgent your mobile no was awarded a å£2000 bon...
2161,False,is she replying has boye changed his phone number


In [4]:
# Peek at the first five messages labelled as spam
spam_messages = spam_df.loc[spam_df['spam'] == True, 'text']
for message in spam_messages.head(5):
    print(message, '\n')

loan for any purpose å£500  å£75000 homeowners  tenants welcome have you been previously refused we can still help call free 0800 1956669 or text back help 

win a year supply of cds 4 a store of ur choice worth å£500  enter our å£100 weekly draw txt music to 87066 tscs wwwldewcomsubs161win150ppmx3 

monthly password for wap mobsicom is 391784 use your wap phone not pc 

from next month get upto 50 more calls 4 ur standard network charge 2 activate call 9061100010 c wire3net 1st4terms pobox84 m26 3uz cost å£150 min mobcudb more 

free for 1st week no1 nokia tone 4 ur mob every week just txt nokia to 8007 get txting and tell ur mates wwwgetzedcouk pobox 36504 w45wq norm150ptone 16 



In [5]:
# Peek at the first five messages labelled as not spam
not_spam_messages = spam_df.loc[spam_df['spam'] == False, 'text']
for message in not_spam_messages.head(5):
    print(message, '\n')

u ned to convince him tht its not possible witot hurting his feeling its the main 

and miss vday the parachute and double coins u must not know me very well 

double eviction this week  spiral and michael and good riddance to them 

sthis will increase the chance of winning 

im stuck in da middle of da row on da right hand side of da lt  



In [6]:
# 70/30 positional split of the already-shuffled frame:
# the first 70% of rows become the training set, the remainder the test set
split_at = int(len(spam_df) * 0.7)
train, test = spam_df.iloc[:split_at], spam_df.iloc[split_at:]

In [7]:
# (rows, columns) of each split — test first, then train
test.shape, train.shape

((1672, 2), (3900, 2))

In [8]:
# Prior probability of spam: the mean of the boolean label is the fraction
# of training messages that are spam. Displayed below as a percentage.
spam_text_perc = train['spam'].mean()
100 * spam_text_perc

13.230769230769232

In [9]:
# Build one flat word list ("bag") per class: concatenate all messages of the
# class with single spaces, then split on single spaces again.
spam_texts = train.loc[train.spam == True, 'text']
not_spam_texts = train.loc[train.spam == False, 'text']

train_spam_bag = ' '.join(spam_texts).split(' ')
train_not_spam_bag = ' '.join(not_spam_texts).split(' ')

In [10]:
# First five tokens of each bag (spam first, then not spam)
train_spam_bag[:5], train_not_spam_bag[:5]

(['loan', 'for', 'any', 'purpose', 'å£500'],
 ['u', 'ned', 'to', 'convince', 'him'])

In [11]:
# Vocabulary shared by both classes: distinct words that occur in the spam bag
# and also in the not-spam bag
common_words = set(train_spam_bag) & set(train_not_spam_bag)

In [12]:
# Drop the empty-string token, if present. discard() is used instead of remove()
# so this cell stays safe to re-run: remove('') raises KeyError once the token
# is gone (or if it never occurred in both bags).
common_words.discard('')
common_words

{'1',
 '10',
 '100',
 '1000s',
 '11',
 '12',
 '16',
 '1st',
 '2',
 '20',
 '2000',
 '21',
 '21st',
 '25',
 '26th',
 '2day',
 '2nd',
 '2nite',
 '2u',
 '3',
 '4',
 '4u',
 '5',
 '50',
 '6',
 '7',
 '8',
 '8pm',
 'a',
 'about',
 'access',
 'accident',
 'account',
 'added',
 'address',
 'adult',
 'afraid',
 'after',
 'again',
 'age',
 'ages',
 'all',
 'allow',
 'alone',
 'already',
 'also',
 'am',
 'among',
 'amy',
 'an',
 'and',
 'another',
 'ans',
 'answer',
 'any',
 'anytime',
 'app',
 'apply',
 'approx',
 'are',
 'area',
 'around',
 'arrange',
 'arrive',
 'as',
 'asap',
 'asked',
 'at',
 'attempt',
 'august',
 'available',
 'ave',
 'await',
 'away',
 'b',
 'b4',
 'babe',
 'babes',
 'baby',
 'back',
 'bad',
 'balance',
 'bank',
 'be',
 'because',
 'become',
 'bedroom',
 'been',
 'before',
 'beg',
 'being',
 'believe',
 'ben',
 'benefits',
 'best',
 'better',
 'between',
 'bids',
 'big',
 'biggest',
 'bill',
 'birds',
 'birthday',
 'bitch',
 'black',
 'bloke',
 'blow',
 'book',
 'bored',
 '

In [13]:
# Relative frequency of each shared word within each class:
#   occurrences of the word in the class bag / total number of tokens in that bag.
# Only words present in both bags (common_words) are kept.
#
# Each bag is tallied once with Counter rather than calling list.count() per
# word, which rescanned the whole bag for every vocabulary entry. The resulting
# values are identical.
spam_word_counts = Counter(train_spam_bag)
not_spam_word_counts = Counter(train_not_spam_bag)
spam_token_total = len(train_spam_bag)
not_spam_token_total = len(train_not_spam_bag)

train_spam_bow = {w: spam_word_counts[w] / spam_token_total for w in common_words}
train_not_spam_bow = {w: not_spam_word_counts[w] / not_spam_token_total for w in common_words}
    

In [14]:
# Display the word -> relative-frequency mapping for the spam class
train_spam_bow

{'can': 0.001459498905375821,
 'holiday': 0.0016216654504175789,
 'youve': 0.0001621665450417579,
 'stay': 0.0001621665450417579,
 'child': 8.108327252087895e-05,
 'open': 8.108327252087895e-05,
 'does': 0.0001621665450417579,
 'yesterday': 0.0001621665450417579,
 'other': 0.0001621665450417579,
 'previous': 8.108327252087895e-05,
 'direct': 0.0007297494526879105,
 '21st': 0.0001621665450417579,
 'totally': 8.108327252087895e-05,
 'arrive': 0.0004054163626043947,
 'processed': 8.108327252087895e-05,
 'most': 0.0001621665450417579,
 'pound': 0.0004054163626043947,
 'end': 0.0008919159977296684,
 'stuff': 8.108327252087895e-05,
 'way': 8.108327252087895e-05,
 'picked': 0.0001621665450417579,
 'selected': 0.0016216654504175789,
 'yahoo': 0.0001621665450417579,
 '16': 0.002756831265709884,
 'blow': 8.108327252087895e-05,
 'feeling': 8.108327252087895e-05,
 '10': 0.0001621665450417579,
 'dates': 0.0001621665450417579,
 'about': 0.0003243330900835158,
 'away': 0.00024324981756263683,
 'top':

In [15]:
# Display the word -> relative-frequency mapping for the not-spam class
train_not_spam_bow

{'can': 0.005358489028340906,
 'holiday': 0.00018337034697744545,
 'youve': 0.00012224689798496363,
 'stay': 0.000325991727959903,
 'child': 2.0374482997493937e-05,
 'open': 0.00018337034697744545,
 'does': 0.0005297365579348424,
 'yesterday': 0.000325991727959903,
 'other': 0.0007334813879097818,
 'previous': 4.0748965994987875e-05,
 'direct': 2.0374482997493937e-05,
 '21st': 2.0374482997493937e-05,
 'totally': 2.0374482997493937e-05,
 'arrive': 2.0374482997493937e-05,
 'processed': 2.0374482997493937e-05,
 'most': 0.0004074896599498788,
 'pound': 4.0748965994987875e-05,
 'end': 0.0004278641429473727,
 'stuff': 0.0005908600069273242,
 'way': 0.0015077117418145515,
 'picked': 6.112344899248182e-05,
 'selected': 6.112344899248182e-05,
 'yahoo': 0.00012224689798496363,
 '16': 2.0374482997493937e-05,
 'blow': 2.0374482997493937e-05,
 'feeling': 0.0002648682789674212,
 '10': 0.00014262138098245758,
 'dates': 2.0374482997493937e-05,
 'about': 0.0021189462317393695,
 'away': 0.00026486827896

In [16]:
# Deletes ASCII punctuation; the same cleaning that is applied to the training text.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def predict_text(t, verbose=False, spam_bow=None, not_spam_bow=None, spam_prior=None):
    """Classify a message as spam or not spam by comparing summed log-probabilities.

    Parameters
    ----------
    t : iterable of str, or str
        The message as a list of word tokens. A plain string is also accepted
        and is split on whitespace (previously a string was silently iterated
        character by character).
    verbose : bool, default False
        If True, print a per-word table of class frequencies and their ratio,
        followed by the two class scores.
    spam_bow, not_spam_bow : dict, optional
        Word -> relative-frequency mappings for each class. Default to the
        notebook-level ``train_spam_bow`` / ``train_not_spam_bow``.
    spam_prior : float, optional
        Prior probability of spam. Defaults to the notebook-level
        ``spam_text_perc``.

    Returns
    -------
    bool-like
        True when the spam score is greater than or equal to the not-spam score.

    Notes
    -----
    Each token is lowercased and stripped of punctuation before lookup, so raw
    input such as ``'loan,'`` or ``'FREE'`` matches the cleaned vocabulary
    instead of being dropped. Tokens absent from either mapping are ignored.
    """
    # Fall back to the model fitted earlier in the notebook.
    if spam_bow is None:
        spam_bow = train_spam_bow
    if not_spam_bow is None:
        not_spam_bow = train_not_spam_bow
    if spam_prior is None:
        spam_prior = spam_text_perc

    if isinstance(t, str):
        t = t.split()

    # Normalise tokens the same way the training text was normalised.
    tokens = [w.lower().translate(_PUNCTUATION_TABLE) for w in t]

    # Keep only words the model has a frequency for in both classes.
    known_words = [w for w in tokens if w in spam_bow and w in not_spam_bow]

    # Per-word class frequencies for the words that were kept.
    spam_probs = [spam_bow[w] for w in known_words]
    not_spam_probs = [not_spam_bow[w] for w in known_words]

    # Print the per-word breakdown if requested.
    if verbose:
        data = pd.DataFrame()
        data['words'] = known_words
        data['spam words prob'] = spam_probs
        data['other words prob'] = not_spam_probs
        data['ratio'] = [s / n if n > 0 else np.inf for s, n in zip(spam_probs, not_spam_probs)]
        print(data)

    # Class score = sum of per-word log-frequencies + log prior of the class.
    # Logs are summed instead of multiplying many small probabilities.
    spam_score = sum(np.log(p) for p in spam_probs) + np.log(spam_prior)
    nspam_score = sum(np.log(p) for p in not_spam_probs) + np.log(1 - spam_prior)

    if verbose:
        print('Spam score: %s'%spam_score)
        print('Not Spam score: %s'%nspam_score)

    # Ties are resolved in favour of spam.
    return (spam_score >= nspam_score)

In [18]:
# Try the classifier on a hand-written sentence; verbose=True prints the per-word table and both scores.
# The sentence is split on whitespace only, so the token 'loan,' keeps its trailing comma.
predict_text('hi buy this urgent to win loan, and free money'.split(), verbose=True)

    words  spam words prob  other words prob       ratio
0      hi         0.000892          0.001732    0.515013
1     buy         0.000162          0.000754    0.215116
2    this         0.004784          0.003667    1.304440
3  urgent         0.003487          0.000020  171.124868
4      to         0.039650          0.022412    1.769134
5     win         0.003649          0.000081   44.771041
6     and         0.007135          0.012408    0.575056
7    free         0.011757          0.000856   13.739261
8   money         0.000324          0.000713    0.454817
Spam score: -55.03376003650861
Not Spam score: -62.01224178561652


True

In [19]:
# Display the held-out test split (the rendering below is truncated)
test

Unnamed: 0,spam,text
1125,True,for taking part in our mobile survey yesterday...
3532,False,sorry ill call later
431,False,does she usually take fifteen fucking minutes ...
295,True,tmobile customer you may now claim your free c...
4274,False,haiyoh maybe your hamster was jealous of million
...,...,...
1059,True,eastenders tv quiz what flower does dot compar...
3309,False,oh ho is this the first time u use these type ...
1520,True,urgent your mobile no was awarded a å£2000 bon...
2161,False,is she replying has boye changed his phone number


In [20]:
# Classify every test message: split each text on whitespace, then score the tokens
test_tokens = test['text'].str.split()
predictions = test_tokens.apply(predict_text)

In [21]:
# Fraction of the actual spam messages in the test set that were predicted as spam
spam_caught = ((predictions == True) & (test.spam == True)).sum()
spam_total = (test.spam == True).sum()
correct = spam_caught / spam_total

In [22]:
# Report the fraction stored in `correct` as a percentage
print('correct detection: ',correct*100)

correct detection:  90.04329004329004


In [23]:
# Fraction of the legitimate (not spam) test messages that were wrongly predicted as spam
false_alarms = ((predictions == True) & (test.spam == False)).sum()
legit_total = (test.spam == False).sum()
incorrect = false_alarms / legit_total
print('incorrect detection: ', incorrect*100)

incorrect detection:  1.6655100624566272
