https://github.com/Midvel/medium_jupyter_notes/blob/master/naive_bayes_filter/bayes-classificator.ipynb

https://towardsdatascience.com/how-to-build-and-apply-naive-bayes-classification-for-spam-filtering-2b8d3308501

In [None]:
import pandas as pd

#Import Library

Dataset from https://archive.ics.uci.edu/ml/datasets/sms+spam+collection

In [None]:
sms_data = pd.read_csv('SMSSpamCollection', header=None, sep='\t', names=['Label', 'SMS'])

In [None]:
sms_data.head()

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
sms_data.shape

(5572, 2)

In [None]:
sms_data.loc[30,:]

Unnamed: 0,30
Label,ham
SMS,"Wait that's still not all that clear, were you..."


In [None]:
sms_data.loc[30,'SMS']

"Wait that's still not all that clear, were you not sure about me being sarcastic or that that's why x doesn't want to live with us"

In [None]:
sms_data.groupby('Label').count()

Unnamed: 0_level_0,SMS
Label,Unnamed: 1_level_1
ham,4825
spam,747


Prepare data

In [None]:
sms_data_clean = sms_data.copy()

In [None]:
sms_data_clean['SMS'] = sms_data_clean['SMS'].str.replace('\W+', ' ').str.replace('\s+', ' ').str.strip()

In [None]:
sms_data_clean['SMS'] = sms_data_clean['SMS'].str.lower()

In [None]:
sms_data_clean['SMS'] = sms_data_clean['SMS'].str.split()

In [None]:
sms_data_clean['SMS'].head()

Unnamed: 0,SMS
0,"[go, until, jurong, point,, crazy.., available..."
1,"[ok, lar..., joking, wif, u, oni...]"
2,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,"[u, dun, say, so, early, hor..., u, c, already..."
4,"[nah, i, don't, think, he, goes, to, usf,, he,..."


In [None]:
sms_data.loc[2,'SMS']

"Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"

In [None]:
sms_data_clean.loc[2,'SMS']

['free',
 'entry',
 'in',
 '2',
 'a',
 'wkly',
 'comp',
 'to',
 'win',
 'fa',
 'cup',
 'final',
 'tkts',
 '21st',
 'may',
 '2005.',
 'text',
 'fa',
 'to',
 '87121',
 'to',
 'receive',
 'entry',
 'question(std',
 'txt',
 "rate)t&c's",
 'apply',
 "08452810075over18's"]

In [None]:
sms_data_clean['Label'].value_counts() / sms_data.shape[0] * 100

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
ham,86.593683
spam,13.406317


Split to train and test data

#Training Data 60%

In [None]:
train_data = sms_data_clean.sample(frac=0.6,random_state=1).reset_index(drop=True)
test_data = sms_data_clean.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)

In [None]:
train_data['Label'].value_counts() / train_data.shape[0] * 100

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
ham,86.180078
spam,13.819922


In [None]:
train_data.shape

(3343, 2)

In [None]:
test_data['Label'].value_counts() / test_data.shape[0] * 100

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
ham,86.496187
spam,13.503813


In [None]:
test_data.shape

(2229, 2)

In [None]:
test_data.head()

Unnamed: 0,Label,SMS
0,ham,"[you, will, go, to, walmart., i.ll, stay.]"
1,ham,"[i, haven't, forgotten, you,, i, might, have, ..."
2,ham,"[oh, great., i.ll, disturb, him, more, so, tha..."
3,ham,"[reverse, is, cheating., that, is, not, mathem..."
4,ham,"[u're, welcome..., caught, u, using, broken, e..."


Prepare vocabulary - the list fo all the words from the dataset

In [None]:
vocabulary = list(set(train_data['SMS'].sum()))

In [None]:
vocabulary[11:20]

['87239',
 '62220cncl',
 'six',
 'great.',
 'sptv:',
 'worried',
 '(send',
 'crab..',
 'movie']

In [None]:
len(vocabulary)

9996

Calculate frequencies fo the words for each message

In [None]:
word_counts_per_sms = pd.DataFrame([
    [row[1].count(word) for word in vocabulary]
    for _, row in train_data.iterrows()], columns=vocabulary)

  [row[1].count(word) for word in vocabulary]


In [None]:
train_data = pd.concat([train_data.reset_index(), word_counts_per_sms], axis=1).iloc[:,1:]

In [None]:
train_data.head()

Unnamed: 0,Label,SMS,us.,ttyl,information.,hw'd,held,"motorola,",yes-440,busy,...,arabian,mate.,april,"end,",mum's,paining,pass,check.,spoken,09064018838.
0,ham,"[yep,, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes,, princess., are, you, going, to, make, m...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent.],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth.., there's, a...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
train_data['i']

Unnamed: 0,i
0,0
1,0
2,0
3,0
4,1
...,...
3338,0
3339,2
3340,0
3341,1


Calculate values for the Bayes formula

In [None]:
alpha = 1

In [None]:
Nvoc = len(train_data.columns) - 3

In [None]:
Pspam = train_data['Label'].value_counts()['spam'] / train_data.shape[0]

In [None]:
Pham = train_data['Label'].value_counts()['ham'] / train_data.shape[0]

In [None]:
Nspam = train_data.loc[train_data['Label'] == 'spam', 'SMS'].apply(len).sum()

In [None]:
Nham = train_data.loc[train_data['Label'] == 'ham', 'SMS'].apply(len).sum()

In [None]:
Nspam

10915

In [None]:
Nham

41913

#Predicting

In [None]:
def p_w_spam(word):
    if word in train_data.columns:
        return (train_data.loc[train_data['Label'] == 'spam', word].sum() + alpha) / (Nspam + alpha*Nvoc)
    else:
        return 1

In [None]:
def p_w_ham(word):
    if word in train_data.columns:
        return (train_data.loc[train_data['Label'] == 'ham', word].sum() + alpha) / (Nham + alpha*Nvoc)
    else:
        return 1

In [None]:
def classify(message):
    p_spam_given_message = Pspam
    p_ham_given_message = Pham
    for word in message:
        p_spam_given_message *= p_w_spam(word)
        p_ham_given_message *= p_w_ham(word)
    if p_ham_given_message > p_spam_given_message:
        return 'ham'
    elif p_ham_given_message < p_spam_given_message:
        return 'spam'
    else:
        return 'needs human classification'

In [None]:
classify('secret')

'ham'

In [None]:
classify(['secret', 'source', 'of', 'infinite', 'power'])

'ham'

In [None]:
classify(['free', 'win', 'welcom'])

'spam'

##My Predicting

In [None]:
classify(['lottery', 'reward'])

'spam'

In [None]:
classify(['final', 'moment', 'change', 'city', 'bid'])

'spam'

In [None]:
classify('good')

'ham'

In [None]:
classify(['zero', 'boy', 'fantasy', 'stage'])

'ham'

In [None]:
classify(['deal', 'decide'])

'ham'

In [None]:
classify(['prostitution', 'notice', 'big', 'teenage'])

'ham'

#Test data

In [None]:
test_data['predicted'] = test_data['SMS'].apply(classify)

In [None]:
test_data.head()

Unnamed: 0,Label,SMS,predicted
0,ham,"[aight, should, i, just, plan, to, come, up, l...",ham
1,ham,"[die, i, accidentally, deleted, e, msg, i, sup...",ham
2,spam,"[welcome, to, uk, mobile, date, this, msg, is,...",spam
3,ham,"[this, is, wishing, you, a, great, day, moji, ...",ham
4,ham,"[thanks, again, for, your, reply, today, when,...",ham


In [None]:
correct = (test_data['predicted'] == test_data['Label']).sum() / test_data.shape[0] * 100

In [None]:
test_data.loc[test_data['predicted'] != test_data['Label']]

Unnamed: 0,Label,SMS,predicted
56,spam,"[money, i, have, won, wining, number, 946, wot...",ham
99,ham,"[gettin, rdy, to, ship, comp]",spam
142,ham,"[have, you, laid, your, airtel, line, to, rest]",spam
218,spam,"[hi, babe, its, chloe, how, r, u, i, was, smas...",ham
245,ham,[anytime],spam
404,ham,"[nokia, phone, is, lovly]",spam
491,spam,"[hi, this, is, amy, we, will, be, sending, you...",ham
588,ham,"[we, have, sent, jd, for, customer, service, c...",spam
646,ham,"[a, boy, loved, a, gal, he, propsd, bt, she, d...",needs human classification
912,spam,"[dating, i, have, had, two, of, these, only, s...",ham


In [None]:
correct

99.10233393177738

#Assignment 1

\begin{array}{|c|c|} \hline
 & Spam && No-Spam \\ \hline
Total & 25 && 75\\
Buy & 20 & 4/5 & 5 & 1/15 \\
Cheap & 15 & 3/5 & 10 & 2/15\\
Work & 5 & 1/5 & 30 & 6/15\\
Free & 20 & 4/5 & 7 & 7/75 \\
Buy, Cheap, Work, Free & 48/25 & 48/625 & 28/1125 & 84/253125\\
\hline
\end{array}

$${\frac{\frac{48}{25}}{\frac{48}{25} + \frac{28}{1125}} =  \frac{540}{547} = 0.987 percent}$$

#Assignment 2

\begin{array}{|c|c|} \hline
 & Spam && No-Spam \\ \hline
Total & 25 && 75\\
Buy & 20 & 4/5 & 5 & 1/15 \\
Cheap & 15 & 3/5 & 10 & 2/15\\
Work & 5 & 1/5 & 30 & 6/15\\
Free & 20 & 4/5 & 7 & 7/75 \\
Will & 4 & 4/25 & 40 & 8/15 \\
Buy, Cheap, Work, Free, Will & 192/625 & 192/15625 & 224/16875 & 224/1265625\\
\hline
\end{array}

$${\frac{\frac{192}{625}}{\frac{192}{625} + \frac{224}{16875}} =  \frac{162}{169} = 0.959 percent}$$