# Classification of messages as spam or not spam using Naive Bayes algorithm

In [15]:
import pandas as pd
import numpy as np

# Import Dataset - upload the SMS text file to the content folder on the left panel before running
df = pd.read_table('SMS', sep='\t', header=None, names=['label', 'sms_message'])
df

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [16]:
# map the 'ham' value to 0 and the 'spam' value to 1.
df['label_binary'] = df.label.map({'ham':0,'spam':1})
df.head()

Unnamed: 0,label,sms_message,label_binary
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [17]:
# Get stats
df['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [18]:
#  data cleaning
df['sms_message'] = df['sms_message'].str.replace(r'[\W_]+', ' ', regex=True).str.strip() # Removes punctuation and leading/trailing spaces
df['sms_message'] = df['sms_message'].str.lower() ### making all the words lowercase
df.head(10)

Unnamed: 0,label,sms_message,label_binary
0,ham,go until jurong point crazy available only in ...,0
1,ham,ok lar joking wif u oni,0
2,spam,free entry in 2 a wkly comp to win fa cup fina...,1
3,ham,u dun say so early hor u c already then say,0
4,ham,nah i don t think he goes to usf he lives arou...,0
5,spam,freemsg hey there darling it s been 3 week s n...,1
6,ham,even my brother is not like to speak with me t...,0
7,ham,as per your request melle melle oru minnaminun...,0
8,spam,winner as a valued network customer you have b...,1
9,spam,had your mobile 11 months or more u r entitled...,1


In [19]:
# Randomly shuffle the records in the dataset to avoid bias
df = df.sample(frac=1, random_state=1)
df.head(10)

Unnamed: 0,label,sms_message,label_binary
1078,ham,yep by the pretty sculpture,0
4028,ham,yes princess are you going to make me moan,0
958,ham,welp apparently he retired,0
4642,ham,havent,0
4674,ham,i forgot 2 ask ü all smth there s a card on da...,0
5461,ham,ok i thk i got it then u wan me 2 come now or wat,0
4210,ham,i want kfc its tuesday only buy 2 meals only 2...,0
4216,ham,no dear i was sleeping p,0
1603,ham,ok pa nothing problem,0
1504,ham,ill be there on lt gt ok,0


In [20]:
# Split into training and test sets
training_test_index = round(len(df) * 0.8)

training = df[:training_test_index].reset_index(drop=True)
test = df[training_test_index:].reset_index(drop=True)

print('-- Training set stats --')
print(training.shape)
print(training['label_binary'].value_counts())
print('-- Test set stats --')
print(test.shape)
print(test['label_binary'].value_counts())

-- Training set stats --
(4458, 3)
0    3858
1     600
Name: label_binary, dtype: int64
-- Test set stats --
(1114, 3)
0    967
1    147
Name: label_binary, dtype: int64


In [21]:
### creating vocabulary from training data
training['sms_message'] = training['sms_message'].str.split()
vocabulary = []
for sms in training['sms_message']:
   for word in sms:
      vocabulary.append(word)
vocabulary = list(set(vocabulary))  ### only count the number of unique words
print(len(vocabulary))
vocabulary[0:9]

7780


['ls1',
 'art',
 'texted',
 'fresh',
 'patent',
 'compulsory',
 'somtimes',
 'ldew',
 'shagged']

In [22]:
word_counts_per_sms = {unique_word: [0] * len(training['sms_message']) for unique_word in vocabulary}

for index, sms in enumerate(training['sms_message']):
   for word in sms:
      word_counts_per_sms[word][index] += 1
word_counts = pd.DataFrame(word_counts_per_sms)
word_counts

Unnamed: 0,ls1,art,texted,fresh,patent,compulsory,somtimes,ldew,shagged,serving,...,deleted,portions,red,wkent,9758,categories,parco,zhong,6th,violence
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4453,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4454,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4455,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4456,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
training_new = pd.concat([training, word_counts], axis=1)
training_new.head()

Unnamed: 0,label,sms_message,label_binary,ls1,art,texted,fresh,patent,compulsory,somtimes,...,deleted,portions,red,wkent,9758,categories,parco,zhong,6th,violence
0,ham,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# Run a baseline model evaluation
# Set all 'predicted to 0 or 1 randomly to get a baseline (coin-flip)
test['predicted'] = np.random.randint(0, 2, size=len(test))
test['predicted'].value_counts()

1    570
0    544
Name: predicted, dtype: int64

In [25]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('Accuracy score: {}'.format(accuracy_score(test['label_binary'], test['predicted'])))
print('Precision score: {}'.format(precision_score(test['label_binary'], test['predicted'])))
print('Recall score: {}'.format(recall_score(test['label_binary'], test['predicted'])))
print('F1 score: {}'.format(f1_score(test['label_binary'], test['predicted'])))

Accuracy score: 0.5053859964093357
Precision score: 0.1456140350877193
Recall score: 0.564625850340136
F1 score: 0.23152022315202234


## **Your implementation starts here**.  Make sure your prediction result is saved into the column `test['predicted']` for the evaludation to run automatically.  
**50 points** for successful execution of your code and producing the confusion matrix correctly

In [26]:
# Laplace smoothing
alpha = 1

In [50]:
# Step 1: you need to caculate P(Spam) and P(Ham)
pSpam = training_new['label_binary'].mean()
pHam = 1 - pSpam

# Step 2: you need to count N_Spam, N_Ham
N_Spam = training_new['label_binary'].sum()
N_Ham = len(training_new) - N_Spam

# Step 3: you need to count the number of times the word w occurs in spam/ham message: N_w_spam, N_w_ham
N_w_spam = training_new.loc[training_new['label_binary'] == 1, vocabulary].sum()
N_w_ham = training_new.loc[training_new['label_binary'] == 0, vocabulary].sum()

# Step 4: then you can calculate the prob of occurance of each word:
#         p(w|spam)=(N_w_spam+alpha)/(N_Spam+alpha*N_Vocabulary)
#         p(w|Ham)=(N_w_ham+alpha)/(N_Ham+alpha*N_Vocabulary)
N_Vocbulary = len(vocabulary)
p_wSpam = (N_w_spam + alpha) / (N_Spam + alpha * N_Vocbulary)
p_wHam = (N_w_ham + alpha) / (N_Ham + alpha * N_Vocbulary)

# Step 5: Now perform the prediction on the test dataset messages using the Naiive Bayes method. Store your prediction results (1=spam or 0=ham ) to test['predicted']
def naiive_bayes(message):
    words = message.split()
    pSpam_curr = np.log(pSpam)
    pHam_curr = np.log(pHam)

    for word in words:
        if word in vocabulary:
            pHam_curr += np.log(p_wHam[word])
            pSpam_curr += np.log(p_wSpam[word])

    if pHam_curr > pSpam_curr:
        return 0 
    else:
        return 1
    
test['predicted'] = test['sms_message'].apply(naiive_bayes)

# Step 6: Summarize the results in a confusion matrix and print out the four values of the confusion matrix
#         Verify that your printout is consistent with the output from test['label_binary'].value_counts() and test['predicted'].value_counts()

confusionMatrix = pd.crosstab(test['label_binary'], test['predicted'], rownames=['Actual'], colnames=['Predicted'])
print(confusionMatrix)
print("\n")

TN = confusionMatrix.loc[0, 0]
FP = confusionMatrix.loc[0, 1]
FN = confusionMatrix.loc[1, 0]
TP = confusionMatrix.loc[1, 1]

print(f'True Negatve: {TN}')
print(f'False Positive: {FP}')
print(f'False Negatve: {FN}')
print(f'True Positive: {TP}')

Predicted    0    1
Actual             
0          966    1
1           40  107


True Negatve: 966
False Positive: 1
False Negatve: 40
True Positive: 107


**Evaluate your implementation** for accuracy, precision, recall and F1_score.  The performance points of your implementation will be calculated automatically.  However, it is only awarded if the predictions are made by a Naive Bayes implementation.

**30 points** for how well your implementation predicts spam.  A correct implementation should achieve an F1 score above 0.90.  
## **DO NOT modify this cell below.**

In [51]:
# Model Evaluation
print('Accuracy score: {}'.format(accuracy_score(test['label_binary'], test['predicted'])))
print('Precision score: {}'.format(precision_score(test['label_binary'], test['predicted'])))
print('Recall score: {}'.format(recall_score(test['label_binary'], test['predicted'])))
my_f1_score = f1_score(test['label_binary'], test['predicted'])
print('F1 score: {}'.format(my_f1_score))
performance_point = round(np.clip((my_f1_score - 0.20) / (0.9-0.20) * 30, 0, 30))
print('Your perforamnce point: {}'.format(performance_point))

Accuracy score: 0.9631956912028725
Precision score: 0.9907407407407407
Recall score: 0.7278911564625851
F1 score: 0.8392156862745099
Your perforamnce point: 27


**Analyze your implementation of the Naive Bayes algorithm:** select an entry from each quadrant of the confusion matrix and show the details of the prediction, i.e., the probability of being a spam or a ham, and all the contributing probabilities.  Discuss why mis-classification ocurrs for the FP and FN examples.

**20 points** for a correct and clear presentation.

In [39]:
TN_example = test[(test['label_binary'] == 0) & (test['predicted'] == 0)].iloc[0]
print("True Negative Example:")
print(TN_example['sms_message'])
print("Prediction:", TN_example['predicted'])
print("Actual:", TN_example['label_binary'])
print("\n")


FP_example = test[(test['label_binary'] == 0) & (test['predicted'] == 1)].iloc[0]
print("False Positive Example:")
print(FP_example['sms_message'])
print("Prediction:", FP_example['predicted'])
print("Actual:", FP_example['label_binary'])
print("\n")

FN_example = test[(test['label_binary'] == 1) & (test['predicted'] == 0)].iloc[0]
print("False Negative Example:")
print(FN_example['sms_message'])
print("Prediction:", FN_example['predicted'])
print("Actual:", FN_example['label_binary'])
print("\n")

TP_example = test[(test['label_binary'] == 1) & (test['predicted'] == 1)].iloc[0]
print("True Positive Example:")
print(TP_example['sms_message'])
print("Prediction:", TP_example['predicted'])
print("Actual:", TP_example['label_binary'])
print("\n")

True Negative Example:
later i guess i needa do mcat study too
Prediction: 0
Actual: 0


False Positive Example:
but i haf enuff space got like 4 mb
Prediction: 1
Actual: 0


False Negative Example:
urgent you have won a 1 week free membership in our 100 000 prize jackpot txt the word claim to no 81010 t c www dbuk net lccltd pobox 4403ldnw1a7rw18
Prediction: 0
Actual: 1


True Positive Example:
had your mobile 10 mths update to latest orange camera video phones for free save s with free texts weekend calls text yes for a callback orno to opt out
Prediction: 1
Actual: 1




**Your discussion goes here --** Since some words are more common in spam than "ham" messages, if there is a ham message taht has too many of those words, it will be more likely to flag as a spam message (False Positive). The opposite is true too, where if a message is more common in ham, spam might take advantage of those and use more of those words so that it shows up as ham (False Negative).