# Classification of messages as spam or not spam using Naive Bayes algorithm 

In [1]:
import pandas as pd

In [2]:
# Import Dataset
df = pd.read_table('SMS', sep='\t', header=None,names=['label', 'sms_message'])
df

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
# map the 'ham' value to 0 and the 'spam' value to 1.
df['label'] = df.label.map({'ham':0,'spam':1})

In [4]:
# Randomize the dataset
df = df.sample(frac=1, random_state=1)
df

Unnamed: 0,label,sms_message
1078,0,"Yep, by the pretty sculpture"
4028,0,"Yes, princess. Are you going to make me moan?"
958,0,Welp apparently he retired
4642,0,Havent.
4674,0,I forgot 2 ask ü all smth.. There's a card on ...
...,...,...
905,0,"We're all getting worried over here, derek and..."
5192,0,Oh oh... Den muz change plan liao... Go back h...
3980,0,CERI U REBEL! SWEET DREAMZ ME LITTLE BUDDY!! C...
235,1,Text & meet someone sexy today. U can find a d...


In [5]:
# Split into training and test sets
training_test_index = round(len(df) * 0.8)

training = df[:training_test_index].reset_index(drop=True)
test = df[training_test_index:].reset_index(drop=True)

print(training.shape)
print(test.shape)

(4458, 2)
(1114, 2)


In [6]:
#  data cleaning 
training['sms_message'] = training['sms_message'].str.replace('\W', ' ') # Removes punctuation
training['sms_message'] = training['sms_message'].str.lower() ### making all the words lowercase

test['sms_message'] = test['sms_message'].str.replace('\W', ' ') # Removes punctuation
test['sms_message'] = test['sms_message'].str.lower() ### making all the words lowercase
training

  training['sms_message'] = training['sms_message'].str.replace('\W', ' ') # Removes punctuation
  test['sms_message'] = test['sms_message'].str.replace('\W', ' ') # Removes punctuation


Unnamed: 0,label,sms_message
0,0,yep by the pretty sculpture
1,0,yes princess are you going to make me moan
2,0,welp apparently he retired
3,0,havent
4,0,i forgot 2 ask ü all smth there s a card on ...
...,...,...
4453,0,sorry i ll call later in meeting any thing re...
4454,0,babe i fucking love you too you know fuck...
4455,1,u ve been selected to stay in 1 of 250 top bri...
4456,0,hello my boytoy geeee i miss you already a...


In [7]:
### creating vocabulary from training data
training['sms_message'] = training['sms_message'].str.split()
vocabulary = []
for sms in training['sms_message']:
   for word in sms:
      vocabulary.append(word)
vocabulary = list(set(vocabulary))  ### only count the number of unique words
print(len(vocabulary))

7783


In [8]:
word_counts_per_sms = {unique_word: [0] * len(training['sms_message']) for unique_word in vocabulary}

for index, sms in enumerate(training['sms_message']):
   for word in sms:
      word_counts_per_sms[word][index] += 1
word_counts = pd.DataFrame(word_counts_per_sms)
word_counts

Unnamed: 0,women,ditto,maybe,southern,tobacco,mel,evry1,casualty,pshew,offered,...,en,kfc,played,simonwatson5120,med,84025,excellent,promise,breathe1,reaching
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4453,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4454,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4455,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4456,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
training_new = pd.concat([training, word_counts], axis=1)
training_new.head()

Unnamed: 0,label,sms_message,women,ditto,maybe,southern,tobacco,mel,evry1,casualty,...,en,kfc,played,simonwatson5120,med,84025,excellent,promise,breathe1,reaching
0,0,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Laplace smoothing
alpha = 1

# You will start from here.

In [61]:
# Hints:
# Step 1: you need to caculate P(Spam) and P(Ham)
# Step 2: you need to count N_Spam, N_Ham
# Step 3: you need to count the number of times the word w occurs in spam/ham message: N_w_spam
# Step 4: then you can calculate the prob of occurance of each word: p(w|spam)=(N_w_spam+alpha)/(N_Spam+alpha*N_Vocabulary)
#         p(w|Ham)=(N_w_ham+alpha)/(N_Ham+alpha*N_Vocabulary)
# Step 5: Now do the prediction if it is spam or ham for each sentence, then store your prediction results (1 or 0 ) to test['predicted']

# Notes: for the symbols listed above, please check the last page of lecture. 
#        

# Calculate accuracy, precision, recall and F1_score. 

In [None]:
# Model Evaluation 
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

print('Accuracy score: {}'.format(accuracy_score(test['label'], test['predicted'])))
print('Precision score: {}'.format(precision_score(test['label'], test['predicted'])))
print('Recall score: {}'.format(recall_score(test['label'], test['predicted'])))
print('F1 score: {}'.format(f1_score(test['label'], test['predicted'])))

### Now  use function MultinomialNB (from sklearn.naive_bayes import MultinomialNB) to validate your results, check the accuracy