## Analyzing SMS Messages to Create a Spam Classification System



In [1]:
import pandas as pd

# Load the tab-separated SMS corpus. The file has no header row, so the two
# columns are named here.
sms = pd.read_csv(
    "SMSSpamCollection",
    sep="\t",
    header=None,
    names=["Label", "SMS"],
)


In [2]:
# Dimensions of the raw dataset as (rows, columns).
sms.shape

(5572, 2)

In [3]:
# Total number of messages; used as the denominator for the label percentages.
count = len(sms)

In [4]:
# Preview the first five labelled messages.
sms.head(5)

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
def _percent_of_total(n_label):
    """Format a label count as a percentage of all messages, e.g. '86.59%'."""
    share = round(n_label / count * 100, 2)
    return format(share, '.2f') + '%'


# Share of ham vs. spam in the full dataset.
sms['Label'].value_counts().apply(_percent_of_total)

ham     86.59%
spam    13.41%
Name: Label, dtype: object

In [6]:
# Shuffle all rows (frac=1 keeps every row); the fixed random_state makes the
# shuffle, and therefore the train/test split below, reproducible.
dataset = sms.sample(frac=1, random_state=1)

In [7]:
# Expected split sizes, derived from the shuffled dataset itself instead of a
# hard-coded row count, so the figures stay correct if the input file changes.
n_rows = len(dataset)
print(round(.8 * n_rows), "80% Training")
print(round(.2 * n_rows), "20% Testing")

4458 80% Training
1114 20% Testing


In [8]:
# Split point: the first 80% of the shuffled rows are used for training and
# the remainder for testing. Computed from the data rather than hard-coded.
n_training = round(0.8 * len(dataset))

# .copy() gives each split its own data, so the column assignments made later
# on these frames do not raise pandas' SettingWithCopyWarning.
training_set = dataset.iloc[:n_training, :].copy()
test_set = dataset.iloc[n_training:, :].copy()

In [9]:
# Label distribution inside the training split. The denominator is the split's
# actual size, so the percentages stay correct if the split changes.
training_set['Label'].value_counts().apply(
    lambda x: "{:.2f}%".format(round(x / len(training_set) * 100, 2))
)

ham     86.54%
spam    13.46%
Name: Label, dtype: object

In [10]:
# Label distribution inside the test split. The denominator is the split's
# actual size, so the percentages stay correct if the split changes.
test_set['Label'].value_counts().apply(
    lambda x: "{:.2f}%".format(round(x / len(test_set) * 100, 2))
)

ham     86.80%
spam    13.20%
Name: Label, dtype: object

In [11]:
# Renumber the rows 0..n-1; the previous row labels are kept in a new 'index'
# column. Rebinding the name (instead of inplace=True) avoids mutating a slice
# that was taken from `dataset`.
training_set = training_set.reset_index()

In [12]:
# Renumber the rows 0..n-1; the previous row labels are kept in a new 'index'
# column. Rebinding the name (instead of inplace=True) avoids mutating a slice
# that was taken from `dataset`.
test_set = test_set.reset_index()

In [13]:
# Normalise the training messages: every non-word character becomes a space,
# then the text is lower-cased.
# regex=True is passed explicitly because pandas 2.0 changed the default of
# Series.str.replace to a literal match, which would leave punctuation in place.
training_set['SMS'] = (
    training_set['SMS']
    .str.replace(r'\W', ' ', regex=True)
    .str.lower()
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [14]:
# Preview the cleaned training messages.
training_set.head()

Unnamed: 0,index,Label,SMS
0,1078,ham,yep by the pretty sculpture
1,4028,ham,yes princess are you going to make me moan
2,958,ham,welp apparently he retired
3,4642,ham,havent
4,4674,ham,i forgot 2 ask ü all smth there s a card on ...


In [15]:
# Accumulator for every word seen in the training messages (filled in a later cell).
vocabulary = []

In [16]:
# Tokenise: each message string becomes a list of whitespace-separated words.
# NOTE(review): this overwrites the 'SMS' column, so re-running the cleaning
# cell above after this one will fail on the list values.
training_set["SMS"] = training_set["SMS"].str.split()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [17]:
# Gather every token of every training message (duplicates included).
for tokens in training_set["SMS"]:
    vocabulary.extend(tokens)

In [18]:
# Drop duplicate words by converting the collected tokens to a set.
vocabulary = set(vocabulary)

In [19]:
# Back to a list so the unique words can be iterated in a fixed order below.
vocabulary = list(vocabulary)

In [20]:
# One zero-filled counter list per vocabulary word, with one slot per training
# message. Each word gets its own list object.
n_training_messages = len(training_set['SMS'])
word_counts_per_sms = {
    unique_word: [0] * n_training_messages
    for unique_word in vocabulary
}

In [21]:
# Count how often each word occurs in each training message.
# The loop variable is named `tokens`, not `sms`, so that the `sms` DataFrame
# loaded at the top of the notebook is not overwritten by the last message.
# NOTE: the counters are incremented in place, so re-running this cell without
# re-creating `word_counts_per_sms` first doubles the counts.
for index, tokens in enumerate(training_set['SMS']):
    for word in tokens:
        word_counts_per_sms[word][index] += 1

In [22]:
# Word-count table: one column per vocabulary word, one row per training message.
word_counts_df = pd.DataFrame(word_counts_per_sms)

In [23]:
# (training messages, vocabulary size)
word_counts_df.shape

(4458, 7783)

In [24]:
# Attach the word-count columns to the training rows (column-wise concat,
# aligned on the row index of both frames).
new_df = pd.concat([training_set, word_counts_df],axis=1)

In [31]:
# Prior P(Spam): fraction of training messages labelled spam.
is_spam_row = new_df['Label'] == 'spam'
p_spam = len(new_df[is_spam_row]) / len(new_df)
p_spam

0.13458950201884254

In [33]:
# Prior P(Ham): fraction of training messages labelled ham.
is_ham_row = new_df['Label'] == 'ham'
p_ham = len(new_df[is_ham_row]) / len(new_df)
p_ham

0.8654104979811574

In [39]:
# N_spam: total number of words across all spam training messages.
spam_token_lists = new_df.loc[new_df['Label'] == 'spam', 'SMS']
n_spam = sum(len(tokens) for tokens in spam_token_lists)
print(n_spam)

15190


In [40]:
# N_ham: total number of words across all ham training messages.
ham_token_lists = new_df.loc[new_df['Label'] == 'ham', 'SMS']
n_ham = sum(len(tokens) for tokens in ham_token_lists)
print(n_ham)

57237


In [41]:
# Number of unique words in the training vocabulary.
n_vocabulary = len(vocabulary)

In [42]:
# Additive smoothing constant used in the per-word probability formulas below.
alpha = 1

In [43]:
# Display the vocabulary size.
n_vocabulary

7783

In [44]:
# Per-word conditional probabilities P(word|Spam) and P(word|Ham), initialised
# to 0 for every vocabulary word and filled in below.
p_w_spam = dict.fromkeys(vocabulary, 0)
p_w_ham = dict.fromkeys(vocabulary, 0)

In [45]:
# Independent copy of the spam rows of the training table.
spam_rows = new_df['Label'] == 'spam'
spam_set = new_df.loc[spam_rows].copy()

In [47]:
# (spam messages, columns)
spam_set.shape

(600, 7786)

In [48]:
# Independent copy of the ham rows of the training table.
ham_rows = new_df['Label'] == 'ham'
ham_set = new_df.loc[ham_rows].copy()

In [49]:
# (ham messages, columns)
ham_set.shape

(3858, 7786)

In [51]:
# Smoothed per-word likelihoods:
#   P(word|class) = (count of word in class + alpha) / (N_class + alpha * N_vocabulary)
# The denominators do not depend on the word, so they are computed once.
spam_denominator = n_spam + alpha * n_vocabulary
ham_denominator = n_ham + alpha * n_vocabulary

for word in vocabulary:
    spam_occurrences = spam_set[word].sum()
    ham_occurrences = ham_set[word].sum()
    p_w_spam[word] = (spam_occurrences + alpha) / spam_denominator
    p_w_ham[word] = (ham_occurrences + alpha) / ham_denominator


In [56]:
import re

def classify(message, prior_spam=None, prior_ham=None,
             word_probs_spam=None, word_probs_ham=None):
    """Print the naive Bayes scores of one message and the resulting label.

    The message is cleaned the same way as the training data (non-word
    characters replaced by spaces, lower-cased, split on whitespace). Words
    missing from the probability tables are ignored.

    Parameters
    ----------
    message : str
        Raw message text.
    prior_spam, prior_ham : float, optional
        Class priors. Default to the notebook globals `p_spam` / `p_ham`.
    word_probs_spam, word_probs_ham : dict, optional
        Mapping word -> P(word|class). Default to the notebook globals
        `p_w_spam` / `p_w_ham`.

    Returns
    -------
    None
        The two scores and the label are printed, not returned.
    """
    import math  # local import: `math` is not imported elsewhere in the notebook

    # Fall back to the model fitted earlier in the notebook.
    if prior_spam is None:
        prior_spam = p_spam
    if prior_ham is None:
        prior_ham = p_ham
    if word_probs_spam is None:
        word_probs_spam = p_w_spam
    if word_probs_ham is None:
        word_probs_ham = p_w_ham

    def _log(probability):
        # log(0) is undefined; treat a zero probability as an impossible class.
        return math.log(probability) if probability > 0 else float('-inf')

    # Raw string: '\W' in a plain string literal is an invalid escape sequence.
    words = re.sub(r'\W', ' ', message).lower().split()

    # The plain products are kept for display only. For long messages they
    # underflow to 0.0, so the label is decided from the log-space sums.
    p_spam_given_message = prior_spam
    p_ham_given_message = prior_ham
    log_spam_score = _log(prior_spam)
    log_ham_score = _log(prior_ham)

    for word in words:
        if word in word_probs_spam:
            p_spam_given_message *= word_probs_spam[word]
            log_spam_score += _log(word_probs_spam[word])
        if word in word_probs_ham:
            p_ham_given_message *= word_probs_ham[word]
            log_ham_score += _log(word_probs_ham[word])

    print('P(Spam|message):', p_spam_given_message)
    print('P(Ham|message):', p_ham_given_message)

    if log_ham_score > log_spam_score:
        print('Label: Ham')
    elif log_ham_score < log_spam_score:
        print('Label: Spam')
    else:
        print('Equal proabilities, have a human classify this!')

In [57]:
# Sanity check on a hand-written promotional-style message.
classify("WINNER!! This is the secret code to unlock the money: C3421.")

P(Spam|message): 1.3481290211300841e-25
P(Ham|message): 1.9368049028589875e-27
Label: Spam


In [58]:
# Sanity check on a hand-written conversational message.
classify("Sounds good, Tom, then see u there")

P(Spam|message): 2.4372375665888117e-25
P(Ham|message): 3.687530435009238e-21
Label: Ham


In [63]:
def classify_test_set(message, prior_spam=None, prior_ham=None,
                      word_probs_spam=None, word_probs_ham=None):
    """Return the naive Bayes label of one message.

    The message is cleaned the same way as the training data (non-word
    characters replaced by spaces, lower-cased, split on whitespace). Words
    missing from the probability tables are ignored.

    Parameters
    ----------
    message : str
        Raw message text.
    prior_spam, prior_ham : float, optional
        Class priors. Default to the notebook globals `p_spam` / `p_ham`.
    word_probs_spam, word_probs_ham : dict, optional
        Mapping word -> P(word|class). Default to the notebook globals
        `p_w_spam` / `p_w_ham`.

    Returns
    -------
    str
        'ham', 'spam', or 'needs human classification' when both classes
        score exactly the same.
    """
    import math  # local import: `math` is not imported elsewhere in the notebook

    # Fall back to the model fitted earlier in the notebook.
    if prior_spam is None:
        prior_spam = p_spam
    if prior_ham is None:
        prior_ham = p_ham
    if word_probs_spam is None:
        word_probs_spam = p_w_spam
    if word_probs_ham is None:
        word_probs_ham = p_w_ham

    def _log(probability):
        # log(0) is undefined; treat a zero probability as an impossible class.
        return math.log(probability) if probability > 0 else float('-inf')

    # Raw string: '\W' in a plain string literal is an invalid escape sequence.
    words = re.sub(r'\W', ' ', message).lower().split()

    # Sum log-probabilities instead of multiplying probabilities. The product
    # of many small factors underflows to 0.0 for long messages, which would
    # make both classes compare equal and force a false tie.
    log_spam_score = _log(prior_spam)
    log_ham_score = _log(prior_ham)

    for word in words:
        if word in word_probs_spam:
            log_spam_score += _log(word_probs_spam[word])
        if word in word_probs_ham:
            log_ham_score += _log(word_probs_ham[word])

    if log_ham_score > log_spam_score:
        return 'ham'
    if log_spam_score > log_ham_score:
        return 'spam'
    return 'needs human classification'

In [64]:
# Label every test message with the classifier, then preview the predictions
# next to the true labels.
predicted_labels = test_set['SMS'].map(classify_test_set)
test_set['predicted'] = predicted_labels
test_set.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Unnamed: 0,index,Label,SMS,predicted
0,2131,ham,Later i guess. I needa do mcat study too.,ham
1,3418,ham,But i haf enuff space got like 4 mb...,ham
2,3424,spam,Had your mobile 10 mths? Update to latest Oran...,spam
3,1538,ham,All sounds good. Fingers . Makes it difficult ...,ham
4,5393,ham,"All done, all handed in. Don't know if mega sh...",ham


In [71]:
# Accuracy: share of test messages whose predicted label equals the true label.
total = len(test_set)
is_correct = test_set['Label'] == test_set['predicted']
correct = int(is_correct.sum())  # int() keeps `correct` a plain Python integer
print('Accuracy: {}%'.format(round(correct/total*100,2)))

Accuracy: 98.74%
