# Email Spam 

In [None]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib as mpl

In [12]:
# Dataset from - https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection
df = pd.read_table('SMSSpamCollection',
                   sep='\t', 
                   header=None, 
                   names=['label', 'sms_message'])

# Output printing out first 5 columns
df.head()

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [13]:
df['label'] = df.label.map({'ham':0, 'spam':1})
print(df.shape)
df.head(10) # returns (rows, columns)

(5572, 2)


Unnamed: 0,label,sms_message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
5,1,FreeMsg Hey there darling it's been 3 week's n...
6,0,Even my brother is not like to speak with me. ...
7,0,As per your request 'Melle Melle (Oru Minnamin...
8,1,WINNER!! As a valued network customer you have...
9,1,Had your mobile 11 months or more? U R entitle...


In [2]:
documents = ['Hello, how are you!',
             'Win money, win from home.',
             'Call me now.',
             'Hello, Call hello you tomorrow?']

# Convert all strings to their lower case form.
lower_case_documents = []
for i in documents:
    lower_case_documents.append(i.lower())
print(lower_case_documents)

['hello, how are you!', 'win money, win from home.', 'call me now.', 'hello, call hello you tomorrow?']


In [3]:
# Removing all punctuations
sans_punctuation_documents = []
import string

for i in lower_case_documents:
    sans_punctuation_documents.append(i.translate(str.maketrans('', '', string.punctuation)))
print(sans_punctuation_documents)

AttributeError: type object 'str' has no attribute 'maketrans'

In [8]:
# Tokenization
preprocessed_documents = []
for i in sans_punctuation_documents:
    preprocessed_documents.append(i.split(' '))
print(preprocessed_documents)

[]


In [7]:
# Count frequencies
frequency_list = []
import pprint
from collections import Counter

for i in preprocessed_documents:
    frequency_counts = Counter(i)
    #frequency_counts = Counter(i)
    frequency_list.append(frequency_counts)
    #frequency_list.append(frequency_counts)
pprint.pprint(frequency_list)
#pprint.pprint(frequency_list)

[Counter({'are': 1, 'hello': 1, 'how': 1, 'you': 1}),
 Counter({'win': 2, 'from': 1, 'money': 1, 'home': 1}),
 Counter({'me': 1, 'call': 1, 'now': 1}),
 Counter({'hello': 2, 'you': 1, 'call': 1, 'tomorrow': 1})]


# Implementing Bag of Words in scikit-learn

In [18]:
# call CountVectorizer()
from sklearn.feature_extraction.text import CountVectorizer
    #from sklearn.feature_extraction.text import CountVetorizer
count_vector = CountVectorizer()
    #count_vector = CountVectorizer()
print(count_vector)

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)


In [19]:
# apply count_vector to documents
count_vector.fit(documents)
count_vector.get_feature_names()

[u'are',
 u'call',
 u'from',
 u'hello',
 u'home',
 u'how',
 u'me',
 u'money',
 u'now',
 u'tomorrow',
 u'win',
 u'you']

In [20]:
doc_array = count_vector.transform(documents).toarray()
    # doc_array = count_vector.transform(documents).toarray()
doc_array

array([[1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 2, 0],
       [0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0],
       [0, 1, 0, 2, 0, 0, 0, 0, 0, 1, 0, 1]], dtype=int64)

In [21]:
frequency_matrix = pd.DataFrame(doc_array, columns = count_vector.get_feature_names())
    # freq_mx = pd.DataFrame(doc_array), columns = count_vector
frequency_matrix

Unnamed: 0,are,call,from,hello,home,how,me,money,now,tomorrow,win,you
0,1,0,0,1,0,1,0,0,0,0,0,1
1,0,0,1,0,1,0,0,1,0,0,2,0
2,0,1,0,0,0,0,1,0,1,0,0,0
3,0,1,0,2,0,0,0,0,0,1,0,1


# split into training and testing sets

In [44]:
# USE from sklearn.model_selection import train_test_split to avoid seeing deprecation warning.
    # X_train is our training data for the 'sms_message' column.
    # y_train is our training data for the 'label' column
    # X_test is our testing data for the 'sms_message' column.
    # y_test is our testing data for the 'label' column Print out the number of rows we have in each our training and testing data.
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['sms_message'], 
                                                    df['label'], 
                                                    random_state=1)

print('Number of rows in the total set: {}'.format(df.shape[0]))
    # print('xxx':{}.format(df.shape[0]))
print('training rows: {}'.format(X_train.shape[0]))
print('testing rows: {}'.format(X_test.shape[0]))
print('training rows: {}'.format(y_train.shape[0]))
print('testing rows: {}'.format(y_test.shape[0]))

Number of rows in the total set: 5572
training rows: 4179
testing rows: 1393
training rows: 4179
testing rows: 1393


# Bayes Theorem

In [26]:
# P(D)
p_diabetes = 0.01

# P(~D)
p_no_diabetes = 0.99

# Sensitivity or P(Pos|D)
p_pos_diabetes = 0.9

# Specificity or P(Neg/~D)
p_neg_no_diabetes = 0.9

# P(Pos)
p_pos = (p_diabetes * p_pos_diabetes) + (p_no_diabetes * (1 - p_neg_no_diabetes))
print(format(p_pos))

0.108


In [27]:
# P(D|Pos)
p_diabetes_pos = (p_diabetes * p_pos_diabetes) / p_pos
print(format(p_diabetes_pos))

0.0833333333333


In [28]:
# P(Pos/~D)
p_pos_no_diabetes = 0.1

# P(~D|Pos)
p_no_diabetes_pos = (p_no_diabetes * p_pos_no_diabetes) / p_pos
print(format(p_no_diabetes_pos))

0.916666666667


# Naive Bayes
    ### P(J|F,I): Probability of Jill Stein saying the words Freedom and Immigration.
    ### Using the formula and our knowledge of Bayes' theorem, we can compute this as follows: P(J|F,I) = (P(J) * P(F|J) * P(I|J)) / P(F,I). Here P(F,I) is the probability of the words 'freedom' and 'immigration' being said in a speech.
    ### P(G|F,I): Probability of Gary Johnson saying the words Freedom and Immigration.
    ### Using the formula, we can compute this as follows: P(G|F,I) = (P(G) * P(F|G) * P(I|G)) / P(F,I)

In [29]:
# P(J)
p_j = 0.5

# P(F/J)
p_j_f = 0.1

# P(I/J)
p_j_i = 0.1

p_j_text = p_j * p_j_f * p_j_i
print(p_j_text)

0.005


In [30]:
# P(G)
p_g = 0.5

# P(F/G)
p_g_f = 0.7

# P(I/G)
p_g_i = 0.2

p_g_text = p_g * p_g_f * p_g_i
print(p_g_text)

0.07


In [31]:
p_f_i = p_j_text + p_g_text
print('Probability of words freedom and immigration being said are: ', format(p_f_i))

('Probability of words freedom and immigration being said are: ', '0.075')


In [32]:
p_j_fi = p_j_text / p_f_i
print('The probability of Jill Stein saying the words Freedom and Immigration: ', format(p_j_fi))

('The probability of Jill Stein saying the words Freedom and Immigration: ', '0.0666666666667')


In [33]:
p_g_fi = p_g_text / p_f_i
print('The probability of Gary Johnson saying the words Freedom and Immigration: ', format(p_g_fi))

('The probability of Gary Johnson saying the words Freedom and Immigration: ', '0.933333333333')


# Naive Bayes implementation using scikit-learn

In [45]:
from sklearn.naive_bayes import MultinomialNB
naive_bayes = MultinomialNB()
naive_bayes.fit(training_data, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [46]:
# predict training model
predictions = naive_bayes.predict(testing_data)

In [47]:
# Compute the accuracy, precision, recall and F1 scores of your model using your test data 'y_test' and the predictions
# you made earlier stored in the 'predictions' variable.
    # true/false = not spam/spam
    # positive/negative = correct/incorrct 
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    #  number of correct predictions / total number of predictions
print('Accuracy score: ', format(accuracy_score(y_test, predictions)))
    # [True Positives/(True Positives + False Positives)]
print('Precision score: ', format(precision_score(y_test, predictions)))
    # [True Positives/(True Positives + False Negatives)]
print('Recall score: ', format(recall_score(y_test, predictions)))
    # precision and recall, two metrics can be combined to get the F1 score
print('F1 score: ', format(f1_score(y_test, predictions)))

('Accuracy score: ', '0.988513998564')
('Precision score: ', '0.972067039106')
('Recall score: ', '0.940540540541')
('F1 score: ', '0.956043956044')
