## Practice Project: Using Naive Bayes to detect spam
Reference: [Udacity Naive Bayes tutorial notebook](https://github.com/udacity/machine-learning/blob/master/projects/practice_projects/naive_bayes_tutorial/Naive_Bayes_tutorial.ipynb)

In [1]:
import pandas as pd

In [2]:
# Read the SMS collection as tab-separated text with no header row;
# the two columns are named explicitly.
df = pd.read_csv(
    'SMSSpamCollection/SMSSpamCollection',
    sep='\t',
    header=None,
    names=['label', 'text'],
)
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Encode the string labels as integers: ham -> 0, spam -> 1.
label_codes = {'ham': 0, 'spam': 1}

# Only map while string labels are still present. Re-running the cell on an
# already-encoded column would otherwise turn every label into NaN, because
# 0 and 1 are not keys of the mapping.
if df['label'].isin(list(label_codes)).any():
    df['label'] = df['label'].map(label_codes)

# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(df.shape)
df.head()

(5572, 2)


Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


### Implementing Bag of Words from scratch

In [4]:
# Four toy messages used to walk through the bag-of-words steps by hand.
documents = [
    'Hello, how are you!',
    'Win money, win from home.',
    'Call me now.',
    'Hello, Call hello you tomorrow?',
]

# Step 1: normalise case so that e.g. 'Hello' and 'hello' count as the same word.
lower_case_documents = [text.lower() for text in documents]
print(lower_case_documents)

['hello, how are you!', 'win money, win from home.', 'call me now.', 'hello, call hello you tomorrow?']


In [5]:
import string

# Step 2: strip punctuation from every lower-cased document.
# The two-argument form str.translate(None, chars) only exists for Python 2
# byte strings; filtering character by character gives the same result and
# also works on Python 3 str and Python 2 unicode.
punctuation_chars = set(string.punctuation)
sans_punctuation_documents = [
    ''.join(ch for ch in doc if ch not in punctuation_chars)
    for doc in lower_case_documents
]
print(sans_punctuation_documents)

['hello how are you', 'win money win from home', 'call me now', 'hello call hello you tomorrow']


In [6]:
# Step 3: tokenise each document into a list of words.
# split() with no argument splits on any run of whitespace, so repeated spaces
# (e.g. left behind where punctuation was removed) or an empty document do not
# produce empty-string tokens the way split(' ') does.
preprocessed_documents = [doc.split() for doc in sans_punctuation_documents]
print(preprocessed_documents)

[['hello', 'how', 'are', 'you'], ['win', 'money', 'win', 'from', 'home'], ['call', 'me', 'now'], ['hello', 'call', 'hello', 'you', 'tomorrow']]


In [7]:
from collections import Counter
import pprint

# Step 4: count how often each word occurs in each document
# (one Counter per document, in document order).
frequency_list = [Counter(tokens) for tokens in preprocessed_documents]

# Format the line explicitly: the `print a, b` statement is a syntax error on
# Python 3, and print(a, b) would print a tuple on Python 2. This form emits
# the same text on both: the list, a space, and a blank line.
print('{0} \n'.format(frequency_list))

# Same data again, one Counter per line for readability.
pprint.pprint(frequency_list)

[Counter({'how': 1, 'you': 1, 'hello': 1, 'are': 1}), Counter({'win': 2, 'home': 1, 'from': 1, 'money': 1}), Counter({'me': 1, 'now': 1, 'call': 1}), Counter({'hello': 2, 'you': 1, 'call': 1, 'tomorrow': 1})] 

[Counter({'how': 1, 'you': 1, 'hello': 1, 'are': 1}),
 Counter({'win': 2, 'home': 1, 'from': 1, 'money': 1}),
 Counter({'me': 1, 'now': 1, 'call': 1}),
 Counter({'hello': 2, 'you': 1, 'call': 1, 'tomorrow': 1})]


### Implementing Bag of Words in scikit-learn

In [8]:
# The same four toy messages as in the from-scratch section, redefined here
# so that this section can be run on its own.
documents = [
    'Hello, how are you!',
    'Win money, win from home.',
    'Call me now.',
    'Hello, Call hello you tomorrow?',
]

from sklearn.feature_extraction.text import CountVectorizer

In [9]:
# Create a vectorizer with no arguments, i.e. with scikit-learn's default settings.
count_vec = CountVectorizer()

In [10]:
# Display the vectorizer's repr so its parameter values can be inspected.
count_vec

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [11]:
# Fit the vectorizer on the four toy documents.
count_vec.fit(documents)

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [12]:
# List the feature names (vocabulary words) the fitted vectorizer produced.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use the new method when it exists and fall back
# to the old one on older releases.
if hasattr(count_vec, 'get_feature_names_out'):
    feature_names = count_vec.get_feature_names_out()
else:
    feature_names = count_vec.get_feature_names()
feature_names

[u'are',
 u'call',
 u'from',
 u'hello',
 u'home',
 u'how',
 u'me',
 u'money',
 u'now',
 u'tomorrow',
 u'win',
 u'you']

In [13]:
# Print the sparse count matrix for the toy documents.
# print(...) with a single argument works on both Python 2 and Python 3,
# whereas the bare print statement is a syntax error on Python 3.
print(count_vec.transform(documents))

  (0, 0)	1
  (0, 3)	1
  (0, 5)	1
  (0, 11)	1
  (1, 2)	1
  (1, 4)	1
  (1, 7)	1
  (1, 10)	2
  (2, 1)	1
  (2, 6)	1
  (2, 8)	1
  (3, 1)	1
  (3, 3)	2
  (3, 9)	1
  (3, 11)	1


In [14]:
# Convert the sparse counts into a dense array: one row per document,
# one column per vocabulary word.
doc_array = count_vec.transform(documents).toarray()

# print(...) with a single argument works on both Python 2 and Python 3.
print(doc_array)

[[1 0 0 1 0 1 0 0 0 0 0 1]
 [0 0 1 0 1 0 0 1 0 0 2 0]
 [0 1 0 0 0 0 1 0 1 0 0 0]
 [0 1 0 2 0 0 0 0 0 1 0 1]]


In [15]:
# Label the count columns with the vocabulary words.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever the installed version provides.
if hasattr(count_vec, 'get_feature_names_out'):
    vocabulary_words = count_vec.get_feature_names_out()
else:
    vocabulary_words = count_vec.get_feature_names()

frequency_matrix = pd.DataFrame(doc_array, columns=vocabulary_words)
frequency_matrix

Unnamed: 0,are,call,from,hello,home,how,me,money,now,tomorrow,win,you
0,1,0,0,1,0,1,0,0,0,0,0,1
1,0,0,1,0,1,0,0,1,0,0,2,0
2,0,1,0,0,0,0,1,0,1,0,0,0
3,0,1,0,2,0,0,0,0,0,1,0,1


### Training and testing sets

In [16]:
from sklearn.model_selection import train_test_split

# Split messages and labels into training and test sets using
# train_test_split's default proportions; random_state pins the shuffle so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], random_state=1)

# Build the output line explicitly: `print a, b, ...` is Python 2-only syntax,
# and print(a, b, ...) would print a tuple on Python 2. Joining the shapes
# with spaces yields the same text on both versions.
print(' '.join(str(part.shape) for part in (X_train, X_test, y_train, y_test)))

(4179L,) (1393L,) (4179L,) (1393L,)


In [17]:
# New vectorizer for the SMS data. This rebinds count_vec, replacing the
# instance that was fitted on the toy documents earlier.
count_vec = CountVectorizer()
# Fit on the training messages and transform them in one step ...
train = count_vec.fit_transform(X_train)
# ... then transform the test messages with that same fitted vectorizer (no refit).
test = count_vec.transform(X_test)

### Bayes Theorem implementation from scratch

In [18]:
# Worked example: a diabetes screening test.

# P(D): prior probability that a person has diabetes.
P_D = 0.01

# P(Pos | D): probability of a positive test result given that the person has diabetes.
P_Pos_D = 0.9

# P(Neg | ~D): probability of a negative test result given that the person does not have diabetes.
P_Neg_nD = 0.9

# Question: what is P(D | Pos), i.e. P_D_Pos?

In [19]:
# Total probability of a positive test result:
#   P(Pos) = P(D) * P(Pos|D) + P(~D) * P(Pos|~D)
# with P(~D) = 1 - P(D) and P(Pos|~D) = 1 - P(Neg|~D).
# The named constants from the previous cell are used instead of repeating
# their literal values, so changing an input there updates this result too.
P_Pos = P_D * P_Pos_D + (1 - P_D) * (1 - P_Neg_nD)

P_Pos

0.10799999999999998

In [20]:
# Bayes' theorem: P(D|Pos) = P(D) * P(Pos|D) / P(Pos)
# Uses the named constants rather than repeating their literal values.
P_D_Pos = P_D * P_Pos_D / P_Pos

# Probability that an individual has diabetes, given a positive test result.
P_D_Pos

0.08333333333333336

In [21]:
# Bayes' theorem for the complement: P(~D|Pos) = P(~D) * P(Pos|~D) / P(Pos)
# with P(~D) = 1 - P(D) and P(Pos|~D) = 1 - P(Neg|~D), written with the named
# constants rather than their literal values.
P_nD_Pos = (1 - P_D) * (1 - P_Neg_nD) / P_Pos

# Probability that an individual does not have diabetes, given a positive test result.
P_nD_Pos

0.9166666666666666

### Naive Bayes implementation using scikit-learn

In [22]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes classifier created with its default parameters.
model = MultinomialNB()
# Train on the vectorized training messages and their labels.
model.fit(train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [23]:
# Predict a label for every vectorized test message.
pred = model.predict(test)

In [24]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [25]:
# Confusion matrix of true vs. predicted test labels (in scikit-learn's
# convention rows are true classes and columns are predicted classes).
# print(...) with a single argument works on both Python 2 and Python 3.
print(confusion_matrix(y_test, pred))

[[1203    5]
 [  11  174]]


In [26]:
# Test-set metrics, printed in this order: accuracy, precision, recall, F1.
# print(...) with a single argument works on both Python 2 and Python 3,
# whereas the bare print statement is a syntax error on Python 3.
print(accuracy_score(y_test, pred))
print(precision_score(y_test, pred))
print(recall_score(y_test, pred))
print(f1_score(y_test, pred))

0.988513998564
0.972067039106
0.940540540541
0.956043956044


In [27]:
# Peek at the first few true labels of the test split.
y_test.head()

1078    0
4028    0
958     0
4642    0
4674    0
Name: label, dtype: int64

In [28]:
# Display the predicted labels for the test split.
pred

array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

One of the major advantages that Naive Bayes has over other classification algorithms is its ability to handle an extremely large number of features. In our case, each word is treated as a feature and there are thousands of different words. It also performs well in the presence of irrelevant features and is relatively unaffected by them. Its other major advantage is its relative simplicity: Naive Bayes works well right out of the box, and tuning its parameters is rarely necessary, except in cases where the distribution of the data is known. It rarely overfits the data. Another important advantage is that its training and prediction times are very fast for the amount of data it can handle. All in all, Naive Bayes really is a gem of an algorithm!