In [20]:
import pandas as pd
import numpy as np

## Spam Detector

In [11]:
path = 'sms.csv'
data = pd.read_csv(path, sep='\t', header=None, names=['label', 'message']) # use sep, header, and names to format data 
data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [43]:
hamCount, spamCount = data.label.value_counts() # get counts of ham and spam

4825

In [58]:
random = np.random.randint(spamCount)
random
data[data.label == 'spam'].iloc[random].message # look at a random spam message

'Would you like to see my XXX pics they are so hot they were nearly banned in the uk!'

In [102]:
random = np.random.randint(hamCount)
random
data[data.label == 'ham'].iloc[random].message # look at a random ham message

'i thought we were doing a king of the hill thing there.'

## Naive Bayes
- Based on Bayes theorem

### Setting
- labels, 1,2,k. In this case 1=ham, 2=spam
- data: n documents
- vocabulary: w11,w21,...wd
- For each document: x = [n1,n2,n3,...nd) 'count vector'
- nj = number of times wi appears

In [103]:
# p(label=ham)
hamCount / (hamCount+spamCount)

0.8659368269921034

In [104]:
# p(label=spam)
spamCount / (hamCount+spamCount)

0.13406317300789664

### Prediction rule:
- predict the label with the largest probability
- p(label=1 | x)
- p(label=2 | x)
- ...
- p(label=k | x)

In [107]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix

In [109]:
pipe = Pipeline(steps=[
    ('vect', CountVectorizer(max_features=1000)),
    ('clf', MultinomialNB()) # can take alpha parameter
])

In [111]:
X = data.message
y = data.label

In [113]:
# split data
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y)

In [114]:
# fit pipe
pipe.fit(X_train,y_train)


In [115]:
# evaluate the pipeline
y_test_pred = pipe.predict(X_test)

In [116]:
confusion_matrix(y_test,y_test_pred)

array([[1206,   16],
       [  10,  161]], dtype=int64)

## How does Naive Bayes choose between ham and spam?