In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Grab and process the raw data.
data_path = ("https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
             "master/sms_spam_collection/SMSSpamCollection"
            )
sms_raw = pd.read_csv(data_path, delimiter= '\t', header=None)
sms_raw.columns = ['spam', 'message']

# Enumerate our spammy keywords.
keywords = ['click', 'offer', 'winner', 'buy', 'free', 'cash', 'urgent']

for key in keywords:
    sms_raw[str(key)] = sms_raw.message.str.contains(
        ' ' + str(key) + ' ',
        case=False
)

sms_raw['allcaps'] = sms_raw.message.str.isupper()
sms_raw['spam'] = (sms_raw['spam'] == 'spam')
data = sms_raw[keywords + ['allcaps']]
target = sms_raw['spam']

from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()
y_pred = bnb.fit(data, target).predict(data)

In [3]:
# Calculate the accuracy of your model here.
correct = (target == y_pred).sum()
accuracy = correct/sms_raw.count()
print(accuracy)

spam       0.891601
message    0.891601
click      0.891601
offer      0.891601
winner     0.891601
buy        0.891601
free       0.891601
cash       0.891601
urgent     0.891601
allcaps    0.891601
dtype: float64


In [4]:
from sklearn.metrics import confusion_matrix
confusion_matrix(target, y_pred)

array([[4770,   55],
       [ 549,  198]], dtype=int64)

In [5]:
#Find counts for confusion matrix
sms_raw['y_pred'] = y_pred

true_positive = len(sms_raw[(sms_raw['spam'] == False) & (sms_raw['y_pred'] == False)])
print(true_positive)

false_positive = len(sms_raw[(sms_raw['spam'] == False) & (sms_raw['y_pred'] == True)]) #Type I Error
print(false_positive)

true_negative = len(sms_raw[(sms_raw['spam'] == True) & (sms_raw['y_pred'] == True)])
print(true_negative)

false_negative = len(sms_raw[(sms_raw['spam'] == True) & (sms_raw['y_pred'] == False)]) #Type II Error
print(false_negative)

4770
55
198
549


In [6]:
#Sensitivity
print(true_negative/(false_negative + true_negative))

#Specificity
print(true_positive/(false_positive + true_positive))

0.26506024096385544
0.9886010362694301
