In [80]:
import numpy as np

In [81]:
import pandas as pd

In [82]:
import matplotlib.pyplot as plt

In [83]:
from sklearn.feature_extraction.text import CountVectorizer

In [84]:
from sklearn.model_selection import train_test_split

In [85]:
from sklearn.naive_bayes import GaussianNB

In [86]:
import string

In [87]:
from __future__ import print_function

In [88]:
df = pd.read_table("smsspamcollection/SMSSpamCollection", sep="\t", header=None, names=['label','msg'])

In [89]:
df.head()

Unnamed: 0,label,msg
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [90]:
df['label'] = df.label.map({'ham':0, 'spam':1})

In [91]:
df.head()

Unnamed: 0,label,msg
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [92]:
y = df['label']

In [93]:
X = df['msg']

In [94]:
lower_case = []
for i in y:
    lower_case.append(i.lower())
sans_punctuation = []
for i in lower_case:
    sans_punctuation.append(i.translate(string.maketrans("", ""), string.punctuation))
sans_punctuation

AttributeError: 'numpy.int64' object has no attribute 'lower'

In [95]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [96]:
preprocessed = []
for i in sans_punctuation:
    preprocessed.append(i.split(' '))
preprocessed

[['go',
  'until',
  'jurong',
  'point',
  'crazy',
  'available',
  'only',
  'in',
  'bugis',
  'n',
  'great',
  'world',
  'la',
  'e',
  'buffet',
  'cine',
  'there',
  'got',
  'amore',
  'wat'],
 ['ok', 'lar', 'joking', 'wif', 'u', 'oni'],
 ['free',
  'entry',
  'in',
  '2',
  'a',
  'wkly',
  'comp',
  'to',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  '21st',
  'may',
  '2005',
  'text',
  'fa',
  'to',
  '87121',
  'to',
  'receive',
  'entry',
  'questionstd',
  'txt',
  'ratetcs',
  'apply',
  '08452810075over18s'],
 ['u', 'dun', 'say', 'so', 'early', 'hor', 'u', 'c', 'already', 'then', 'say'],
 ['nah',
  'i',
  'dont',
  'think',
  'he',
  'goes',
  'to',
  'usf',
  'he',
  'lives',
  'around',
  'here',
  'though'],
 ['freemsg',
  'hey',
  'there',
  'darling',
  'its',
  'been',
  '3',
  'weeks',
  'now',
  'and',
  'no',
  'word',
  'back',
  'id',
  'like',
  'some',
  'fun',
  'you',
  'up',
  'for',
  'it',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chgs',

In [97]:
frequently = []
import pprint
from collections import Counter

for i in preprocessed:
    frequently_count = Counter(i)
    frequently.append(frequently_count)
pprint.pprint(frequently)

[Counter({'available': 1, 'crazy': 1, 'cine': 1, 'e': 1, 'jurong': 1, 'got': 1, 'point': 1, 'there': 1, 'la': 1, 'great': 1, 'buffet': 1, 'bugis': 1, 'wat': 1, 'only': 1, 'in': 1, 'go': 1, 'world': 1, 'n': 1, 'until': 1, 'amore': 1}),
 Counter({'wif': 1, 'oni': 1, 'ok': 1, 'joking': 1, 'u': 1, 'lar': 1}),
 Counter({'to': 3, 'fa': 2, 'entry': 2, 'tkts': 1, 'text': 1, 'questionstd': 1, '87121': 1, 'in': 1, 'apply': 1, 'txt': 1, '21st': 1, 'cup': 1, '2': 1, 'win': 1, 'final': 1, 'may': 1, 'comp': 1, 'ratetcs': 1, 'free': 1, 'a': 1, 'receive': 1, '08452810075over18s': 1, 'wkly': 1, '2005': 1}),
 Counter({'say': 2, 'u': 2, 'then': 1, 'c': 1, 'already': 1, 'dun': 1, 'early': 1, 'so': 1, 'hor': 1}),
 Counter({'he': 2, 'usf': 1, 'dont': 1, 'though': 1, 'i': 1, 'nah': 1, 'around': 1, 'to': 1, 'lives': 1, 'here': 1, 'goes': 1, 'think': 1}),
 Counter({'to': 2, 'and': 1, 'some': 1, 'it': 1, 'send': 1, 'still': 1, 'id': 1, 'freemsg': 1, 'chgs': 1, 'for': 1, 'no': 1, 'rcv': 1, 'there': 1, 'hey': 1, 

 Counter({'flippin': 1, 'you': 1, 'your': 1, 'shit': 1, 'yet': 1}),
 Counter({'a': 2, '': 2, 'me': 1, 'give': 1, 'k': 1, 'breaking': 1, 'cstore': 1, 'sec': 1, 'ltgt': 1, 'at': 1}),
 Counter({'this': 1, 'like': 1, 'that': 1, 'i': 1, 'avoid': 1, 'am': 1, 'to': 1, 'bad': 1, 'much': 1}),
 Counter({'around': 1, 'just': 1, 'car': 1, 'yo': 1, 'back': 1, 'got': 1, 'you': 1, 'my': 1}),
 Counter({'isnt': 1, 'annoying': 1, 'it': 1}),
 Counter({'': 1, 'for': 1, 'min': 1, 'i': 1, 'am': 1, 'late': 1, 'goodmorning': 1, 'today': 1, 'ltgt': 1}),
 Counter({'not': 2, 'on': 1, 'right': 1, 'point': 1, 'happy': 1, 'no': 1, 'makin': 1, 'to': 1, 'if': 1, 'mr': 1, 'hes': 1, 'hangin': 1, 'u': 1, 'theres': 1}),
 Counter({'looking': 1, 'all': 1, 'good': 1, 'figure': 1, 'there': 1, 'alivebetter': 1, 'correct': 1, 'will': 1, 'itself': 1, 'come': 1, 'any': 1}),
 Counter({'case': 1, 'guess': 1, 'that': 1, 'i': 1, 'ill': 1, 'see': 1, 'at': 1, 'in': 1, 'lodge': 1, 'you': 1, 'campus': 1}),
 Counter({'done': 1, 'were': 1

 Counter({'call': 1, 'ill': 1, 'later': 1, 'meeting': 1, 'sorryin': 1}),
 Counter({'to': 2, 'me': 1, 'all': 1, 'hes': 1, 'thinking': 1, 'stop': 1, 'runs': 1, 'your': 1, 'going': 1, 'im': 1, 'have': 1, 'plus': 1, 'you': 1, 'didnt': 1, 'tell': 1, 'thatnow': 1}),
 Counter({'': 2, 'flat': 1, 'kindly': 1, 'some': 1, 'send': 1, 'one': 1, 'to': 1, 'our': 1, 'ltdecimalgt': 1, 'today': 1, 'before': 1}),
 Counter({'help': 1, 'themob': 1, 'yet': 1, 'min': 1, 'resubmit': 1, '4': 1, 'reply': 1, 'weeks': 1, 'has': 1, 'more': 1, 'info': 1, 'pls': 1, 'offer': 1, 'after': 1, 'expiry': 1, 'unsubscribe': 1, 'not': 1, 'mob': 1, 'sorry': 1, 'a': 1, 'term': 1, 'package': 1, 'of': 1, 'request': 1, 'u': 1, 'can': 1, '54': 1, 'the': 1}),
 Counter({'a': 1, 'then': 1, 'today': 1, 'early': 1, 'dun': 1, 'lor': 1, 'y': 1, '2': 1, 'u': 1, 'too': 1, 'go': 1, 'nothing': 1, 'home': 1, 'bit': 1, 'bored': 1, 'sleep': 1}),
 Counter({'be': 1, 'what': 1, 'around': 1, 'i': 1, 'should': 1, 'to': 1, 'time': 1, 'my': 1, 'tell':

 Counter({'doing': 1, 'u': 1, 'what': 1, 'so': 1, 'today': 1}),
 Counter({'i': 1, 'sorry': 1, 'said': 1, 'okay': 1, 'its': 1}),
 Counter({'': 2, 'i': 1, 'dangerous': 1, 'is': 1, 'slept': 1, 'thinkthis': 1, 'ltgt': 1, 'time': 1, 'not': 1, 'pm': 1}),
 Counter({'job': 1, 'there': 1, 'networking': 1, 'is': 1}),
 Counter({'stop': 2, 'to': 2, 'be': 1, '29m': 1, 'see': 1, 'inviting': 1, '62468': 1, 'is': 1, 'yes762': 1, 'him': 1, 'no762': 1, 'goldviking': 1, 'his': 1, 'send': 1, 'frnd': 1, 'wwwsmsacugoldviking': 1, 'reply': 1, 'you': 1, 'or': 1, 'friend': 1}),
 Counter({'stress': 1, 'dont': 1, 'studying': 1, 'l8r': 1, 'let': 1, 'you': 1, 'out': 1}),
 Counter({'me': 1, 'busy': 1, 'haf': 1, 'keep': 1, '2': 1, 'u': 1, 'y': 1, 'thats': 1}),
 Counter({'we': 2, 'im': 2, 'school': 1, 'working': 1, 'no': 1, 'hungry': 1, 'go': 1, 'rush': 1, 'in': 1, 'not': 1, 'so': 1, 'rushing': 1, 'if': 1}),
 Counter({'channel': 1, 'which': 1}),
 Counter({'me': 2, 'i': 2, 'your': 2, 'real': 1, 'and': 1, 'for': 1, 'co

In [98]:
from sklearn.feature_extraction.text import CountVectorizer
count_vector = CountVectorizer()

In [99]:
count_vector

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [103]:
count_vector.fit_transform(X)
count_vector.get_feature_names()

[u'00',
 u'000',
 u'000pes',
 u'008704050406',
 u'0089',
 u'0121',
 u'01223585236',
 u'01223585334',
 u'0125698789',
 u'02',
 u'0207',
 u'02072069400',
 u'02073162414',
 u'02085076972',
 u'021',
 u'03',
 u'04',
 u'0430',
 u'05',
 u'050703',
 u'0578',
 u'06',
 u'07',
 u'07008009200',
 u'07046744435',
 u'07090201529',
 u'07090298926',
 u'07099833605',
 u'07123456789',
 u'0721072',
 u'07732584351',
 u'07734396839',
 u'07742676969',
 u'07753741225',
 u'0776xxxxxxx',
 u'07781482378',
 u'07786200117',
 u'077xxx',
 u'078',
 u'07801543489',
 u'07808',
 u'07808247860',
 u'07808726822',
 u'07815296484',
 u'07821230901',
 u'078498',
 u'07880867867',
 u'0789xxxxxxx',
 u'07946746291',
 u'0796xxxxxx',
 u'07973788240',
 u'07xxxxxxxxx',
 u'08',
 u'0800',
 u'08000407165',
 u'08000776320',
 u'08000839402',
 u'08000930705',
 u'08000938767',
 u'08001950382',
 u'08002888812',
 u'08002986030',
 u'08002986906',
 u'08002988890',
 u'08006344447',
 u'0808',
 u'08081263000',
 u'08081560665',
 u'0825',
 u'083',
 

In [105]:
doc_array = count_vector.transform(X).toarray()
doc_array

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [107]:
frequency_matrix = pd.DataFrame(doc_array, columns=count_vector.get_feature_names())
frequency_matrix.head()

Unnamed: 0,00,000,000pes,008704050406,0089,0121,01223585236,01223585334,0125698789,02,...,zhong,zindgi,zoe,zogtorius,zoom,zouk,zyada,èn,ú1,〨ud
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [108]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8)

In [109]:
# Instantiate the CountVectorizer method
count_vector = CountVectorizer()

# Fit the training data and then return the matrix
training_data = count_vector.fit_transform(X_train)

# Transform testing data and return the matrix. Note we are not fitting the testing data into the CountVectorizer()
testing_data = count_vector.transform(X_test)

In [110]:
# P(D)
p_diabetes = 0.01

# P(~D)
p_no_diabetes = 0.99

# Sensitivity or P(Pos|D)
p_pos_diabetes = 0.9

# Specificity or P(Neg/~D)
p_neg_no_diabetes = 0.9

# P(Pos)
p_pos = (p_diabetes * p_pos_diabetes) + (p_no_diabetes * (1 - p_neg_no_diabetes))
print('The probability of getting a positive test result P(Pos) is: {}',format(p_pos))

The probability of getting a positive test result P(Pos) is: {} 0.108


In [111]:
# P(D|Pos)
p_diabetes_pos = (p_diabetes * p_pos_diabetes) / p_pos
print('Probability of an individual having diabetes, given that that individual got a positive test result is:\
',format(p_diabetes_pos))

Probability of an individual having diabetes, given that that individual got a positive test result is: 0.0833333333333


In [117]:
# P(Pos/~D)
p_pos_no_diabetes = 0.1

# P(~D|Pos)
p_no_diabetes_pos = (p_no_diabetes * p_pos_no_diabetes) / p_pos
print("Probability of an individual not having diabetes, given that that individual got a positive test result is: %f" % p_no_diabetes_pos)

Probability of an individual not having diabetes, given that that individual got a positive test result is: 0.916667
