In [1]:
import pandas as pd

In [2]:
### Load the SMS dataset
# The file is read as tab-separated text with no header row, so the two
# column names are supplied explicitly.
df = pd.read_table(
    'smsspamcollection/SMSSpamCollection',
    sep='\t',
    header=None,
    names=['label', 'sms_message'],
)
# Preview the first rows.
df.head()

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
### Data preprocessing
# Encode the labels numerically: ham -> 0, spam -> 1.
# Mapping with a plain dict turns every value that is not a key into NaN, so a
# second run of this cell (labels already 0/1) would wipe the column. Falling
# back to the current value keeps the cell safe to re-run.
label_codes = {'ham': 0, 'spam': 1}
df['label'] = df['label'].map(lambda value: label_codes.get(value, value))
print(df.shape)
df.head()

(5572, 2)


Unnamed: 0,label,sms_message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
### Bag of words from scratch
# Four short sample messages used to walk through the technique step by step.
documents = [
    'Hello, how are you!',
    'Win money, win from home.',
    'Call me now.',
    'Hello, Call hello you tomorrow?',
]


In [5]:
## Step 1: lower-case every document so 'Hello' and 'hello' become the same word
lower_case_documents = [document.lower() for document in documents]
print(lower_case_documents)

['hello, how are you!', 'win money, win from home.', 'call me now.', 'hello, call hello you tomorrow?']


In [6]:
## Step 2: remove all punctuation characters
import string

# Translation table that deletes each character listed in string.punctuation.
punctuation_remover = str.maketrans('', '', string.punctuation)
sans_punctuations_documents = [
    document.translate(punctuation_remover) for document in lower_case_documents
]
print(sans_punctuations_documents)

['hello how are you', 'win money win from home', 'call me now', 'hello call hello you tomorrow']


In [7]:
### Step 3: tokenization
# split() with no argument splits on any run of whitespace, so doubled spaces
# (for example where a punctuation character was removed between two spaces)
# do not produce empty-string tokens the way split(' ') does.
preprocessed_documents = [document.split() for document in sans_punctuations_documents]
print(preprocessed_documents)

[['hello', 'how', 'are', 'you'], ['win', 'money', 'win', 'from', 'home'], ['call', 'me', 'now'], ['hello', 'call', 'hello', 'you', 'tomorrow']]


In [8]:
### Step 4: count how often each word occurs in each document
import pprint
from collections import Counter

# One Counter per tokenized document, in document order.
frequency_list = [Counter(tokens) for tokens in preprocessed_documents]
pprint.pprint(frequency_list)

[Counter({'hello': 1, 'how': 1, 'are': 1, 'you': 1}),
 Counter({'win': 2, 'money': 1, 'from': 1, 'home': 1}),
 Counter({'call': 1, 'me': 1, 'now': 1}),
 Counter({'hello': 2, 'call': 1, 'you': 1, 'tomorrow': 1})]


In [9]:
### Bag of words with scikit-learn
from sklearn.feature_extraction.text import CountVectorizer
# Create a vectorizer with no arguments, i.e. the library's default settings.
count_vector = CountVectorizer()

In [10]:
# Show the vectorizer's configuration.
print(count_vector)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)


In [11]:
# Learn the vocabulary of the sample documents, then list it.
count_vector.fit(documents)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when it exists; tolist() keeps the result a
# plain list of strings. Older releases fall back to the legacy method.
if hasattr(count_vector, 'get_feature_names_out'):
    feature_names = count_vector.get_feature_names_out().tolist()
else:
    feature_names = count_vector.get_feature_names()
feature_names

['are',
 'call',
 'from',
 'hello',
 'home',
 'how',
 'me',
 'money',
 'now',
 'tomorrow',
 'win',
 'you']

In [12]:
### Build a count matrix: one row per document (4 here), one column per vocabulary word.
# transform() output is converted to a dense array with toarray() so it can be displayed directly.
doc_array =count_vector.transform(documents).toarray()
doc_array

array([[1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 2, 0],
       [0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0],
       [0, 1, 0, 2, 0, 0, 0, 0, 0, 1, 0, 1]])

In [13]:
### Wrap the count array in a DataFrame whose column names are the vocabulary words.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when present and fall back on older releases.
if hasattr(count_vector, 'get_feature_names_out'):
    column_names = count_vector.get_feature_names_out()
else:
    column_names = count_vector.get_feature_names()
frequency_matrix = pd.DataFrame(doc_array, columns=column_names)
frequency_matrix

Unnamed: 0,are,call,from,hello,home,how,me,money,now,tomorrow,win,you
0,1,0,0,1,0,1,0,0,0,0,0,1
1,0,0,1,0,1,0,0,1,0,0,2,0
2,0,1,0,0,0,0,1,0,1,0,0,0
3,0,1,0,2,0,0,0,0,0,1,0,1


In [35]:
#### Training and testing sets
from sklearn.model_selection import train_test_split

# No split size is passed, so the library default applies; random_state is
# fixed at 1 so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df['sms_message'],
    df['label'],
    random_state=1,
)
print('Number of rows in the total set: {}'.format(len(df)))
print('Number of rows in the training set: {}'.format(len(X_train)))
print('Number of rows in the test set: {}'.format(len(X_test)))

Number of rows in the total set: 5572
Number of rows in the training set: 4179
Number of rows in the test set: 1393


In [36]:
## Apply Bag of Words processing to the dataset
count_vector = CountVectorizer()

## Fit the vectorizer on the training data and return the resulting matrix.
## (The variable name's spelling is kept as-is because a later cell refers to it.)
traninig_data = count_vector.fit_transform(X_train)
## Transform the testing data and return the matrix. The testing data is only transformed; the CountVectorizer() is NOT fitted on it.
testing_data = count_vector.transform(X_test)

In [23]:
### Bayes' theorem implementation from scratch
### Goal: estimate whether a person whose test came back positive has diabetes.
# P(D): prior probability of having diabetes
p_diabetes = 0.01
# P(~D): prior probability of not having diabetes
p_no_diabetes = 0.99
# Sensitivity, P(Pos|D): true-positive rate
p_pos_diabetes = 0.9
# Specificity, P(Neg|~D): true-negative rate
p_neg_no_diabetes = 0.9
# P(Pos) = P(D) * P(Pos|D) + P(~D) * (1 - P(Neg|~D))
p_pos = (p_diabetes * p_pos_diabetes) + (p_no_diabetes * (1 - p_neg_no_diabetes))
# Use str.format (a dot, not a comma) so the {} placeholder is filled in;
# with a comma the braces are printed literally followed by the value.
print('The probability of getting a positive test result P(Pos) is: {}'.format(p_pos))

The probability of getting a positive test result P(Pos) is: {} 0.10799999999999998


In [24]:
## Posterior probability that a person has diabetes given a positive test result:
# P(D|Pos) = (P(D) * P(Pos|D)) / P(Pos)
p_diabetes_pos = p_diabetes * p_pos_diabetes / p_pos
print(
    'Probability of an individual having diabetes, given that that individual got a positive test result is:',
    format(p_diabetes_pos),
)

Probability of an individual having diabetes, given that that individual got a positive test result is: 0.08333333333333336


In [32]:
 ## Same computation for NOT having diabetes given a positive test, i.e. P(~D|Pos).
# Formula: P(~D|Pos) = (P(~D) * P(Pos|~D)) / P(Pos)
# P(Pos|~D) is the complement of the specificity:
# P(Pos|~D) = 1 - P(Neg|~D) = 1 - 0.9 = 0.1
p_pos_no_diabetes = 0.1
p_no_diabetes_pos = p_no_diabetes * p_pos_no_diabetes / p_pos
print(
    'Probability of an individual not having diabetes, given that that individual got a positive test result is:',
    p_no_diabetes_pos,
)

Probability of an individual not having diabetes, given that that individual got a positive test result is: 0.9166666666666669


In [37]:
 ### Naive Bayes implementation using scikit-learn

from sklearn.naive_bayes import MultinomialNB
# Create a multinomial Naive Bayes classifier with default settings and fit it
# on the vectorized training messages and their labels.
naive_bayes = MultinomialNB()
naive_bayes.fit(traninig_data, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [40]:
## Predict labels for the vectorized test messages with the fitted classifier
predictions = naive_bayes.predict(testing_data)

In [41]:
### Evaluate the model

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Score the predictions against the true test labels with each metric in turn.
for metric_label, metric_fn in (
    ('Accuracy score: ', accuracy_score),
    ('Precision score: ', precision_score),
    ('Recall score: ', recall_score),
    ('F1 score: ', f1_score),
):
    print(metric_label, format(metric_fn(y_test, predictions)))

Accuracy score:  0.9885139985642498
Precision score:  0.9720670391061452
Recall score:  0.9405405405405406
F1 score:  0.9560439560439562
