## Naive Bayes Classifier

In [103]:
import pandas as pd

df = pd.read_table('SMSSpamCollection'
                  ,sep = "\t"
                  ,header = None
                  ,names = ['label','message'])

df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [90]:
df.shape

(5572, 2)

In [91]:
df['label'] = df.replace({'ham':0,'spam':1})

df.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


### Different Steps to implement

1) Convert all strings to lower case form

2) Remove all punctuations and stopwords

3) Tokenization - Split a sentence into individual words using a delimiter

4) Count frequencies of each word and store it in a document term matrix


In [92]:
from sklearn.feature_extraction.text import CountVectorizer

count_vector = CountVectorizer()
count_vector

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

##### Split our data set into training and testing dataset

In [93]:
from sklearn.cross_validation import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(df['message'],
                                                    df['label'],
                                                   random_state = 1)

In [94]:
print('Number of rows in the total set: {}'.format(df.shape[0]))
print('Number of rows in the training set: {}'.format(Y_train.shape[0]))
print('Number of rows in the test set: {} '.format(Y_test.shape[0]))

Number of rows in the total set: 5572
Number of rows in the training set: 4179
Number of rows in the test set: 1393 


In [95]:
# Instantiate the CountVectorizer method
count_vector = CountVectorizer()

# Fit the training data and then return the matrix
training_data = count_vector.fit_transform(X_train)

# Transform testing data and return the matrix. Note, in training data we are learning a vocabulary dictionary for the training 
# data and then transforming the data into a DTM; here we are not fitting the testing data into the CountVectorizer(), we are 
# only transforming the data into a DTM
testing_data = count_vector.transform(X_test)

In [96]:
import numpy as np
print (np.matrix(training_data))

[[ <4179x7456 sparse matrix of type '<class 'numpy.int64'>'
	with 55209 stored elements in Compressed Sparse Row format>]]


In [97]:
training_data.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ..., 
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

## Implement Naive Bayes 

Converting the format and datatype

In [98]:
Y_train = np.asarray(Y_train, dtype="|S6")
Y_test = np.asarray(Y_test, dtype="|S4")

In [99]:
from sklearn.naive_bayes import MultinomialNB
naive_bayes = MultinomialNB()

naive_bayes.fit(training_data, Y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [100]:
predictions = naive_bayes.predict(testing_data)

In [101]:
predictions

array([b'0', b'0', b'0', ..., b'0', b'1', b'0'], 
      dtype='|S4')

### Evaluating the model

In [102]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

print('Accuracy score: ', format(accuracy_score(Y_test, predictions)))


Accuracy score:  0.9885139985642498
