## Step 1.1: Understanding our dataset

In [1]:
import pandas as pd
# The raw file is tab-separated and has no header row, so the two column
# names are supplied explicitly.
dataset_form = pd.read_csv(
    'smsspamcollection/SMSSpamCollection',
    sep='\t',
    header=None,
    names=['label', 'sms_message'],
)

In [2]:
# Preview the first rows to check that both columns were parsed as expected.
dataset_form.head()

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Step 1.2: Data Preprocessing

In [3]:
# Encode the class label as an integer: ham -> 0, spam -> 1.
# Already-encoded values map to themselves, so re-running this cell is a
# no-op instead of turning the whole column into NaN (Series.map yields NaN
# for any value that is not a key of the mapping).
label_codes = {'ham': 0, 'spam': 1, 0: 0, 1: 1}
dataset_form['label'] = dataset_form['label'].map(label_codes)
dataset_form.head()

Unnamed: 0,label,sms_message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


## Step 2: Bag of Words [BoW]
### 2.1: From scratch

In [4]:
# Toy corpus used to walk through the Bag of Words steps by hand.
documents = [
    'Hello, how are you!',
    'Win money, win from home.',
    'Call me now.',
    'Hello, Call hello you tomorrow?',
]

# Normalise case so that 'Hello' and 'hello' count as the same token.
lower_case_documents = [doc.lower() for doc in documents]
print(lower_case_documents)

['hello, how are you!', 'win money, win from home.', 'call me now.', 'hello, call hello you tomorrow?']


In [5]:
import string

# Replace every punctuation character with a space (rather than deleting it)
# so that words separated only by punctuation are not glued together. The
# extra spaces produce empty tokens that are removed after splitting.
# A membership test is used instead of string.maketrans, which exists only on
# Python 2 and needed a hand-counted run of spaces exactly as long as
# string.punctuation.
punctuation_chars = set(string.punctuation)

sans_punctuation_documents = [
    ''.join(' ' if ch in punctuation_chars else ch for ch in doc)
    for doc in lower_case_documents
]
print(sans_punctuation_documents)

['hello  how are you ', 'win money  win from home ', 'call me now ', 'hello  call hello you tomorrow ']


In [6]:
# Tokenise on single spaces. Runs of consecutive spaces yield empty-string
# tokens, which are deliberately kept at this stage.
preprocessed_documents = [doc.split(' ') for doc in sans_punctuation_documents]
print(preprocessed_documents)

[['hello', '', 'how', 'are', 'you', ''], ['win', 'money', '', 'win', 'from', 'home', ''], ['call', 'me', 'now', ''], ['hello', '', 'call', 'hello', 'you', 'tomorrow', '']]


In [7]:
import pprint
from collections import Counter

frequency_list = []

for i in preprocessed_documents:
    # Drop the empty-string tokens produced by splitting on single spaces.
    # The list is rebuilt with slice assignment instead of calling i.remove()
    # while looping over i: removing during iteration skips the element that
    # follows each removal, so runs of consecutive empty tokens were not
    # fully removed. Slice assignment still cleans the list in place.
    i[:] = [token for token in i if token != '']
    # One Counter per document: token -> number of occurrences.
    frequency_counts = Counter(i)
    frequency_list.append(frequency_counts)

pprint.pprint(frequency_list)

[Counter({'how': 1, 'you': 1, 'hello': 1, 'are': 1}),
 Counter({'win': 2, 'home': 1, 'from': 1, 'money': 1}),
 Counter({'me': 1, 'now': 1, 'call': 1}),
 Counter({'hello': 2, 'you': 1, 'call': 1, 'tomorrow': 1})]


### 2.2: With scikit-learn

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
# Instantiate the vectorizer with its default settings and show them.
count_vector = CountVectorizer()
# Call-style print, consistent with the other cells; the bare
# `print count_vector` statement is a SyntaxError on Python 3.
print(count_vector)

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)


In [10]:
# Learn the vocabulary of the toy corpus.
count_vector.fit(documents)
# List the learned terms in column order. vocabulary_ maps term -> column
# index, so sorting the terms by that index gives the same list that
# get_feature_names() returned, without relying on that method (it was
# removed in scikit-learn 1.2).
sorted(count_vector.vocabulary_, key=count_vector.vocabulary_.get)

[u'are',
 u'call',
 u'from',
 u'hello',
 u'home',
 u'how',
 u'me',
 u'money',
 u'now',
 u'tomorrow',
 u'win',
 u'you']

In [11]:
# Encode the toy corpus with the fitted vectorizer: one row per document,
# one column per vocabulary term, each cell holding that term's count in the
# document. toarray() turns the result into a dense array for display.
doc_array = count_vector.transform(documents).toarray()
doc_array

array([[1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 2, 0],
       [0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0],
       [0, 1, 0, 2, 0, 0, 0, 0, 0, 1, 0, 1]])

In [12]:
# Label each column of doc_array with its vocabulary term. The terms are read
# from vocabulary_ (term -> column index) and sorted by index, which matches
# the column order of doc_array and avoids get_feature_names(), removed in
# scikit-learn 1.2.
feature_terms = sorted(count_vector.vocabulary_, key=count_vector.vocabulary_.get)
frequency_matrix = pd.DataFrame(doc_array, columns=feature_terms)

In [13]:
# Display the per-document term counts as a labelled table.
frequency_matrix

Unnamed: 0,are,call,from,hello,home,how,me,money,now,tomorrow,win,you
0,1,0,0,1,0,1,0,0,0,0,0,1
1,0,0,1,0,1,0,0,1,0,0,2,0
2,0,1,0,0,0,0,1,0,1,0,0,0
3,0,1,0,2,0,0,0,0,0,1,0,1


***
## Step 3.1: Training and testing data

In [14]:
# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# the old sklearn.cross_validation module was removed in 0.20. Fall back to
# it only on installations that predate model_selection.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 20% of the messages for testing; random_state pins the shuffle so
# the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    dataset_form['sms_message'],
    dataset_form['label'],
    test_size=0.2,
    random_state=1,
)

# Sanity check: the train and test sizes should add up to the dataset size.
print('Dataset form size = {}'.format(dataset_form.shape[0]))
print('X_train size = {} , test size = {}'.format(X_train.shape[0], X_test.shape[0]))
print('y_train size = {} , test size = {}'.format(y_train.shape[0], y_test.shape[0]))

Dataset form size = 5572
X_train size = 4457 , test size = 1115
y_train size = 4457 , test size = 1115




## Step 3.2: Applying Bag of Words processing to our dataset

In [15]:
# Fresh vectorizer for the real dataset; this rebinds count_vector, replacing
# the instance that was fitted on the toy corpus.
count_vector = CountVectorizer()
# Learn the vocabulary from the training messages and encode them.
training_data = count_vector.fit_transform(X_train)
# Encode the test messages with the vocabulary learned from the training set;
# transform() does not refit.
testing_data = count_vector.transform(X_test)

***
## Step 5: Implementing Naive Bayes on data

In [16]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes with default hyper-parameters.
naive_bayes = MultinomialNB()
# Train on the word-count features; fit() is left as the last expression so
# the cell displays the fitted estimator and its parameters.
naive_bayes.fit(training_data, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [17]:
# Predict a label for every held-out message from its word-count features.
predictions = naive_bayes.predict(testing_data)

***
## Step 6: Evaluating our model

In [18]:
# Predicted class per test message (0 = ham, 1 = spam).
predictions

array([0, 0, 0, ..., 0, 0, 0])

In [19]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Each report line pairs its output template with the metric that fills it;
# every metric is called as metric(true_labels, predicted_labels).
metric_reports = [
    ('Accuracy: {}', accuracy_score),
    ('Precision: {}', precision_score),
    ('Recall: {}', recall_score),
    ('F1: {}', f1_score),
]
for report_template, metric in metric_reports:
    print(report_template.format(metric(y_test, predictions)))

Accuracy: 0.990134529148
Precision: 0.978873239437
Recall: 0.945578231293
F1: 0.961937716263
