In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [7]:
# 'latin-1' is the canonical codec name. The previous spelling 'latin=1' only
# resolved because Python normalizes punctuation in codec names.
df = pd.read_csv('spam.csv', encoding='latin-1')

print(df.head())
df.shape

     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


(5572, 5)

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# reproducible. Column v2 is passed as the data and column v1 as the labels.
data_train, data_test, labels_train, labels_test = train_test_split(
    df["v2"],
    df["v1"],
    test_size=0.2,
    random_state=0,
)

# Peek at the first rows of the training data and the training labels.
for split in (data_train, labels_train):
    print(split.head())

1114    No no:)this is kallis home ground.amla home to...
3589    I am in escape theatre now. . Going to watch K...
3095    We walked from my moms. Right on stagwood pass...
1012       I dunno they close oredi not... ÌÏ v ma fan...
3320                               Yo im right by yo work
Name: v2, dtype: object
1114    ham
3589    ham
3095    ham
1012    ham
3320    ham
Name: v1, dtype: object


In [13]:
# count vocabulary 
def fit(data):
    """Build the vocabulary of whitespace-separated tokens found in ``data``.

    Parameters
    ----------
    data : iterable of str
        Documents to scan. Each document is tokenized with ``str.split()``.

    Returns
    -------
    list of str
        Every distinct token, sorted. Sorting makes the token -> index mapping
        reproducible across interpreter runs; iterating a set of strings
        directly yields an order that depends on string hash randomization.
    """
    vocab_set = set()
    for doc in data:
        vocab_set.update(doc.split())
    return sorted(vocab_set)
            
#     print (words)
# Build the vocabulary from the training split only (data_train).
vocab_list = fit(data_train)
# print(vocab_list)

In [30]:
# transform vocab count
def transform(vocab_list, data):
    """Turn one document into a bag-of-words count vector.

    Parameters
    ----------
    vocab_list : list of str
        Vocabulary; position ``i`` of the result counts ``vocab_list[i]``.
    data : str
        A single document, tokenized with ``str.split()``.

    Returns
    -------
    numpy.ndarray
        Float vector of length ``len(vocab_list)`` holding token counts.
        Tokens that are not in the vocabulary are ignored.
    """
    # One dict lookup per token instead of two linear scans of the vocabulary
    # (`in` + `.index`). setdefault keeps the first occurrence of a word,
    # which matches what list.index would have returned.
    index_of = {}
    for position, word in enumerate(vocab_list):
        index_of.setdefault(word, position)

    word_vector = np.zeros(len(vocab_list))
    for word in data.split():
        position = index_of.get(word)
        if position is not None:
            word_vector[position] += 1
    return word_vector

#small test
# Quick sanity check on a short phrase: print its vector and its total count.
res = transform(vocab_list, "a great problem")
print(res)
print(res.sum(axis=0))

# Vectorize every training document into a list of count vectors.
print(type(data_train))
train_matrix = [transform(vocab_list, doc) for doc in data_train.values]
    
#CountVectorizer

[ 0.  0.  0. ...,  0.  0.  0.]
3.0
<class 'pandas.core.series.Series'>


In [87]:
def NaiveBayes_train(train_matrix, label_train):
    """Estimate Naive Bayes parameters from word-count vectors.

    Per-word counts start at one and the per-class totals start at the
    vocabulary size (add-one smoothing), so no word gets probability zero.

    Parameters
    ----------
    train_matrix : sequence of 1-D count vectors
        One vector per document, all of the same length.
    label_train : sequence of labels
        Labels aligned with ``train_matrix`` by position. A label equal to
        ``"spam"`` counts as spam; any other value counts as ham. Labels
        beyond ``len(train_matrix)`` are ignored.

    Returns
    -------
    tuple
        ``(log P(word | spam), log P(word | ham), log P(spam), log P(ham))``;
        the first two are vectors with one entry per word, the last two are
        scalars. If a class has no documents its log prior is ``-inf``.

    Raises
    ------
    IndexError
        If ``train_matrix`` is empty or there are fewer labels than documents.
    """
    num_docs = len(train_matrix)
    num_words = len(train_matrix[0])
    if len(label_train) < num_docs:
        raise IndexError(
            "label_train has fewer entries than train_matrix "
            "({} < {})".format(len(label_train), num_docs)
        )

    spam_vector_count = np.ones(num_words)
    ham_vector_count = np.ones(num_words)
    spam_total_count = num_words
    ham_total_count = num_words

    spam_count = 0
    ham_count = 0

    # zip pairs documents and labels by position, so a pandas Series with a
    # shuffled index is handled the same way as a plain list or array.
    for word_counts, label in zip(train_matrix, label_train):
        if label == "spam":
            spam_vector_count += word_counts
            spam_total_count += np.sum(word_counts)
            spam_count += 1
        else:
            ham_vector_count += word_counts
            ham_total_count += np.sum(word_counts)
            ham_count += 1

    p_spam_vector = spam_vector_count / spam_total_count
    p_ham_vector = ham_vector_count / ham_total_count
    p_spam = spam_count / num_docs
    p_ham = ham_count / num_docs

    return np.log(p_spam_vector), np.log(p_ham_vector), np.log(p_spam), np.log(p_ham)
    
# print(len(labels_train))
# print(labels_train)
# print(len(train_matrix))
# print(len(train_matrix[0]))

# Pass the labels as a plain array (.values) so they line up with train_matrix
# by position rather than by the shuffled pandas index left by the split.
p_spam_vector, p_ham_vector, p_spam, p_ham = NaiveBayes_train(train_matrix, labels_train.values)

4457
4457
ham


In [91]:
def predict(p_spam_vector, p_ham_vector, p_spam, p_ham, test_word_vector):
    """Classify one document vector as ``'spam'`` or ``'ham'``.

    Parameters
    ----------
    p_spam_vector, p_ham_vector : array-like
        Per-word log-probabilities for each class.
    p_spam, p_ham : float
        Log prior of each class.
    test_word_vector : array-like
        Word counts of the document, aligned with the probability vectors.

    Returns
    -------
    str
        ``'spam'`` when the spam log-score is strictly greater than the ham
        log-score, otherwise ``'ham'`` (ties go to ham).
    """
    # Log-score per class: counts . log P(word | class) + log P(class).
    # np.dot does the multiply-and-sum in one vectorized call instead of
    # iterating the product array element by element with the builtin sum.
    spam = np.dot(test_word_vector, p_spam_vector) + p_spam
    ham = np.dot(test_word_vector, p_ham_vector) + p_ham
    return 'spam' if spam > ham else 'ham'
    
# Vectorize each test document with the training vocabulary and classify it.
predictions = [
    predict(p_spam_vector, p_ham_vector, p_spam, p_ham, transform(vocab_list, doc))
    for doc in data_test.values
]

In [93]:
# Show how many predictions were made, then the predicted labels themselves.
print(len(predictions), predictions, sep="\n")

1115
['ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'spam', 'ham', 'spam', 'ham', 'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'ham', 'spam', 'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 

In [94]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score

# accuracy_score only returns the value. Because the call is not the last
# expression in the cell, it has to be printed to appear in the output.
print(accuracy_score(labels_test, predictions))
print(classification_report(labels_test, predictions))
print(confusion_matrix(labels_test, predictions))

             precision    recall  f1-score   support

        ham       0.98      1.00      0.99       949
       spam       0.98      0.86      0.92       166

avg / total       0.98      0.98      0.98      1115

[[946   3]
 [ 23 143]]
