# Naive Bayes

In [1]:
import math
from collections import defaultdict,Counter

import numpy as np
import pandas as pd

In [2]:
# load the raw reviews, keeping only the sentence text and its label
data = pd.read_csv("data/raw_data.csv")[['text','y']]

In [3]:
# preview the first rows of the frame loaded in the previous cell
data.head()

Unnamed: 0,text,y
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1


# Build Naive Bayes from scratch in Python

In [4]:
# split the data into train and test sets
def train_test(data,number_sample,train_frac=0.8,random_state=None):
    """Draw a two-ended sample from ``data`` and split it randomly into train/test.

    Half of the sample is taken from the top of ``data`` and half from the
    bottom (the frame is assumed to be ordered with one class first and the
    other last -- verify against the input file). The sample is shuffled and
    each row is then assigned to the train set with probability ``train_frac``,
    so the split sizes are only approximately ``train_frac`` / ``1 - train_frac``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``'text'`` and ``'y'``.
    number_sample : int
        Total number of rows to sample; clamped to ``len(data)``. Odd values
        are rounded down to the nearest even number.
    train_frac : float, default 0.8
        Per-row probability of landing in the train set.
    random_state : int or None, default None
        Seed for a private RNG that makes the shuffle and split reproducible.
        ``None`` keeps the previous behaviour of drawing from NumPy's global
        random state.

    Returns
    -------
    (train_data, test_data) : tuple of pandas.DataFrame
        Disjoint frames restricted to the ``'text'`` and ``'y'`` columns.
    """
    columns = ['text','y']
    number_sample = min(number_sample, len(data))
    half = max(int(number_sample / 2), 0)

    if half == 0:
        # A slice like data[-0:] would select *every* row, so the empty sample
        # has to be handled explicitly.
        empty = data.iloc[:0][columns]
        return empty, empty.copy()

    pos_sample = data.iloc[:half]
    neg_sample = data.iloc[-half:]
    sample = pd.concat([pos_sample, neg_sample])[columns]

    if random_state is None:
        shuffled = sample.sample(frac=1)
        msk = np.random.rand(len(shuffled)) < train_frac
    else:
        # One private generator drives both the shuffle and the mask so a given
        # seed always reproduces the same split.
        rng = np.random.RandomState(random_state)
        shuffled = sample.sample(frac=1, random_state=rng)
        msk = rng.rand(len(shuffled)) < train_frac

    return shuffled[msk], shuffled[~msk]

In [5]:
# get word dictionary and some statistics
def get_dict(data):
    """Build per-class word statistics from labelled sentences.

    Each sentence contributes every *distinct* whitespace-separated token once,
    so the stored value for a word is the number of sentences of that class
    containing it. Rows whose label equals 0 are treated as negative; every
    other label is treated as positive.

    Parameters
    ----------
    data : pandas.DataFrame
        First column is the sentence text, second column is the label.

    Returns
    -------
    vocabs : tuple(defaultdict, defaultdict)
        ``(pos_vocab, neg_vocab)`` mapping word -> count (stored as float).
    counts : tuple(int, int)
        Number of distinct words seen in the positive / negative class.
    total_words : tuple(int, int)
        Sum of all word counts in the positive / negative class.
    """
    pos_vocab = defaultdict(float)
    neg_vocab = defaultdict(float)
    pos_total_words = 0
    neg_total_words = 0

    for row in data.itertuples(False):
        sen = row[0]
        label = row[1]
        words = set(sen.split())
        # Compare by value: an identity check against the literal 0 only works
        # for small Python ints and misroutes labels such as 0.0.
        if label == 0:
            vocab = neg_vocab
            neg_total_words += len(words)
        else:
            vocab = pos_vocab
            pos_total_words += len(words)
        for w in words:
            vocab[w] += 1

    # word dictionaries for the positive and negative vocabulary
    vocabs = (pos_vocab,neg_vocab)

    # number of unique words per class
    counts = (len(pos_vocab),len(neg_vocab))

    # total number of counted words per class
    total_words = (pos_total_words,neg_total_words)
    return vocabs,counts,total_words

In [6]:
# get probability of a word in each dictionary (positive,negative)
def get_word_prob(word,arg):
    """Return the add-one smoothed probability of ``word`` for one class.

    ``arg`` is a ``(vocab, unique_word_count, total_word_count)`` triple; the
    result is ``(1 + count of word) / (unique_word_count + total_word_count)``,
    with a count of 0 for words missing from ``vocab``.
    """
    vocab, unique_words, all_words = arg
    # membership test first so a defaultdict vocab is never grown by a lookup
    occurrences = vocab[word] if word in vocab else 0
    denominator = unique_words + all_words
    return (1 + occurrences) / denominator

In [7]:
# get probability of a sentence in each dictionary (positive,negative)
def get_sentence_prob(s,arg):
    """Return the product of ``get_word_prob`` over the distinct words of ``s``.

    ``arg`` is the ``(vocab, unique_word_count, total_word_count)`` triple of a
    single class and is forwarded unchanged to ``get_word_prob``. A sentence
    with no tokens yields ``1``.
    """
    # unpack only to fail early on a malformed triple; the pieces are not used here
    _vocab, _unique_words, _all_words = arg
    distinct_tokens = set(s.split())
    likelihood = 1
    for token in distinct_tokens:
        likelihood = likelihood * get_word_prob(token,arg)
    return likelihood

In [8]:
# get probability of a sentence in each class
def get_prob(s,vocabs,counts,total_words):
    """Return the normalised positive / negative probabilities of sentence ``s``.

    Every distinct whitespace-separated word of ``s`` is scored once per class
    with the same add-one smoothing as ``get_word_prob``:
    ``(1 + count) / (unique_words + total_words)``. No class prior is applied.

    The per-class likelihoods are accumulated as sums of logarithms instead of
    a raw product. For long sentences the raw product underflows to 0.0 for
    both classes, which made the normalisation divide by zero.

    Parameters
    ----------
    s : str
        Sentence to score.
    vocabs : tuple
        ``(pos_vocab, neg_vocab)`` word -> count mappings.
    counts : tuple
        ``(pos_count, neg_count)`` numbers of distinct words per class.
    total_words : tuple
        ``(pos_total_words, neg_total_words)`` total word counts per class.

    Returns
    -------
    dict
        ``{'pos': p, 'neg': 1 - p}``.

    Raises
    ------
    ZeroDivisionError
        If ``s`` is non-empty and a class has ``count + total == 0``.
    """
    pos_vocab,neg_vocab = vocabs
    pos_count,neg_count = counts
    pos_total_words,neg_total_words = total_words

    words = set(s.split())

    def log_likelihood(vocab,count,total):
        # sum of log smoothed word probabilities for one class
        denominator = count + total
        return sum(
            math.log((1 + (vocab[w] if w in vocab else 0)) / denominator)
            for w in words
        )

    pos_log = log_likelihood(pos_vocab,pos_count,pos_total_words)
    neg_log = log_likelihood(neg_vocab,neg_count,neg_total_words)

    # pos / (pos + neg) == logistic(pos_log - neg_log); only ever exponentiate a
    # non-positive number so math.exp cannot overflow.
    diff = pos_log - neg_log
    if diff >= 0:
        pos = 1.0 / (1.0 + math.exp(-diff))
    else:
        ratio = math.exp(diff)
        pos = ratio / (1.0 + ratio)

    NB_prob = {}
    NB_prob['pos'] = pos
    NB_prob['neg'] = 1 - NB_prob['pos']
    return NB_prob

In [9]:
# predict data
def predict(test_data,vocabs,counts,total_words):
    """Classify one ``(text, label)`` row and report whether the guess is right.

    ``test_data[0]`` is scored with ``get_prob``; the predicted label is 1 when
    the positive probability is strictly greater than the negative one and 0
    otherwise. Returns ``True`` if that label equals ``test_data[1]``, else
    ``False`` -- note that the predicted label itself is not returned.
    """
    scores = get_prob(test_data[0],vocabs,counts,total_words)
    if scores['pos'] > scores['neg']:
        guess = 1
    else:
        guess = 0
    return True if guess == test_data[1] else False

In [10]:
# count correct / incorrect predictions and compute the accuracy
def get_metric(test_data,vocabs,counts,total_words):
    """Evaluate the classifier on ``test_data`` and return counts and accuracy.

    Each row of ``test_data`` is passed to ``predict`` together with the
    trained statistics.

    Returns
    -------
    (cnt, accuracy) : tuple
        ``cnt`` is a ``Counter`` with the string keys ``'True'`` (correct) and
        ``'False'`` (incorrect); a key is only present if it was hit at least
        once. ``accuracy`` is ``correct / total``, or ``nan`` when ``test_data``
        has no rows (previously this case raised ``ZeroDivisionError``).
    """
    cnt = Counter()
    for row in test_data.itertuples(False):
        outcome = 'True' if predict(row,vocabs,counts,total_words) else 'False'
        cnt[outcome] += 1

    evaluated = cnt['True'] + cnt['False']
    if evaluated == 0:
        # accuracy is undefined without any evaluated rows
        return cnt,float('nan')
    return cnt,(cnt['True'] / evaluated)

In [11]:
# create data
# Fix NumPy's global seed so the shuffle and the train/test mask drawn inside
# train_test (and therefore the accuracy reported below) are reproducible.
SEED = 42
np.random.seed(SEED)

number_sample = 10000
original_data = pd.read_csv("data/processed_data.csv")
train_data,test_data = train_test(original_data,number_sample)
vocabs,counts,total_words = get_dict(train_data)

# get accuracy metric
counter,acc = get_metric(test_data,vocabs,counts,total_words)

# display some prob result
for row in test_data.head(10).itertuples(False):
    print(predict(row,vocabs,counts,total_words))
    print(get_prob(row[0],vocabs,counts,total_words))
    print("real label : ", row[1])
print("="*80)
print("accuracy :", acc)
print(counter)


True
{'pos': 0.011074368445888398, 'neg': 0.9889256315541116}
real label :  0
True
{'pos': 0.14335148760538152, 'neg': 0.8566485123946185}
real label :  0
True
{'pos': 0.692711822642008, 'neg': 0.30728817735799197}
real label :  1
False
{'pos': 0.4761613975022759, 'neg': 0.5238386024977242}
real label :  1
False
{'pos': 0.7514528809204597, 'neg': 0.2485471190795403}
real label :  0
True
{'pos': 0.9920810475518269, 'neg': 0.007918952448173089}
real label :  1
True
{'pos': 0.6301972495807876, 'neg': 0.3698027504192124}
real label :  1
False
{'pos': 0.0840421551448212, 'neg': 0.9159578448551788}
real label :  1
False
{'pos': 0.18948698730374697, 'neg': 0.810513012696253}
real label :  1
True
{'pos': 0.846310162320438, 'neg': 0.153689837679562}
real label :  1
accuracy : 0.777667493796526
Counter({'True': 1567, 'False': 448})


# Use Naive Bayes model in Scikit-Learn

In [20]:
from sklearn.feature_extraction.text import HashingVectorizer,CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [33]:
# vectorize the text with a hashing vectorizer; it is never fit here, only
# transform() is applied to each split
vectorizer = HashingVectorizer(
    stop_words='english',
    alternate_sign=False,
)
y_train = train_data['y']
y_test = test_data['y']
X_train = vectorizer.transform(train_data['text'])
X_test = vectorizer.transform(test_data['text'])

In [34]:
# fit scikit-learn's multinomial Naive Bayes on the hashed features and
# report its accuracy on the test split
clf = MultinomialNB(alpha=.01)
clf.fit(X_train,y_train)

pred = clf.predict(X_test)
score = metrics.accuracy_score(y_test, pred)

print("Accuracy : ",score)

Accuracy :  0.7478908188585608


## Conclusion

My model, which uses an optimal bag-of-words representation, got slightly better accuracy than the scikit-learn model, which uses a hashing vectorizer with the default 1,048,576 features, but it takes more time to run.