In [2]:
from sklearn import datasets
from sklearn import metrics
import numpy as np
import pandas as pd
import math
import random
from sklearn.naive_bayes import GaussianNB

In [3]:
# Names of the features, stored as a NumPy string array.
feature_names = np.asarray(('word', 'negative', 'positive'))

In [4]:
# Read the labelled reviews (one per line), shuffle them and split train/test.
# The context manager closes the file handle even if reading fails.
with open('simple-food-reviews.txt', 'r') as data_file:
    data_string = data_file.read()

# Drop blank entries (e.g. the empty string a trailing newline leaves behind):
# later cells read the label via review[0] and would fail on "".
full_data = np.array([line for line in data_string.split('\n') if line.strip()])

# NOTE: no seed is set in this cell, so the split differs from run to run.
np.random.shuffle(full_data)
training_data = full_data[0:12]
# Start the test split directly after the training split so no review is
# left out of both (the previous 13:19 slice skipped index 12).
# NOTE(review): presumably the file holds 18 reviews, giving 6 test items — confirm.
test_data = full_data[12:]

In [5]:
def word_count(data):
    """Count how often each whitespace-separated token occurs across ``data``.

    Parameters
    ----------
    data : sequence of str
        Text lines; every token of every line is counted.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        The unique tokens as returned by ``np.unique`` and, in matching
        order, the number of times each one occurs.
    """
    # Splitting line by line yields the same tokens as joining all lines
    # with spaces and splitting the result once.
    tokens = [token for line in data for token in line.split()]
    unique_tokens, occurrences = np.unique(tokens, return_counts=True)
    return (unique_tokens, occurrences)

# Display the unique tokens of the training split and their counts; the whole
# line is tokenised, so whatever leads each line is counted as a token too.
word_count(training_data)


(array(['-1', '1', 'a', 'and', 'at', 'avoid', 'bad', 'enjoyed',
        'experience', 'food', 'great', 'had', 'i', 'is', 'lovely', 'meal',
        'my', 'really', 'restaurant', 'service', 'tasted', 'terrible',
        'the', 'this', 'was', 'we', 'what'], dtype='<U10'),
 array([7, 5, 4, 1, 1, 2, 2, 1, 1, 4, 2, 1, 1, 3, 2, 2, 1, 1, 5, 2, 1, 3,
        8, 3, 3, 1, 1]))

In [10]:
# Class priors and per-word class shares
def priors(data):
    """Estimate class priors and per-word class shares from labelled reviews.

    Every review is a string whose first whitespace-separated token is its
    label: a label starting with "-" marks a negative review, anything else a
    positive one.  The label token is removed explicitly from each review
    rather than by slicing the sorted unique arrays, so a data set that lacks
    one class (or contains a token that sorts before the labels) no longer
    loses a real word.  Blank entries are ignored.

    Parameters
    ----------
    data : sequence of str
        Labelled review lines, e.g. ``"-1 the meal was terrible"``.

    Returns
    -------
    prob_neg_rev : float
        Fraction of the labelled reviews that are negative.
    prob_pos_rev : float
        Fraction of the labelled reviews that are positive.
    prob_pos : numpy.ndarray
        For each word, occurrences in positive reviews divided by its total
        occurrences.
    prob_neg : numpy.ndarray
        For each word, occurrences in negative reviews divided by its total
        occurrences.

    Both arrays are aligned with the sorted unique non-label words, which is
    the order of ``word_count(data)[0][2:]`` whenever both labels are present
    and sort first.

    Raises
    ------
    ZeroDivisionError
        If ``data`` contains no labelled review.
    """
    negative_marker = "-"
    negative_tokens = []
    positive_tokens = []
    n_c = 0
    p_c = 0

    # Separate the reviews by class, dropping each review's label token.
    for review in data:
        tokens = review.split()
        if not tokens:
            continue  # blank line: nothing to classify
        label, body = tokens[0], tokens[1:]
        if label.startswith(negative_marker):
            negative_tokens.extend(body)
            n_c += 1
        else:
            positive_tokens.extend(body)
            p_c += 1

    # Overall class probabilities
    total_reviews = n_c + p_c
    prob_neg_rev = n_c / total_reviews
    prob_pos_rev = p_c / total_reviews

    # Vocabulary (sorted by np.unique) and total occurrences of every word.
    words, total_wc = np.unique(negative_tokens + positive_tokens, return_counts=True)

    def class_share(class_tokens):
        """Return, per vocabulary word, its count in ``class_tokens`` / total count."""
        share = np.zeros(words.shape[0])
        if class_tokens:
            seen, seen_counts = np.unique(class_tokens, return_counts=True)
            # `words` is sorted and contains every element of `seen`, so a
            # binary search yields each seen word's exact position.
            where = np.searchsorted(words, seen)
            share[where] = seen_counts / total_wc[where]
        return share

    prob_neg = class_share(negative_tokens)
    prob_pos = class_share(positive_tokens)

    return prob_neg_rev, prob_pos_rev, prob_pos, prob_neg

# Display the tuple returned by priors for the training split, in its return
# order: (prob_neg_rev, prob_pos_rev, prob_pos, prob_neg).
priors(training_data)  

(0.5833333333333334,
 0.4166666666666667,
 array([0.75      , 0.        , 1.        , 0.        , 0.        ,
        1.        , 1.        , 0.25      , 1.        , 1.        ,
        1.        , 0.33333333, 1.        , 0.5       , 1.        ,
        0.        , 0.6       , 0.        , 1.        , 0.        ,
        0.25      , 0.33333333, 0.        , 1.        , 1.        ]),
 array([0.25      , 1.        , 0.        , 1.        , 1.        ,
        0.        , 0.        , 0.75      , 0.        , 0.        ,
        0.        , 0.66666667, 0.        , 0.5       , 0.        ,
        1.        , 0.4       , 1.        , 0.        , 1.        ,
        0.75      , 0.66666667, 1.        , 0.        , 0.        ]))

In [12]:
# Step-by-step walkthrough of the per-word class shares on the training split,
# printing every intermediate array.
# NOTE: the [2:] and [1:] slices drop the label tokens by position, i.e. they
# rely on '-1' and '1' being the first entries of the sorted unique arrays.
words, total_wc = (counted[2:] for counted in word_count(training_data))

neg = "-"
wc = np.zeros(len(words))

# Concatenate the reviews of each class into one space-separated string;
# a review is negative when its first character is "-".
negative_review = " " + "".join(
    review + " " for review in training_data if review[0] == neg
)
positive_review = " " + "".join(
    review + " " for review in training_data if review[0] != neg
)

# Unique tokens and their counts per class, minus the leading label token.
nr, nr_c = (counted[1:] for counted in np.unique(negative_review.split(), return_counts=True))
pr, pr_c = (counted[1:] for counted in np.unique(positive_review.split(), return_counts=True))

print(wc)
print()

# Each group prints its two arrays followed by an empty line.
print(words, total_wc, "", sep="\n")
print(nr, nr_c, "", sep="\n")
print(pr, pr_c, "", sep="\n")

prob_neg = np.zeros(len(words))
prob_pos = np.zeros(len(words))

# Look-up tables: token -> number of occurrences within the class.
negative_counts = dict(zip(nr, nr_c))
positive_counts = dict(zip(pr, pr_c))

# Share of each word's occurrences that falls into each class.
for i, word in enumerate(words):
    if word in negative_counts:
        prob_neg[i] = negative_counts[word] / total_wc[i]
    if word in positive_counts:
        prob_pos[i] = positive_counts[word] / total_wc[i]

print(prob_neg)
print()
print(prob_pos)
print("\n")
# Element-wise sum of the two share arrays.
print(prob_neg + prob_pos)


[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0.]

['a' 'and' 'at' 'avoid' 'bad' 'enjoyed' 'experience' 'food' 'great' 'had'
 'i' 'is' 'lovely' 'meal' 'my' 'really' 'restaurant' 'service' 'tasted'
 'terrible' 'the' 'this' 'was' 'we' 'what']
[4 1 1 2 2 1 1 4 2 1 1 3 2 2 1 1 5 2 1 3 8 3 3 1 1]

['a' 'and' 'avoid' 'bad' 'food' 'is' 'meal' 'really' 'restaurant'
 'service' 'terrible' 'the' 'this' 'was']
[1 1 2 2 3 2 1 1 2 2 3 6 2 3]

['a' 'at' 'enjoyed' 'experience' 'food' 'great' 'had' 'i' 'is' 'lovely'
 'meal' 'my' 'restaurant' 'tasted' 'the' 'this' 'we' 'what']
[3 1 1 1 1 2 1 1 1 2 1 1 3 1 2 1 1 1]

[0.25       1.         0.         1.         1.         0.
 0.         0.75       0.         0.         0.         0.66666667
 0.         0.5        0.         1.         0.4        1.
 0.         1.         0.75       0.66666667 1.         0.
 0.        ]

[0.75       0.         1.         0.         0.         1.
 1.         0.25       1.         1.         1.   

In [91]:
def do_laplace_smoothing(data):
    """Return the number of distinct tokens found in the negative reviews.

    A review is negative when it starts with "-".  Its label token is part of
    the review text and is therefore counted as a token as well.

    The function no longer reads the notebook globals ``words`` and ``neg``,
    so it works on a fresh kernel regardless of which cells ran before it.

    NOTE(review): despite its name, no smoothing is applied here; the function
    only returns a vocabulary size — presumably a work in progress.

    Parameters
    ----------
    data : sequence of str
        Labelled review lines, e.g. ``"-1 the meal was terrible"``.

    Returns
    -------
    int
        Count of unique whitespace-separated tokens over all negative reviews.
    """
    negative_marker = "-"
    negative_tokens = set()

    for review in data:
        # startswith (rather than review[0]) also tolerates an empty string.
        if review.startswith(negative_marker):
            negative_tokens.update(review.split())

    return len(negative_tokens)
   
# Display the number of distinct tokens in the negative training reviews.
do_laplace_smoothing(training_data)

18

In [26]:
# Draw a fresh training split: reshuffle all reviews in place, then copy out
# the first 12 of them and show the result together with its length.
np.random.shuffle(full_data)
training_data = full_data[:12].copy()
print(training_data, len(training_data))

['1 i enjoyed the experience at the restaurant' '1 the service was great'
 '1 i really enjoyed my food' '-1 the meal was terrible'
 '-1 this is a bad restaurant  ' '-1 avoid this restaurant'
 '1 the food is lovely'
 '1 the food the service and the restaurant was great'
 '1 my food tasted great' '-1 the food was really bad'
 '1 what a lovely restaurant' '-1 the service is terrible'] 12


In [66]:
# NOTE(review): count_appearance is not defined in the visible part of this
# notebook; presumably it returns the numbers of negative and positive reviews.
# Confirm, and make sure the cell defining it runs before this one.
negative, positive = count_appearance(training_data)

In [67]:
# Divide both counts by the number of training reviews.
# NOTE: this rebinds prob_neg / prob_pos, which an earlier cell used for the
# per-word arrays.
prob_neg, prob_pos = (count / len(training_data) for count in (negative, positive))


In [68]:
# Show both values, one per line.
print(prob_neg, prob_pos, sep="\n")

0.4166666666666667
0.5833333333333334


In [85]:
def word_prob(data, keywords=('lovely', 'enjoyed', 'great', 'avoid', 'terrible', 'awful', 'bad')):
    """Count, for every keyword, the reviews that contain it.

    Each keyword is checked independently.  The previous ``elif`` chain
    stopped at the first matching keyword, so a review containing several
    keywords was only counted for one of them.

    Matching is a substring test (``keyword in review``), as before, so a
    keyword embedded in a longer word also counts.

    Parameters
    ----------
    data : sequence of str
        Review lines.
    keywords : sequence of str, optional
        Keywords to look for.  The default list and its order match the
        counters this function has always returned.

    Returns
    -------
    tuple of int
        One count per keyword, in the order of ``keywords``; with the default
        list that is (lovely, enjoyed, great, avoid, terrible, awful, bad).
    """
    return tuple(
        sum(1 for review in data if keyword in review)
        for keyword in keywords
    )

In [86]:
# Display the per-keyword review counts for the training split, in the order
# (lovely, enjoyed, great, avoid, terrible, awful, bad).
word_prob(training_data)

(2, 2, 3, 1, 2, 0, 2)