# Natural Language Processing Assignment 1
## My Naive Bayes Classifier 

Name: Zoe Tagboto



In [213]:
import numpy as np
import re
from collections import Counter
from itertools import chain
import random
import string

<i> Below we define a function gathers the reviews from the file, cleans each line, and puts them in a dictionary based on the class of each review </i>

In [214]:
def read_file_to_dict(*filenames):
    all_classes = {}
    for filename in filenames:
        for line in open(filename):
            clean_line = re.sub(r"[""\n\t!'';:&*():?%$#+]","",line.lower())
            review = re.sub(r"[/,.-]"," ",clean_line)
            val = int(review[-1][-1])
            sigh = re.sub(r"[0-9]+","",review[:-1])
            if val in all_classes:
                all_classes[val].append(sigh.split())
            else:
                all_classes[val] = [sigh.split()]
    random.shuffle(all_classes[0])
    random.shuffle(all_classes[1])
    return all_classes


<i>Below we define the function to split our data into test and training sets</i>

In [215]:
def split_train_and_test(all_classes):
    total_amount = len(all_classes[0])
    train_amount = round(0.8* total_amount)
    
    test_classes =  {0:all_classes[0][train_amount:total_amount],1:all_classes[1][train_amount:total_amount]}
    train_classes = {0:all_classes[0][0:train_amount],1:all_classes[1][0:train_amount]}
    
    return test_classes, train_classes
                                                

<i> Here we define a function that creates a dictionary for each class that contains each word in the class and how often they are used </i>

In [216]:
def class_dict(all_classes, index):
    words = list(chain.from_iterable(all_classes[index]))
    type_class = Counter(words)

    return(type_class)


<i> Here we calculate the log prior which uses the following equation </i>

$$\log \frac{N_c}{N_{doc}}$$

In [217]:
def log_prior(train_classes):
    pos_log_prior = np.log(len(train_classes[0])
                         /(len(train_classes[0])+len(train_classes[1])))
    neg_log_prior = np.log(len(train_classes[1])/(len(train_classes[0])+len(train_classes[1])))
    
    return pos_log_prior, neg_log_prior
    

<i> Below we define a function that creates a dictionary with a count of every word that occurs in the file </i>

In [218]:
def complete_vocab_list(pos_dict, neg_dict):
    total_dict = pos_dict.copy()   # start with x's keys and values
    for key in neg_dict:
        if key in total_dict:
            total_dict[key] += neg_dict[key]
        else:
            total_dict[key] = neg_dict[key]
            
       
    return total_dict 

<i>Below we define the function to calculate the logLikelihood using the following equation:</i>
$$\log \frac{count(w_{i}, c)+1}{\sum_{w\in v}^{}(count(w, c)+1)}$$


In [219]:
def log_likelihood_class(total_dict,Class, word):
    if word in Class.keys():
        numerator = Class[word] +1
    else:
        numerator = 1   
    
    
    denominator= len(set(total_dict.keys()) - set(Class.keys()))+len(Class.keys())
    denominator+= sum(Class.values())
    
    log_likelihood =  np.log(numerator/denominator)
    
    
    
    return log_likelihood



In [220]:
def train(total_dict, pos_class, neg_class):
    likelihood_dict = {}
    for key in total_dict.keys():
        likelihood_dict[key]=(log_likelihood_class(total_dict,pos_class, key),log_likelihood_class(total_dict,neg_class, key))
        
    return likelihood_dict  

In [221]:
def test(test_classes):     
    percent_correct = 0 
    total = 0
    
    for line in test_classes[0]:
        is_pos = 0 
        is_neg = 0
        total+=1
        for word in line: 
            if word in likelihood_dict.keys():
                is_pos +=likelihood_dict[word][0]
                is_neg +=likelihood_dict[word][1]
        if is_neg> is_pos:
            percent_correct+=1
            
    for line in test_classes[1]:
        is_pos = 0 
        is_neg = 0
        total+=1
        for word in line: 
            if word in likelihood_dict.keys():
                is_pos +=likelihood_dict[word][0]
                is_neg +=likelihood_dict[word][1]
        if is_neg< is_pos:
            percent_correct+=1
    accuracy = (percent_correct/total)*100
    print("my accuracy is", accuracy, "woo hoo!")
    print("the total is", total)
    print("correct", percent_correct)
                
    
    

In [222]:

all_classes = read_file_to_dict('amazon_cells_labelled.txt', 'imdb_labelled.txt','yelp_labelled.txt')
train_classes, test_classes = split_train_and_test(all_classes)
pos_class = class_dict(train_classes,1)
neg_class = class_dict(train_classes,0)
total_dict = complete_vocab_list(pos_class, neg_class)
pos_log_prior, neg_log_prior = log_prior(train_classes)
likelihood_dict  = train(total_dict, pos_class, neg_class)
test(test_classes)

import collections
collections.OrderedDict(sorted(likelihood_dict.items()))
#sorted(likelihood_dict.iterkeys())

my accuracy is 75.125 woo hoo!
the total is 2400
correct 1803


OrderedDict([('"', (-7.5728458617595571, -7.2436916577714694)),
             ('"big', (-7.9783109698677217, -8.6299860188913602)),
             ('"collect"', (-8.6714581504276662, -7.9368388383314139)),
             ('"film"', (-8.6714581504276662, -7.9368388383314139)),
             ('"hello', (-8.6714581504276662, -7.9368388383314139)),
             ('"my', (-8.6714581504276662, -7.9368388383314139)),
             ('"never', (-7.9783109698677217, -8.6299860188913602)),
             ('"out', (-8.6714581504276662, -7.9368388383314139)),
             ('"real', (-8.6714581504276662, -7.9368388383314139)),
             ('"smack', (-7.9783109698677217, -8.6299860188913602)),
             ('"tiny', (-7.9783109698677217, -8.6299860188913602)),
             ('"turkey', (-7.9783109698677217, -8.6299860188913602)),
             ('"what"', (-8.6714581504276662, -7.9368388383314139)),
             ('"you', (-8.6714581504276662, -7.5313737302232502)),
             ('a', (-3.9092842156299108, -4.18