In [None]:
## From the NLP Specialization; coded by Trishit Nath Thakur

# Naive Bayes

In [None]:
from utils import process_tweet, lookup
import pdb
from nltk.corpus import stopwords, twitter_samples
import numpy as np
import pandas as pd
import nltk
import string
from nltk.tokenize import TweetTokenizer
from os import getcwd

In [None]:
# Download the NLTK data packages this notebook reads from: the stop-word
# list and the Twitter sample corpus (both imported from nltk.corpus above).
nltk.download('stopwords')
nltk.download('twitter_samples')

In [None]:
# Read the raw tweet strings for each sentiment class from the Twitter
# sample corpus, positive file first and negative file second.
all_positive_tweets, all_negative_tweets = (
    twitter_samples.strings(corpus_file)
    for corpus_file in ('positive_tweets.json', 'negative_tweets.json')
)

In [None]:
# The first 4000 tweets of each class are used for training; whatever
# remains after index 4000 is held out for testing.
train_pos, test_pos = all_positive_tweets[:4000], all_positive_tweets[4000:]
train_neg, test_neg = all_negative_tweets[:4000], all_negative_tweets[4000:]

# Positives come first, then negatives, in both splits.
test_x = test_pos + test_neg
train_x = train_pos + train_neg

In [None]:
# Labels follow the same order as train_x / test_x: a 1.0 for every positive
# tweet followed by a 0.0 for every negative tweet.
train_y = np.concatenate((np.ones(len(train_pos)), np.zeros(len(train_neg))))
test_y = np.concatenate((np.ones(len(test_pos)), np.zeros(len(test_neg))))

In [None]:
## count_tweets() takes a list of tweets and their labels, cleans each tweet, and returns a dictionary mapping each (word, label) pair to its count.

In [None]:
def count_tweets(result, tweets, ys):
    '''
    Count how often each processed word occurs under each sentiment label.

    Input:
        result: a dictionary that will be used to map each pair to its frequency;
            it is updated in place, so existing counts are accumulated into
        tweets: a list of tweets
        ys: a list corresponding to the sentiment of each tweet (either 0 or 1)
    Output:
        result: a dictionary mapping each (word, label) pair to its frequency
            (the same object that was passed in)
    '''
    for y, tweet in zip(ys, tweets):
        # Process the current tweet only. Passing the whole `tweets` list here
        # would hand process_tweet a list instead of a single string.
        for word in process_tweet(tweet):
            pair = (word, y)
            result[pair] = result.get(pair, 0) + 1

    return result

In [None]:
# Toy example: one positive tweet followed by four negative ones. The
# returned dictionary is the cell's displayed output.
result = dict()
tweets = [
    'i am happy',
    'i am tricked',
    'i am sad',
    'i am tired',
    'i am tired',
]
ys = [1, 0, 0, 0, 0]
count_tweets(result, tweets, ys)

In [None]:
## train model using naive bayes

In [None]:
def train_naive_bayes(freqs, train_x, train_y):
    '''
    Fit a Naive Bayes sentiment model from (word, label) counts.

    Input:
        freqs: dictionary from (word, label) to how often the word appears
        train_x: a list of tweets (not read by this function; the word counts
            come from freqs)
        train_y: a list of labels corresponding to the tweets (0,1)
    Output:
        logprior: the log prior, log(D_pos) - log(D_neg), where D_pos / D_neg
            are the numbers of positive / negative documents in train_y
        loglikelihood: dictionary mapping every vocabulary word to
            log(P(word | positive) / P(word | negative)), using add-one smoothing
    '''
    loglikelihood = {}

    # V, the number of unique words in the vocabulary
    vocab = {pair[0] for pair in freqs}
    V = len(vocab)

    # N_pos / N_neg: total number of word occurrences seen under each label
    N_pos = N_neg = 0
    for pair, count in freqs.items():
        if pair[1] > 0:
            N_pos += count
        else:
            N_neg += count

    # number of positive and negative documents
    D_pos = sum(1 for label in train_y if label > 0)
    D_neg = sum(1 for label in train_y if label <= 0)

    # the log prior
    logprior = np.log(D_pos) - np.log(D_neg)

    for word in vocab:
        freq_pos = lookup(freqs, word, 1)
        freq_neg = lookup(freqs, word, 0)

        # Smoothed probability of the word under each class; the +1 in the
        # numerator and +V in the denominator keep a word that was never seen
        # with one label from producing a zero probability.
        p_w_pos = (freq_pos + 1) / (N_pos + V)
        p_w_neg = (freq_neg + 1) / (N_neg + V)

        loglikelihood[word] = np.log(p_w_pos / p_w_neg)

    return logprior, loglikelihood

In [None]:
## testing naive bayes

In [None]:
def naive_bayes_predict(tweet, logprior, loglikelihood):
    '''
    Score a single tweet with a trained Naive Bayes model.

    Input:
        tweet: a string
        logprior: a number
        loglikelihood: a dictionary of words mapping to numbers
    Output:
        the logprior plus the loglikelihood of every processed word of the
        tweet that has an entry in loglikelihood; other words add nothing
    '''
    score = 0 + logprior

    for token in process_tweet(tweet):
        # Words without an entry in the model are skipped.
        if token not in loglikelihood:
            continue
        score += loglikelihood[token]

    return score

In [None]:
# Nothing earlier in the notebook builds `freqs` or calls train_naive_bayes,
# so fit the model here; otherwise `logprior` and `loglikelihood` are
# undefined on a fresh kernel and the prediction below raises NameError.
freqs = count_tweets({}, train_x, train_y)
logprior, loglikelihood = train_naive_bayes(freqs, train_x, train_y)

# Score one example tweet with the trained model.
my_tweet = 'She smiled.'
p = naive_bayes_predict(my_tweet, logprior, loglikelihood)
print('The expected output is', p)

In [None]:
def test_naive_bayes(test_x, test_y, logprior, loglikelihood):
    """
    Input:
        test_x: A list of tweets
        test_y: the corresponding labels for the list of tweets
        logprior: the logprior
        loglikelihood: a dictionary with the loglikelihoods for each word
    Output:
        accuracy: (# of tweets classified correctly)/(total # of tweets)
    """
    accuracy = 0
    
    y_hats = []
    
    for tweet in test_x:
        if naive_bayes_predict(tweet, logprior, loglikelihood) > 0:
            y_hat_i = 1
        
        else:
            y_hat_i = 0
            
        y_hats.append(y_hat_i)
    
    error = np.mean(np.absolute(y_hats - test_y))
    
    accuracy = 1 - error
    
    return accuracy
            

In [None]:
# Score a few hand-written tweets with the trained model and print each
# score to two decimal places.
for tweet in ('I am happy', 'I am bad', 'this movie should have been great.'):
    p = naive_bayes_predict(tweet, logprior, loglikelihood)
    print(f'{tweet} -> {p:.2f}')

In [None]:
## Filter words by Ratio of positive to negative counts

In [None]:
def get_ratio(freqs, word):
    '''
    Look up a word's positive and negative counts and their smoothed ratio.

    Input:
        freqs: dictionary containing the words, keyed by (word, label)
        word: the word to look up

    Output: a dictionary with keys 'positive', 'negative', and 'ratio'.
        Example: {'positive': 10, 'negative': 20, 'ratio': 0.5}
        'ratio' is (positive + 1) / (negative + 1); the +1 on both sides avoids
        dividing by zero for a word that has no negative count.
    '''
    positive = lookup(freqs, word, 1)
    negative = lookup(freqs, word, 0)

    # The key is spelled 'negative', matching the documented output above.
    return {
        'positive': positive,
        'negative': negative,
        'ratio': (positive + 1) / (negative + 1),
    }