In [2]:
from utils_Bayes import process_tweet, lookup
import pdb
from nltk.corpus import stopwords, twitter_samples
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import TweetTokenizer
from os import getcwd

In [3]:
# Download the NLTK data packages used by this notebook. `twitter_samples` is
# read when the tweets are loaded; `stopwords` is presumably required by
# process_tweet -- verify in utils_Bayes.
nltk.download('stopwords')
nltk.download('twitter_samples')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rishabhkaushik/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package twitter_samples to
[nltk_data]     /Users/rishabhkaushik/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.


True

In [4]:
# Register the `tmp2` folder that sits next to the working directory (it holds
# pre-downloaded corpora files) as an extra location on nltk's data path.
filePath = getcwd() + "/../tmp2/"
nltk.data.path.append(filePath)

In [5]:
# Number of tweets per class used for training; whatever remains in each
# class is held out for testing. Defined once so the four slices below
# cannot drift apart.
N_TRAIN_PER_CLASS = 4000

# get the sets of positive and negative tweets
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

# split the data into two pieces, one for training and one for testing (validation set)
train_pos = all_positive_tweets[:N_TRAIN_PER_CLASS]
test_pos = all_positive_tweets[N_TRAIN_PER_CLASS:]
train_neg = all_negative_tweets[:N_TRAIN_PER_CLASS]
test_neg = all_negative_tweets[N_TRAIN_PER_CLASS:]

# positives first, then negatives -- the label arrays below follow the same order
train_x = train_pos + train_neg
test_x = test_pos + test_neg

# avoid assumptions about the length of all_positive_tweets:
# label sizes are derived from the actual splits (1 = positive, 0 = negative)
train_y = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg)))
test_y = np.append(np.ones(len(test_pos)), np.zeros(len(test_neg)))

# every tweet must have exactly one label
assert len(train_x) == len(train_y), "train_x and train_y are misaligned"
assert len(test_x) == len(test_y), "test_x and test_y are misaligned"

In [6]:
# Sample tweet containing a retweet marker, @-mentions, an emoticon, hashtags
# and a URL, used to inspect what process_tweet returns for such input.
custom_tweet = "RT @Twitter @chapagain Hello There! Have a great day. :) #good #morning http://chapagain.com.np"

# print cleaned tweet
print(process_tweet(custom_tweet))

['hello', 'great', 'day', ':)', 'good', 'morn']


In [7]:
def count_tweets(result, tweets, ys):
    '''
    Tally how often each (word, label) pair occurs in a labelled set of tweets.

    Input:
        result: a dictionary of (word, label) -> count entries; it is updated
                in place, so counts already present are kept and extended
        tweets: a list of tweets
        ys: a list with the sentiment of each tweet (either 0 or 1), aligned with tweets
    Output:
        result: the same dictionary object, mapping each pair to its frequency
    '''
    for label, text in zip(ys, tweets):
        for token in process_tweet(text):
            # the key is the (word, label) tuple
            key = (token, label)

            # unseen keys start from 0, so one statement covers both the
            # "new key" and the "existing key" case
            result[key] = result.get(key, 0) + 1
    return result

In [8]:
# Small hand-made example: five short tweets, the first labelled 1 and the
# remaining four labelled 0.
result = {}
tweets = ['i am happy', 'i am tricked', 'i am sad', 'i am tired', 'i am tired']
ys = [1, 0, 0, 0, 0]
# count_tweets fills `result` in place and returns it; as the last expression
# of the cell, the returned dictionary is what gets displayed.
count_tweets(result, tweets, ys)

{('happi', 1): 1, ('trick', 0): 1, ('sad', 0): 1, ('tire', 0): 2}

In [9]:
# Build the freqs dictionary: (word, label) -> count over the training tweets,
# starting from an empty dictionary.

freqs = count_tweets({}, train_x, train_y)

In [10]:
def train_naive_bayes(freqs, train_x, train_y):
    '''
    Fit the parameters of a binary Naive Bayes sentiment classifier.

    Input:
        freqs: dictionary mapping (word, label) pairs to their counts
        train_x: a list of tweets (not used by the computation; kept in the
                 signature so existing calls keep working)
        train_y: labels of the tweets (1 = positive, 0 = negative); any
                 array-like with one entry per tweet, e.g. shape (m,) or (m, 1)
    Output:
        logprior: log(D_pos) - log(D_neg), where D_pos / D_neg are the numbers
                  of positive / negative training documents (a scalar)
        loglikelihood: dictionary mapping every vocabulary word to
                  log P(word | positive) - log P(word | negative), using
                  add-one (Laplace) smoothing

    Note: if one of the classes has no documents, numpy evaluates log(0) and
    the returned logprior is infinite.
    '''
    loglikelihood = {}

    # calculate V, the number of unique words in the vocabulary
    vocab = set(pair[0] for pair in freqs)
    V = len(vocab)

    # calculate N_pos and N_neg, the total word counts per class
    N_pos = N_neg = 0
    for pair, count in freqs.items():
        # a label greater than zero marks a positive pair, anything else a negative one
        if pair[1] > 0:
            N_pos += count
        else:
            N_neg += count

    # Calculate D, the number of documents. np.size / np.sum (instead of
    # len / builtin sum) count every label regardless of the array's shape
    # and keep logprior a scalar even for column-vector labels.
    D = np.size(train_y)

    # Calculate D_pos, the number of positive documents
    D_pos = np.sum(train_y)

    # Calculate D_neg, the number of negative documents
    D_neg = D - D_pos

    # Calculate logprior
    logprior = np.log(D_pos) - np.log(D_neg)

    # For each word in the vocabulary...
    for word in vocab:
        # get the positive and negative frequency of the word
        freq_pos = lookup(freqs, word, 1)
        freq_neg = lookup(freqs, word, 0)

        # smoothed probability of the word under each class; the +1 / +V
        # terms keep the ratio finite for words seen in only one class
        p_w_pos = (freq_pos + 1) / (N_pos + V)
        p_w_neg = (freq_neg + 1) / (N_neg + V)

        # calculate the log likelihood of the word
        loglikelihood[word] = np.log(p_w_pos) - np.log(p_w_neg)

    return logprior, loglikelihood


In [11]:
# Train on the training split, then print the log prior and the number of
# words that received a log-likelihood score.
logprior, loglikelihood = train_naive_bayes(freqs, train_x, train_y)
print(logprior)
print(len(loglikelihood))

0.0
9086


In [12]:
# Display the learned word -> log-likelihood mapping (the bare expression is
# rendered as the cell output).
# NOTE(review): this renders the entire dictionary; consider previewing only a
# few entries to keep the notebook output short.
loglikelihood

{'yoot': 0.698227772840216,
 'execut': -0.6880665882796748,
 'tricki': -0.6880665882796748,
 'ash': 0.698227772840216,
 'bigbang': -0.6880665882796748,
 'outfwith': -0.6880665882796748,
 'yeh': -0.6880665882796748,
 'wiw': 0.698227772840216,
 'agil': 0.698227772840216,
 '823': -0.6880665882796748,
 'academ': -0.6880665882796748,
 'imran': 0.005080592280270579,
 'mainli': 0.698227772840216,
 'mexico': 0.4105457003884343,
 'no-no': -0.6880665882796748,
 'goodi': -0.6880665882796748,
 'slr': 0.005080592280270579,
 'longgg': -0.6880665882796748,
 '28th': 0.698227772840216,
 'edm': 0.005080592280270579,
 '322': -0.6880665882796748,
 'shape': -0.6880665882796748,
 'deal': -1.0935316963878385,
 'ic': -0.6880665882796748,
 'death': 0.29276266473205226,
 'dearli': -0.6880665882796748,
 'toy': 0.4105457003884343,
 'int': -0.6880665882796748,
 '200.000': 0.698227772840216,
 'macci': -0.6880665882796748,
 't20blast': 0.005080592280270579,
 'loudest': 1.1036928809483797,
 'throat': -1.7866788769477

In [13]:
def naive_bayes_predict(tweet, logprior, loglikelihood):
    '''
    Score a tweet with the trained Naive Bayes parameters.

    Input:
        tweet: a string
        logprior: a number
        loglikelihood: a dictionary of words mapping to numbers
    Output:
        p: logprior plus the log-likelihood of every processed word of the
           tweet that is present in the dictionary; words that are not in
           the dictionary contribute nothing

    '''
    # start from the log prior
    score = 0 + logprior

    # walk over the processed words of the tweet, in order
    for token in process_tweet(tweet):

        # words without a learned log-likelihood are skipped
        if token not in loglikelihood:
            continue

        # accumulate the log-likelihood of a known word
        score += loglikelihood[token]

    return score


In [18]:
# Experiment with your own tweet.
my_tweet = 'This place is wonderful. HEHEHE'
# Score the tweet with the trained parameters and print the resulting value.
p = naive_bayes_predict(my_tweet, logprior, loglikelihood)
print('The expected output is', p)

The expected output is 2.1324236465034954


#### Implementing test_naive_bayes

* Implement `test_naive_bayes` to check the accuracy of your predictions.
* The function takes in your `test_x`, `test_y`, `logprior`, and `loglikelihood`.
* It returns the accuracy of your model.
* First, use the `naive_bayes_predict` function to make predictions for each tweet in `test_x`.

In [19]:
def test_naive_bayes(test_x, test_y, logprior, loglikelihood):
    accuracy = 0  
    value = 0
    y_hats = []
    for tweet in test_x:
        # if the prediction is > 0
        if naive_bayes_predict(tweet, logprior, loglikelihood) > 0:
            # the predicted class is 1
            y_hat_i = 1
        else:
            # otherwise the predicted class is 0
            y_hat_i = 0

        # append the predicted class to the list y_hats
        y_hats.append(y_hat_i)
    
    y_hats = np.array(y_hats)
    test_y = np.squeeze(test_y)
    # error is the average of the absolute values of the differences between y_hats and test_y
    for i in range(len(y_hats)):
        error = np.abs(y_hats[i]-test_y[i])/len(y_hats)
        value = value + error

    # Accuracy is 1 minus the error
    accuracy = 1-value
    return accuracy


In [20]:
# Evaluate the trained parameters on test_x / test_y and print the accuracy
# with four decimal places.
print("Naive Bayes accuracy = %0.4f" %
      (test_naive_bayes(test_x, test_y, logprior, loglikelihood)))

Naive Bayes accuracy = 0.9940


In [22]:
# Feel free to check the sentiment of your own tweet below
my_tweet = 'you are bad :('
# The bare call is the cell's last expression, so the returned score is
# displayed. test_naive_bayes treats a score above 0 as class 1 and anything
# else as class 0.
naive_bayes_predict(my_tweet, logprior, loglikelihood)

-8.802285344803796

In [25]:
# Test with your own tweet - feel free to modify `my_tweet_1` / `my_tweet_2`
my_tweet_1 = 'I am happy because I am learning :)'
my_tweet_2 = '@AndrewYNG courses are awesome. The way he teaches math concepts are super simple to digest'
# NOTE(review): only my_tweet_1 is scored in this cell; my_tweet_2 is defined
# but never passed to naive_bayes_predict here.
p = naive_bayes_predict(my_tweet_1, logprior, loglikelihood)
print(p)

9.573774904705935
