<a href="https://colab.research.google.com/github/waqqasansari/Machine_Learning_Probs/blob/master/Naive_Bayes_Scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install pdp

Collecting pdp
  Downloading https://files.pythonhosted.org/packages/e4/f9/5e4886980fd2a86013055142f9f3c9f94d3205495a0603a55d5eae32ac9d/pdp-0.3.0.tar.gz
Building wheels for collected packages: pdp
  Building wheel for pdp (setup.py) ... [?25l[?25hdone
  Created wheel for pdp: filename=pdp-0.3.0-cp36-none-any.whl size=6603 sha256=99339de78856319006e99cd8be104fbe92f9d5d1a163baef4dc3ee75ce160032
  Stored in directory: /root/.cache/pip/wheels/84/fb/5e/afb783110614b3c1a4187e6f83e4f4ea0088fbdb82921013a2
Successfully built pdp
Installing collected packages: pdp
Successfully installed pdp-0.3.0


In [None]:
import pdp
from nltk.corpus import stopwords, twitter_samples
import numpy as np
import pandas as pd
import nltk
import string
from nltk.tokenize import TweetTokenizer

In [None]:
# Fetch the NLTK data packages used below: the stop-word lists (read by
# process_tweet) and the twitter_samples corpus that supplies the positive and
# negative tweet files.
nltk.download('stopwords')
nltk.download('twitter_samples')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.


True

In [None]:
# Number of tweets per class used for training; everything after this index in
# each class is held out for testing. Change it here to move the split point.
TRAIN_SIZE_PER_CLASS = 4000

# get the sets of positive and negative tweets
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

# split the data into two pieces, one for training and one for testing (validation set)
train_pos = all_positive_tweets[:TRAIN_SIZE_PER_CLASS]
test_pos = all_positive_tweets[TRAIN_SIZE_PER_CLASS:]
train_neg = all_negative_tweets[:TRAIN_SIZE_PER_CLASS]
test_neg = all_negative_tweets[TRAIN_SIZE_PER_CLASS:]

# positives first, then negatives -- the label vectors below rely on this order
train_x = train_pos + train_neg
test_x = test_pos + test_neg

# labels: 1 for every positive tweet, 0 for every negative tweet; sized from the
# actual splits to avoid assumptions about the length of all_positive_tweets
train_y = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg)))
test_y = np.append(np.ones(len(test_pos)), np.zeros(len(test_neg)))

In [None]:
# Sanity check: number of training labels, then the test label vector
# (positives come first, negatives last).
print(len(train_y), test_y, sep='\n')

8000
[1. 1. 1. ... 0. 0. 0.]


In [None]:
import re
import string
import numpy as np

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer


def process_tweet(tweet):
    """Clean, tokenize and stem a single tweet.

    The tweet is stripped of stock tickers, a leading "RT" marker, hyperlinks
    and '#' signs, then tokenized; stop words and punctuation tokens are
    dropped and every remaining token is stemmed.

    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet

    """
    stemmer = PorterStemmer()
    # A set makes the per-token stop-word test constant-time instead of a
    # linear scan over the whole list.
    stopwords_english = set(stopwords.words('english'))
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks: match only the URL itself (up to the next whitespace)
    # so that any text following the link on the same line is kept
    tweet = re.sub(r'https?://\S*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    # Keep tokens that are neither stop words nor punctuation, stemming each.
    # `word not in string.punctuation` is a substring test against the
    # punctuation string.
    tweets_clean = [
        stemmer.stem(word)
        for word in tweet_tokens
        if word not in stopwords_english and word not in string.punctuation
    ]

    return tweets_clean

In [None]:
def count_tweets(result, tweets, ys):
    '''Tally how often each (word, label) pair occurs across the given tweets.

    Input:
        result: a dictionary to accumulate into; it is updated in place
        tweets: a list of tweets
        ys: a list with the sentiment of each tweet (either 0 or 1), aligned
            position-by-position with tweets
    Output:
        result: the same dictionary, mapping each (word, label) pair to its
            frequency
    '''
    for label, text in zip(ys, tweets):
        for token in process_tweet(text):
            key = (token, label)
            # unseen pairs start from 0, so the first sighting stores 1
            result[key] = result.get(key, 0) + 1
    return result

In [None]:
# Smoke test for count_tweets on a tiny hand-made corpus.
# The cell's displayed value is the returned dictionary, keyed by
# (stemmed word, label); 'i' and 'am' are absent from the saved result.
result = {}
tweets = ['i am happy', 'i am tricked', 'i am sad', 'i am tired', 'i am tired']
ys = [1, 0, 0, 0, 0]
count_tweets(result, tweets, ys)

{('happi', 1): 1, ('sad', 0): 1, ('tire', 0): 2, ('trick', 0): 1}

In [None]:
# Build the (word, label) -> count dictionary from the training split; it is
# passed to train_naive_bayes later in the notebook.
freqs = count_tweets({}, train_x, train_y)

In [None]:
def lookup(freqs, word, label):
    '''Return how many times a word was counted under a given label.

    Input:
        freqs: a dictionary with the frequency of each (word, label) pair
        word: the word to look up
        label: the label corresponding to the word
    Output:
        n: the stored count for (word, label), or 0 when the pair is absent
    '''
    return freqs.get((word, label), 0)

In [None]:
def train_naive_bayes(freqs, train_x, train_y):
    '''Fit a two-class Naive Bayes model with add-one smoothing.

    Input:
        freqs: a dictionary mapping (word, label) pairs to their frequency
        train_x: a list of tweets (not read here; kept so existing calls
            keep working)
        train_y: the label of each training tweet; values > 0 count as
            positive, everything else as negative
    Output:
        logprior: log(number of positive tweets) - log(number of negative tweets)
        loglikelihood: a dictionary mapping each vocabulary word to
            log(P(word | positive) / P(word | negative))
    '''
    # Vocabulary = every distinct word that appears in a (word, label) key.
    vocab = {pair[0] for pair in freqs}
    V = len(vocab)

    # Total word occurrences per class.
    N_pos = N_neg = 0
    for pair, count in freqs.items():
        if pair[1] > 0:
            N_pos += count
        else:
            N_neg += count

    # Number of tweets per class.
    D_pos = sum(1 for y in train_y if y > 0)
    D_neg = sum(1 for y in train_y if y <= 0)

    logprior = np.log(D_pos) - np.log(D_neg)

    loglikelihood = {}
    for word in vocab:
        # Same result as lookup(freqs, word, label): a missing pair counts as 0.
        freq_pos = freqs.get((word, 1), 0)
        freq_neg = freqs.get((word, 0), 0)

        # Add-one smoothing keeps both probabilities non-zero, so the ratio
        # and its log are always finite.
        p_w_pos = (freq_pos + 1) / (N_pos + V)
        p_w_neg = (freq_neg + 1) / (N_neg + V)

        loglikelihood[word] = np.log(p_w_pos / p_w_neg)

    return logprior, loglikelihood

In [None]:
# Train on the training split, then report the log prior and the number of
# words that received a log-likelihood score.
logprior, loglikelihood = train_naive_bayes(freqs, train_x, train_y)
print(logprior, len(loglikelihood), sep='\n')

0.0
9089


In [None]:
def naive_bayes_predict(tweet, logprior, loglikelihood):
    '''Score a tweet with a trained Naive Bayes model.

    Input:
        tweet: a string
        logprior: a number
        loglikelihood: a dictionary of words mapping to numbers
    Output:
        p: logprior plus the log-likelihood of every processed word of the
           tweet that is present in the dictionary; words not in the
           dictionary contribute nothing
    '''
    score = 0
    score += logprior

    # Only words known to the model contribute to the score.
    known_scores = (
        loglikelihood[token]
        for token in process_tweet(tweet)
        if token in loglikelihood
    )
    for contribution in known_scores:
        score += contribution

    return score

In [None]:
# Experiment with your own tweet.
# The printed number is the model's score for the tweet: test_naive_bayes
# treats a score above 0 as label 1 (positive) and anything else as label 0.
my_tweet = 'She smiled.'
p = naive_bayes_predict(my_tweet, logprior, loglikelihood)
print('The expected output is', p)

The expected output is 1.5740278623499175


In [None]:
def test_naive_bayes(test_x, test_y, logprior, loglikelihood):
    '''Compute the accuracy of the Naive Bayes classifier on labelled tweets.

    Input:
        test_x: a list of tweets
        test_y: the true label (0 or 1) of each tweet; may be a list, a 1-D
            array or an (m, 1) column vector
        logprior: the log prior returned by train_naive_bayes
        loglikelihood: the word -> log-likelihood dictionary returned by
            train_naive_bayes
    Output:
        accuracy: fraction of tweets whose predicted label equals the true one
    Raises:
        ValueError: if test_x and test_y do not hold the same number of items
    '''
    # A score above 0 is predicted as positive (1), otherwise negative (0).
    y_hats = np.asarray([
        1 if naive_bayes_predict(tweet, logprior, loglikelihood) > 0 else 0
        for tweet in test_x
    ])

    # Flatten the labels so a plain list works and an (m, 1) column vector
    # does not broadcast against the predictions into an (m, m) matrix.
    y_true = np.asarray(test_y).ravel()
    if y_hats.size != y_true.size:
        raise ValueError(
            "test_x and test_y must have the same length, got %d and %d"
            % (y_hats.size, y_true.size))

    # With 0/1 labels the mean absolute difference is the error rate.
    error = np.mean(np.absolute(y_hats - y_true))

    return 1 - error

In [None]:
# Evaluate on the test split and report the accuracy to four decimals.
nb_accuracy = test_naive_bayes(test_x, test_y, logprior, loglikelihood)
print("Naive Bayes accuracy = %0.4f" % nb_accuracy)

Naive Bayes accuracy = 0.9940
