In [11]:
import os
import re
import nltk
import string
import numpy as np
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from nltk.corpus import twitter_samples 

nltk.download('twitter_samples')
nltk.download('stopwords')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     /home/vikash/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to /home/vikash/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
positive_tweet_list = twitter_samples.strings('positive_tweets.json')
negative_tweet_list = twitter_samples.strings('negative_tweets.json')

print("All positive tweets length: ", len(positive_tweet_list))
print("All negative tweets length: ", len(negative_tweet_list))

All positive tweets length:  5000
All negative tweets length:  5000


In [13]:
# preprocess tweet
stemmer = PorterStemmer()
tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)

stopwords_eng = stopwords.words("english")
punctuations_eng = string.punctuation

def preprocess_tweet(tweet):
    # remove stock market tickers like $GE
    tweet = re.sub(r"\$\w*", "", tweet)
    
    # remove old style retweet text "RT"
    tweet = re.sub(r"^RT[\s]+", "", tweet)
    
    # remove hyperlinks
    tweet = re.sub(r"https?:\/\/[.a-zA-Z\/-]*[\r\n]*", "", tweet)
    
    # remove the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    
    tweet_tokens = tokenizer.tokenize(tweet)
    
    tweet_tokens_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_eng and word not in punctuations_eng):
            stemmed_word = stemmer.stem(word)
            
            tweet_tokens_clean.append(stemmed_word)
    
    return tweet_tokens_clean

In [41]:
# get word frequencies

def generate_words_frequency(tweet_list, vocab):
    frequency_dict = {}
    for tweet in tweet_list:
        tweet_tokens = preprocess_tweet(tweet)
        for word in tweet_tokens:
            if word not in frequency_dict:
                frequency_dict[word] = 1
            else:
                frequency_dict[word] += 1
                
    for word in vocab:
        if word not in frequency_dict:
            frequency_dict[word] = 0
                
    return frequency_dict

In [42]:
train_pos, test_pos = positive_tweet_list[:4000], positive_tweet_list[4000:]
train_neg, test_neg = negative_tweet_list[:4000], negative_tweet_list[4000:]

vocab = set()
train_x = train_pos + train_neg
for tweet in train_x:
    tweet_tokens = preprocess_tweet(tweet)
    for word in tweet_tokens:
        if word not in vocab:
            vocab.add(word)
            
vocab = list(vocab)
pos_frequency_dict = generate_words_frequency(train_pos, vocab)
neg_frequency_dict = generate_words_frequency(train_neg, vocab)

In [43]:
# no of unique word in the vocab
V = len(vocab)

# calculate N_class for both positive and negative classes
N_pos = np.sum([v for k, v in pos_frequency_dict.items()])
N_neg = np.sum([v for k, v in neg_frequency_dict.items()])

In [44]:
# Laplacian smoothing
prob_w_given_pos_dict = {k: (v + 1)/(N_pos + V) for k, v in pos_frequency_dict.items()}
prob_w_given_neg_dict = {k: (v + 1)/(N_pos + V) for k, v in neg_frequency_dict.items()}

In [45]:
# calculate lambda
lambda_dict = {k: np.log(v/prob_w_given_neg_dict.get(k)) for k, v in prob_w_given_pos_dict.items()}

In [46]:
# log_prior is log(#positive_tweets / #negative_tweets)
log_prior = np.log(len(train_pos)/len(train_neg))

In [47]:
test_y = np.append(np.ones(len(test_pos)), np.zeros(len(test_neg))).reshape(len(test_pos)+len(test_neg), -1)

In [50]:
def evaluate_performence(test_x, test_y, lambda_dict, log_prior):
    preds = np.zeros((len(test_x), 1))
    for idx, tweet in enumerate(test_x):
        tweet_tokens = preprocess_tweet(tweet)
        res = log_prior
        for word in tweet_tokens:
            if word in lambda_dict:
                res += lambda_dict[word]
        
        if res > 0:
            preds[idx, 0] = 1
        
            
    correct = np.sum(test_y == preds)
    
    return correct * 100/len(test_x)
            
evaluate_performence(test_pos + test_neg, test_y, lambda_dict, log_prior)      

99.55

In [52]:
def predict(tweet, lambda_dict, log_prior):
    tweet_tokens = preprocess_tweet(tweet)
    res = log_prior
    for word in tweet_tokens:
        if word in lambda_dict:
            res += lambda_dict[word]
                
    if res > 0:
        print("Positive Sentiment")
    else:
        print("Negative Sentiment")

In [54]:
predict("I am happy", lambda_dict, log_prior)

Positive Sentiment


In [55]:
predict("this movie should have been great", lambda_dict, log_prior)

Positive Sentiment


In [56]:
predict("this movie is really bad", lambda_dict, log_prior)

Negative Sentiment
