# Tweet Sentiment Classification

Build and train models to classify tweet sentiment as positive or negative using `Logistic Regression` and `Naïve Bayes` classifiers

In [192]:
%load_ext autoreload
%autoreload 2

import itertools
import numpy as np
import nltk
from nltk.corpus import twitter_samples
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import utils

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Download and explore data

In [193]:
# Download twitter samples and unzip 
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

In [194]:
# Load postive and negative tweets from JSON to list
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

In [195]:
len(positive_tweets), len(negative_tweets)

(5000, 5000)

In [196]:
# View few samples tweets
print('Positive Tweets...\n-------------------')
for i, tweet in enumerate(positive_tweets[:5]): print(f'Tweet {i+1}: ', tweet)
print('\nNegative Tweets...\n-------------------')
for i, tweet in enumerate(negative_tweets[:5]): print(f'Tweet {i+1}: ', tweet)

Positive Tweets...
-------------------
Tweet 1:  #FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)
Tweet 2:  @Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!
Tweet 3:  @DespiteOfficial we had a listen last night :) As You Bleed is an amazing track. When are you in Scotland?!
Tweet 4:  @97sides CONGRATS :)
Tweet 5:  yeaaaah yippppy!!!  my accnt verified rqst has succeed got a blue tick mark on my fb profile :) in 15 days

Negative Tweets...
-------------------
Tweet 1:  hopeless for tmr :(
Tweet 2:  Everything in the kids section of IKEA is so cute. Shame I'm nearly 19 in 2 months :(
Tweet 3:  @Hegelbon That heart sliding into the waste basket. :(
Tweet 4:  “@ketchBurning: I hate Japanese call him "bani" :( :(”

Me too
Tweet 5:  Dang starting next week I have "work" :(


## Train Test Split

In [197]:
# 80% train (4000 tweets) and 20% test (1000 tweets)
train_pos_tweets = positive_tweets[:4000]
test_pos_tweets = positive_tweets[4000:]
train_neg_tweets = negative_tweets[:4000]
test_neg_tweets = negative_tweets[4000:]

## Preprocess data
Tweets needs to be cleaned and encoded to make them ready for modeling

In [198]:
# Load required meta data for utils routines
utils.init()

In [199]:
# Loop through all training tweets, clean and tokenize them
train_pos_tweets = [ utils.clean_and_tokenize_tweet(tweet) 
                     for tweet in train_pos_tweets
                   ]
train_neg_tweets = [ utils.clean_and_tokenize_tweet(tweet) 
                     for tweet in train_neg_tweets
                   ]

# Loop through all test tweets, clean and tokenize them
test_pos_tweets = [ utils.clean_and_tokenize_tweet(tweet) 
                     for tweet in test_pos_tweets
                   ]
test_neg_tweets = [ utils.clean_and_tokenize_tweet(tweet) 
                     for tweet in test_neg_tweets
                   ]
# Random shuffle
np.random.seed(42)
np.random.shuffle(train_pos_tweets)
np.random.shuffle(train_neg_tweets)
np.random.shuffle(test_pos_tweets)
np.random.shuffle(test_neg_tweets)

### Encode tweets and extract features

In [200]:
# 1) Build frequency dictionary using training set...
pos_freq_dict = utils.build_freq_dict(train_pos_tweets) 
neg_freq_dict = utils.build_freq_dict(train_neg_tweets)

# Store frequencies for latter use
utils.meta_data['pos_freq_dict'] = pos_freq_dict
utils.meta_data['neg_freq_dict'] = neg_freq_dict

In [201]:
# 2) Encode each tweet in corpus as features [bias (always 1), 
#    sum-of-positive-freq, sum-of-positive-freq] then build a matrix 
#    containing all encoded tweets and a vector containing corresponding 
#    labels...

# process training positive tweets
X1, y1 = utils.build_feature_matrix(train_pos_tweets, 1, 
                                    pos_freq_dict, neg_freq_dict)
# process training negative tweets
X2, y2 = utils.build_feature_matrix(train_neg_tweets, 0, 
                              pos_freq_dict, neg_freq_dict)
# Combine both training matrices to make a single training dataset
X_train = np.vstack((X1, X2))
y_train = np.vstack((y1, y2)).squeeze()

# process test positive tweets
X1, y1 = utils.build_feature_matrix(test_pos_tweets, 1, 
                                    pos_freq_dict, neg_freq_dict)
# process test negative tweets
X2, y2 = utils.build_feature_matrix(test_neg_tweets, 0, 
                              pos_freq_dict, neg_freq_dict)
# Combine both test matrices to make a single test dataset
X_test = np.vstack((X1, X2))
y_test = np.vstack((y1, y2)).squeeze()

In [202]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((8000, 3), (8000,), (2000, 3), (2000,))

In [203]:
print(X_train[:5])
print()
print(y_train[:5])

[[1.000e+00 3.904e+03 5.660e+02]
 [1.000e+00 8.640e+02 1.730e+02]
 [1.000e+00 4.036e+03 4.110e+02]
 [1.000e+00 3.120e+03 1.300e+02]
 [1.000e+00 3.038e+03 8.400e+01]]

[1. 1. 1. 1. 1.]


In [204]:
print(X_test[:5])
print()
print(y_test[:5])

[[1.000e+00 7.460e+02 1.360e+02]
 [1.000e+00 3.328e+03 3.320e+02]
 [1.000e+00 1.248e+03 3.400e+01]
 [1.000e+00 3.659e+03 5.910e+02]
 [1.000e+00 5.770e+02 2.200e+01]]

[1. 1. 1. 1. 1.]


## Logistics Regression

TODO: Brief about logistic regression

In [205]:
# Train logistic regression on training-set
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

LogisticRegression()

### Evaluate logistic regression

In [206]:
# Predict on test-set and view probabilities
pred_proba = lr_model.predict_proba(X_test)
print(lr_model.classes_)
print()
print(pred_proba[:10])

[0. 1.]

[[4.41640662e-03 9.95583593e-01]
 [1.52036161e-11 1.00000000e+00]
 [2.70140879e-05 9.99972986e-01]
 [1.11572973e-11 1.00000000e+00]
 [6.12470933e-03 9.93875291e-01]
 [2.25344147e-02 9.77465585e-01]
 [6.16168203e-03 9.93838318e-01]
 [2.88291613e-11 1.00000000e+00]
 [4.49604798e-11 1.00000000e+00]
 [1.23192567e-11 1.00000000e+00]]


Model's predict_proba method outputs probabilities of negative class (0) and positive class (1) 

In [207]:
# Predict on test-set and view predicted labels
pred_labels = lr_model.predict(X_test)
pred_labels[:10]

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

Model's predict method outputs predicted lables of each data point by looking at probabilities of postive class (1) with a default threshold of 0.5

In [208]:
# Measure performance as the overall accuracy of prediction
accuracy_score(y_test, pred_labels)

0.994

## Naive Bayes

TODO: Brief about Naive Bayes with formula and explanation

In [209]:
class NaiveBayesClassifier():
    def __init__(self, pos_freq_dict, neg_freq_dict):
        self.pos_freq_dict = pos_freq_dict
        self.neg_freq_dict = neg_freq_dict
        self.log_prior = 0
        self.logliklyhood_dict = {}
        
    def fit(self, train_tweets, y_train):
        # Compute prior and logprior...
        # Calculate total positive and negative tweets 
        D_pos = len(y_train[y_train == 1.0])
        D_neg = len(y_train[y_train == 0.0])
        D = D_pos + D_neg
        # Calculate probability of tweet being positive or negative
        P_pos = D_pos/D
        P_neg = D_neg/D
        # Calculate prior as ratio of positive and negative probabilities
        prior = P_pos/P_neg
        # Calculate the log prior
        self.log_prior = np.log(prior)
        # Compute liklyhood and logliklyhood for each word in our vocabulary...
        # Calculate number of unique words, this will be used in smoothing the probability
        # to make sure we do not get any zero probability value
        vocab = set(train_tweets)
        N_unique = len(vocab)
        # Claculate total positive and negative frequency
        N_pos = sum(pos_freq_dict.values())
        N_neg = sum(neg_freq_dict.values())
        # Calculate logliklyhood for each word in the vocab
        logliklyhood_dict = {}
        for word in vocab:
            pos_freq = pos_freq_dict.get(word, 0)
            neg_freq = neg_freq_dict.get(word, 0)
            pos_smooth_proba = (pos_freq + 1)/(N_pos + N_unique)    
            neg_smooth_proba = (neg_freq + 1)/(N_neg + N_unique)
            liklyhood = pos_smooth_proba/neg_smooth_proba
            logliklyhood = np.log(liklyhood)
            self.logliklyhood_dict[word] = logliklyhood
        
        
    def predict(self, X_test):
        y_pred = []
        for tweet in X_test:
            # Initial value is log prior
            log_liklyhood = self.log_prior
            for word in tweet:
                # sumup the log liklyhood of the each word
                log_liklyhood += self.logliklyhood_dict.get(word, 0)
            score = 1.0 if log_liklyhood > 0 else 0.0
            y_pred.append(score)
        return np.array(y_pred)

In [210]:
# For Naive Bayes training we need actual tweet tokens and not the training feature matrix (X_train)
# flatten the processed positive and negative tweets and combine them together
train_tweet_tokens = list(itertools.chain(*train_pos_tweets)) + list(itertools.chain(*train_neg_tweets))
# Train Naive Bayes model on training-set
nb_model = NaiveBayesClassifier(pos_freq_dict, neg_freq_dict)
nb_model.fit(train_tweet_tokens, y_train)

### Evaluate Naive Bayes

In [211]:
# For Naive Bayes testing we need actual tweet tokens and not the testing feature matrix (X_test)
test_tweets = test_pos_tweets + test_neg_tweets
# Predict on test-set and view predicted labels
pred_labels = nb_model.predict(test_tweets)

In [212]:
# Measure performance as the overall accuracy of prediction
accuracy_score(y_test, pred_labels)

0.9955

## Test models

In [213]:
# A madeup raw tweets
positive_test_tweet = '''
                     the movie http://abc_movie.com is a gem of movies
                     really liked it a lot!!! @abcmovie #awesome
                     '''
negative_test_tweet = '''
                     @_sssingh #bad this movie http://xyzmovie.com 
                     has to rank one of the worst in history of man kind
                     '''

### Logistic Regression

In [214]:
# Predict sentiment using logistic regression
print('Logistic Regression...')
print(negative_test_tweet)
sentiment = utils.get_sentiment(positive_test_tweet, lr_model)
print(f'Above tweet is {sentiment}')
print(negative_test_tweet)
sentiment = utils.get_sentiment(negative_test_tweet, lr_model)
print(f'Above tweet is {sentiment}')

Logistic Regression...

                     @_sssingh #bad this movie http://xyzmovie.com 
                     has to rank one of the worst in history of man kind
                     
Above tweet is POSITIVE

                     @_sssingh #bad this movie http://xyzmovie.com 
                     has to rank one of the worst in history of man kind
                     
Above tweet is NEGATIVE


### Naive Bayes

In [215]:
# Predict sentiment using Naive Bayes
print('Naive Bayes...')
print(positive_test_tweet)
sentiment = utils.get_sentiment(positive_test_tweet, nb_model, encode_tweet=False)
print(f'Above tweet is {sentiment}')
print(negative_test_tweet)
sentiment = utils.get_sentiment(negative_test_tweet, nb_model, encode_tweet=False)
print(f'Above tweet is {sentiment}')

Naive Bayes...

                     the movie http://abc_movie.com is a gem of movies
                     really liked it a lot!!! @abcmovie #awesome
                     
Above tweet is POSITIVE

                     @_sssingh #bad this movie http://xyzmovie.com 
                     has to rank one of the worst in history of man kind
                     
Above tweet is NEGATIVE
