## Logistic Regression and Naive Bayes for NLP analysis

In [1]:
import nltk
nltk.download('twitter_samples')
nltk.download('stopwords')
import re
import string
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from os import getcwd
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from nltk.corpus import twitter_samples 

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\Sealion\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sealion\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def process_tweet(tweet):
    """Clean, tokenize and stem a single tweet.

    Steps, in order:
      1. strip "$" followed by word characters, a leading "RT" marker,
         hyperlinks and "#" signs with regular expressions;
      2. tokenize with ``TweetTokenizer`` (lower-casing, handle stripping and
         length reduction are switched on);
      3. drop English stopwords and punctuation tokens;
      4. stem every remaining token with ``PorterStemmer``.

    Args:
        tweet: the raw tweet text (a string).

    Returns:
        A list of stemmed tokens, in their original order.
    """
    # A set makes the per-token stopword test O(1); the NLTK corpus reader
    # returns a list, which would otherwise be scanned for every token.
    stopwords_english = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    tweet = re.sub(r'\$\w*', '', tweet)
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # NOTE: ".*" is greedy, so everything from the first URL to the end of
    # that line is removed, not just the URL itself.
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    tweet = re.sub(r'#', '', tweet)

    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    # `word not in string.punctuation` is a substring test against the
    # punctuation string; it is kept as-is so the filtering stays unchanged.
    tweets_clean = [
        stemmer.stem(word)
        for word in tokenizer.tokenize(tweet)
        if word not in stopwords_english and word not in string.punctuation
    ]
    return tweets_clean

In [3]:
def build_freqs(tweets, ys):
    """Count how often each processed word occurs under each label.

    Args:
        tweets: an iterable of raw tweet strings.
        ys: the labels, one per tweet; any array-like is accepted (for
            example an (m, 1) NumPy array or a flat list).

    Returns:
        A dict mapping ``(word, label)`` to the number of occurrences of
        ``word`` (as produced by ``process_tweet``) in tweets with that label.
    """
    # np.ravel always yields a 1-D array, so tolist() returns a list even when
    # there is a single label (np.squeeze would collapse that case to a bare
    # scalar, which zip() cannot iterate).
    yslist = np.ravel(ys).tolist()
    freqs = {}
    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            freqs[pair] = freqs.get(pair, 0) + 1
    return freqs

In [4]:
# Load both labelled tweet corpora from the NLTK twitter_samples package.
all_positive_tweets, all_negative_tweets = (
    twitter_samples.strings(corpus_file)
    for corpus_file in ('positive_tweets.json', 'negative_tweets.json')
)
print(len(all_positive_tweets), len(all_negative_tweets))

5000 5000


In [5]:
# The first 3500 tweets of each class are used for training, the rest for testing.
train_pos, test_pos = all_positive_tweets[:3500], all_positive_tweets[3500:]
train_neg, test_neg = all_negative_tweets[:3500], all_negative_tweets[3500:]
train_x = train_pos + train_neg
test_x = test_pos + test_neg

# Column-vector labels: ones for the positive tweets stacked on zeros for the negative ones.
train_y = np.vstack((np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1))))
test_y = np.vstack((np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1))))
print(f"train_y.shape = {train_y.shape}")
print(f"test_y.shape = {test_y.shape}")

# (word, label) -> count table built from the training tweets only.
freqs = build_freqs(train_x, train_y)

train_y.shape = (7000, 1)
test_y.shape = (3000, 1)


## (1). Logistic Regression model

In [6]:
def sigmoid(z):
    """Return the logistic function 1 / (1 + e^(-z)) of ``z``.

    ``z`` may be a scalar or a NumPy array; arrays are handled element-wise
    by ``np.exp``.
    """
    denominator = 1 + np.exp(-z)
    return 1 / denominator

In [7]:
def gradientDescent(x, y, theta, alpha, num_iters):
    """Fit logistic-regression weights with batch gradient descent.

    Args:
        x: feature matrix with one row per example (``x.shape[0]`` is m).
        y: labels, one per row of ``x`` (presumably an (m, 1) column vector,
            matching how the notebook builds ``train_y``).
        theta: initial weights, shaped so that ``np.dot(x, theta)`` is valid.
        alpha: learning rate.
        num_iters: number of update steps to run.

    Returns:
        ``(J, theta)``: ``J`` is the cost as a Python float, evaluated with
        the weights in effect just before the final update (or with the
        initial weights when no update runs); ``theta`` holds the weights
        after all updates.
    """
    m = x.shape[0]

    def cost(h):
        # Cross-entropy cost of predictions h, averaged over the m examples.
        return -1. / m * (np.dot(y.transpose(), np.log(h))
                          + np.dot((1 - y).transpose(), np.log(1 - h)))

    J = None
    for _ in range(num_iters):
        h = sigmoid(np.dot(x, theta))
        J = cost(h)
        # Step against the gradient of the cost with respect to theta.
        theta = theta - (alpha / m) * np.dot(x.transpose(), (h - y))

    if J is None:
        # No iteration ran: report the cost of the untouched initial weights
        # instead of failing on an unbound variable.
        J = cost(sigmoid(np.dot(x, theta)))

    # Squeeze to a 0-d array first; calling float() directly on an array with
    # ndim > 0 is deprecated in recent NumPy releases.
    J = float(np.squeeze(J))
    return J, theta

In [8]:
def extract_features(tweet, freqs):
    """Turn one tweet into a (1, 3) feature row.

    The row is ``[1, pos, neg]``: a constant bias term, then the summed
    counts from ``freqs`` of the tweet's processed words under label 1 and
    under label 0. Words missing from ``freqs`` contribute zero.
    """
    features = np.zeros((1, 3))
    features[0, 0] = 1  # bias term
    for token in process_tweet(tweet):
        # Add this token's positive and negative counts in a single step.
        features[0, 1:] += (freqs.get((token, 1.0), 0), freqs.get((token, 0.0), 0))
    assert features.shape == (1, 3)
    return features

In [11]:
# Design matrix: one [bias, positive count, negative count] row per training tweet.
X = np.zeros((len(train_x), 3))
for row, tweet in enumerate(train_x):
    X[row, :] = extract_features(tweet, freqs)
Y = train_y

# Run gradient descent from all-zero weights.
J, theta = gradientDescent(X, Y, np.zeros((3, 1)), 1e-9, 1000)
print(f"The cost after training is {J:.8f}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(theta)]}")

The cost after training is 0.33243085.
The resulting vector of weights is [4e-08, 0.00037801, -0.00044195]


In [12]:
def predict_tweet(tweet, freqs, theta):
    """Return the sigmoid of the tweet's feature row times ``theta``.

    The features come from ``extract_features(tweet, freqs)``; the result is
    whatever ``sigmoid`` returns for ``np.dot(features, theta)``.
    """
    features = extract_features(tweet, freqs)
    logit = np.dot(features, theta)
    return sigmoid(logit)

In [13]:
def test_logistic_regression(test_x, test_y, freqs, theta):
    """Return the fraction of tweets in ``test_x`` that are classified correctly.

    A tweet is predicted as class 1 when ``predict_tweet`` returns a value
    above 0.5 and as class 0 otherwise; predictions are compared with the
    squeezed ``test_y``.
    """
    y_hat = [
        1 if predict_tweet(tweet, freqs, theta) > 0.5 else 0
        for tweet in test_x
    ]
    # list == ndarray defers to NumPy and compares element-wise.
    matches = y_hat == np.squeeze(test_y)
    return matches.sum() / len(test_x)

In [14]:
# Score the trained weights on the held-out test tweets and report the accuracy.
tmp_accuracy = test_logistic_regression(test_x, test_y, freqs, theta)
print(f"Logistic regression's accuracy = {tmp_accuracy:.4f}")

Logistic regression's accuracy = 0.9910


In [16]:
# Error analysis: print every test tweet the logistic-regression model misclassifies.
print('Label Predicted Tweet')
for x, y in zip(test_x, test_y):
    y_hat = predict_tweet(x, freqs, theta)
    # Extract plain Python floats from the one-element arrays: formatting an
    # array with ndim > 0 through %d / %f relies on an implicit scalar
    # conversion that recent NumPy releases deprecate.
    label = float(np.squeeze(y))
    prob = float(np.squeeze(y_hat))
    if label != (prob > 0.5):
        # Process the tweet once and reuse the tokens for both printouts.
        processed = process_tweet(x)
        print('THE TWEET IS:', x)
        print('THE PROCESSED TWEET IS:', processed)
        print('%d\t%0.8f\t%s' % (label, prob, ' '.join(processed).encode('ascii', 'ignore')))

Label Predicted Tweet
THE TWEET IS: Dare I say that there is a video of me on our About Page - http://t.co/1nXM8mxmbu - talking infrared heating :-)
THE PROCESSED TWEET IS: ['dare', 'say', 'video', 'page']
1	0.49932441	b'dare say video page'
THE TWEET IS: @Th0tiana_ fruity vodkas always help : )
THE PROCESSED TWEET IS: ['fruiti', 'vodka', 'alway', 'help']
1	0.49835917	b'fruiti vodka alway help'
THE TWEET IS: Where's the time going?! ONLY 40 days to go &amp; sooo much to do!! Not enough days in the week :p #goodmorning http://t.co/4NPwOGr9QL
THE PROCESSED TWEET IS: ["where'", 'time', 'go', '40', 'day', 'go', 'sooo', 'much', 'enough', 'day', 'week', ':p', 'goodmorn']
1	0.49456871	b"where' time go 40 day go sooo much enough day week :p goodmorn"
THE TWEET IS: I ATE YOUR LAST COOKIE SHIR0 &gt;:D
THE PROCESSED TWEET IS: ['ate', 'last', 'cooki', 'shir', '0', '>:d']
1	0.49718284	b'ate last cooki shir 0 >:d'
THE TWEET IS: @hesaffection are you the owner of the user @hesIovely ? : )
THE PROCESS

## (2). Naive Bayes model

In [17]:
# Reload the positive and negative tweet corpora.
all_positive_tweets, all_negative_tweets = (
    twitter_samples.strings(corpus_file)
    for corpus_file in ('positive_tweets.json', 'negative_tweets.json')
)

# Same split as before: the first 3500 tweets of each class train, the rest test.
train_pos, test_pos = all_positive_tweets[:3500], all_positive_tweets[3500:]
train_neg, test_neg = all_negative_tweets[:3500], all_negative_tweets[3500:]
train_x = train_pos + train_neg
test_x = test_pos + test_neg

# Labels are flat 1-D vectors here: ones for positive tweets, then zeros for negative ones.
train_y = np.concatenate((np.ones(len(train_pos)), np.zeros(len(train_neg))))
test_y = np.concatenate((np.ones(len(test_pos)), np.zeros(len(test_neg))))

In [18]:
def count_tweets(result, tweets, ys):
    """Add (word, label) occurrence counts for ``tweets`` into ``result``.

    ``result`` is updated in place and also returned. Each tweet is paired
    with the label at the same position in ``ys`` and split into words by
    ``process_tweet``.
    """
    for label, tweet in zip(ys, tweets):
        for token in process_tweet(tweet):
            key = (token, label)
            result[key] = result.get(key, 0) + 1
    return result

In [19]:
def train_naive_bayes(freqs, train_x, train_y):
    """Estimate the Naive Bayes log prior and per-word log likelihood ratios.

    Args:
        freqs: dict mapping ``(word, label)`` to an occurrence count; a label
            greater than zero is treated as positive, anything else as negative.
        train_x: the training tweets (not used by the computation; kept so
            the signature stays unchanged).
        train_y: the training labels, one per tweet.

    Returns:
        ``(logprior, loglikelihood)`` where ``logprior`` is
        ``log(#positive labels) - log(#negative labels)`` and
        ``loglikelihood[word]`` is ``log(P(word | pos) / P(word | neg))``
        computed with add-one (Laplace) smoothing.
    """
    # V: number of distinct words in the vocabulary.
    vocab = {pair[0] for pair in freqs}
    V = len(vocab)

    # Total word occurrences in each class.
    N_pos = N_neg = 0
    for pair, count in freqs.items():
        if pair[1] > 0:
            N_pos += count
        else:
            N_neg += count

    # Number of positive / negative training documents.
    labels = np.asarray(train_y)
    D_pos = int(np.sum(labels > 0))
    D_neg = int(np.sum(labels <= 0))
    logprior = np.log(D_pos) - np.log(D_neg)

    loglikelihood = {}
    for word in vocab:
        # Compute the smoothed probability of the word in each class; a word
        # never seen in a class has a count of zero there.
        p_w_pos = (freqs.get((word, 1), 0) + 1) / (N_pos + V)
        p_w_neg = (freqs.get((word, 0), 0) + 1) / (N_neg + V)
        loglikelihood[word] = np.log(p_w_pos / p_w_neg)
    return logprior, loglikelihood

In [20]:
def lookup(freqs, word, label):
    """Return the count stored in ``freqs`` for ``(word, label)``, or 0 if absent."""
    return freqs.get((word, label), 0)

In [21]:
# Train Naive Bayes on the (word, label) counts in `freqs`, then show the log prior
# and the number of words that received a log likelihood.
logprior, loglikelihood = train_naive_bayes(freqs, train_x, train_y)
print(logprior)
print(len(loglikelihood))

0.0
8304


In [22]:
def naive_bayes_predict(tweet, logprior, loglikelihood):
    """Score a tweet: ``logprior`` plus the log likelihoods of its known words.

    The tweet is split into words by ``process_tweet``; words that are not
    keys of ``loglikelihood`` add nothing. Callers in this notebook treat a
    score above zero as the positive class.
    """
    score = 0 + logprior
    for token in process_tweet(tweet):
        # Unknown words fall back to 0, leaving the running score untouched.
        score += loglikelihood.get(token, 0)
    return score

In [23]:
def test_naive_bayes(test_x, test_y, logprior, loglikelihood):
    """Return the Naive Bayes accuracy on ``test_x`` against labels ``test_y``.

    A tweet is predicted as 1 when ``naive_bayes_predict`` returns a value
    above zero and as 0 otherwise. Accuracy is one minus the mean absolute
    difference between predictions and labels.
    """
    y_hats = [
        1 if naive_bayes_predict(tweet, logprior, loglikelihood) > 0 else 0
        for tweet in test_x
    ]
    mean_abs_error = np.mean(np.absolute(y_hats - test_y))
    return 1 - mean_abs_error

In [24]:
# Report the Naive Bayes accuracy on the held-out test tweets.
print("Naive Bayes accuracy = %0.4f" %
      (test_naive_bayes(test_x, test_y, logprior, loglikelihood)))

Naive Bayes accuracy = 0.9920


In [25]:
# Error analysis: list each test tweet whose predicted class differs from its label.
print('Truth Predicted Tweet')
for x, y in zip(test_x, test_y):
    y_hat = naive_bayes_predict(x, logprior, loglikelihood)
    predicted_positive = np.sign(y_hat) > 0
    if y != predicted_positive:
        cleaned = ' '.join(process_tweet(x)).encode('ascii', 'ignore')
        print('%d\t%0.2f\t%s' % (y, predicted_positive, cleaned))

Truth Predicted Tweet
1	0.00	b'fruiti vodka alway help'
1	0.00	b'ate last cooki shir 0 >:d'
1	0.00	b"say want luci gone soon lisa oh that' horribl sorri :-) ye back soon pleas x"
1	0.00	b''
1	0.00	b''
1	0.00	b'truli later move know queen bee upward bound movingonup'
1	0.00	b'movi key life japanes version'
1	0.00	b'new report talk burn calori cold work harder warm feel better weather :p'
1	0.00	b'harri niall 94 harri born ik stupid wanna chang :D'
1	0.00	b''
1	0.00	b''
1	0.00	b'park get sunlight'
1	0.00	b'uff itna miss karhi thi ap :p'
0	1.00	b'rohingya muslim 72 indict human traffick charg thailand asia around ...'
0	1.00	b'screenshot'
0	1.00	b' love spot robinhood wild thank support robinhoodi  '
0	1.00	b'look good pal glad paid  111 jersey gilet'
0	1.00	b'biodivers taxonom infrastructur intern collabor new speci discoveri'
0	1.00	b'srsli fuck u unfollow hope ur futur child unpar u >:-('
0	1.00	b'bianca ur one bun'
0	1.00	b'hello info possibl interest jonatha close join beti :( great'