#Lab Assignment:

##Replace Manual version of Logistic Regression with TF based version. 
####[Reference : Lab-6]

## Import functions and data

In [1]:
import nltk
from nltk.corpus import twitter_samples 
import pandas as pd

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Downloading dataset from the library
nltk.download('twitter_samples')
nltk.download('stopwords')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
import re
import string
import numpy as np

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

In [5]:
#process_tweet(): cleans the text, tokenizes it into separate words, removes stopwords, and converts words to stems.
def process_tweet(tweet):
    """Process tweet function.
    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet
    """
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')

    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets

    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)
    tweets_clean = []
    for word in tweet_tokens:
            #############################################################
            # 1 remove stopwords
            # 2 remove punctuation
            # 3 stemming word
            # 4 Add it to tweets_clean
            if word not in stopwords_english and word not in string.punctuation:
                tweets_clean.append(stemmer.stem(word))

    return tweets_clean

In [6]:
#build_freqs counts how often a word in the 'corpus' (the entire set of tweets) was associated with
  # a positive label '1'         or 
  # a negative label '0', 
#then builds the freqs dictionary, where each key is a (word,label) tuple, 
#and the value is the count of its frequency within the corpus of tweets.

def build_freqs(tweets, ys):
    """Build frequencies.
    Input:
        tweets: a list of tweets
        ys: an m x 1 array with the sentiment label of each tweet
            (either 0 or 1)
    Output:
        freqs: a dictionary mapping each (word, sentiment) pair to its
        frequency
    """
    # Convert np array to list since zip needs an iterable.
    # The squeeze is necessary or the list ends up with one element.
    # Also note that this is just a NOP if ys is already a list.
    yslist = np.squeeze(ys).tolist()

    # Start with an empty dictionary and populate it by looping over all tweets
    # and over all processed words in each tweet.
    freqs = {}
    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)            
            #############################################################
            #Update the count of pair if present, set it to 1 otherwise
            freqs[pair] = freqs.get(pair, 0) + 1
    return freqs

### Prepare the data
* The `twitter_samples` contains subsets of 5,000 positive tweets, 5,000 negative tweets, and the full set of 10,000 tweets.  

In [7]:
# select the set of positive and negative tweets
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

* Train test split: 20% will be in the test set, and 80% in the training set.


In [8]:
# split the data into two pieces, one for training and one for testing
#############################################################
# Splitting it manually
train_pos = all_positive_tweets[:4000]
test_pos = all_positive_tweets[4000:]
train_neg = all_negative_tweets[:4000]
test_neg = all_negative_tweets[4000:]

train_x = train_pos + train_neg
test_x = test_pos + test_neg

* Create the numpy array of positive labels and negative labels.

In [9]:
# combine positive and negative labels
train_y = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0)
test_y = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)

* Create the frequency dictionary using the  `build_freqs()` function.  
    


In [10]:
# Example
print('This is an example of a positive tweet: \n', train_x[0])
print('\nThis is an example of the processed version of the tweet: \n', process_tweet(train_x[0]))

This is an example of a positive tweet: 
 #FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)

This is an example of the processed version of the tweet: 
 ['followfriday', 'top', 'engag', 'member', 'commun', 'week', ':)']


In [11]:
# create frequency dictionary
freqs = build_freqs(train_pos, train_y)

# check the output
print("type(freqs) = " + str(type(freqs)))
print("len(freqs) = " + str(len(freqs.keys())))
print(freqs)

type(freqs) = <class 'dict'>
len(freqs) = 5735
{('followfriday', 1.0): 23, ('top', 1.0): 30, ('engag', 1.0): 7, ('member', 1.0): 14, ('commun', 1.0): 27, ('week', 1.0): 72, (':)', 1.0): 2847, ('hey', 1.0): 60, ('jame', 1.0): 7, ('odd', 1.0): 2, (':/', 1.0): 5, ('pleas', 1.0): 80, ('call', 1.0): 27, ('contact', 1.0): 4, ('centr', 1.0): 1, ('02392441234', 1.0): 1, ('abl', 1.0): 6, ('assist', 1.0): 1, ('mani', 1.0): 28, ('thank', 1.0): 504, ('listen', 1.0): 14, ('last', 1.0): 39, ('night', 1.0): 55, ('bleed', 1.0): 2, ('amaz', 1.0): 41, ('track', 1.0): 5, ('scotland', 1.0): 2, ('congrat', 1.0): 15, ('yeaaah', 1.0): 1, ('yipppi', 1.0): 1, ('accnt', 1.0): 2, ('verifi', 1.0): 2, ('rqst', 1.0): 1, ('succeed', 1.0): 1, ('got', 1.0): 57, ('blue', 1.0): 8, ('tick', 1.0): 1, ('mark', 1.0): 1, ('fb', 1.0): 4, ('profil', 1.0): 2, ('15', 1.0): 4, ('day', 1.0): 187, ('one', 1.0): 90, ('irresist', 1.0): 2, ('flipkartfashionfriday', 1.0): 16, ('like', 1.0): 187, ('keep', 1.0): 55, ('love', 1.0): 336, (

In [12]:
def extract_features(tweet, freqs):
    '''
    Input: 
        tweet: a list of words for one tweet
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
    Output: 
        x: a feature vector of dimension (1,3)
    '''
    # tokenizes, stems, and removes stopwords
    word_l = process_tweet(tweet)
    
    # 3 elements in the form of a 1 x 3 vector
    x = np.zeros((1, 3)) 
    
    #bias term is set to 1
    x[0,0] = 1
        
    # loop through each word in the list of words
    for word in word_l:
        # increment the word count for the positive label 1
        x[0,1] += freqs.get((word, 1), 0) 
        # increment the word count for the negative label 0
        x[0,2] += freqs.get((word, 0), 0)
    assert(x.shape == (1, 3))
    return x

In [13]:
for i in range(len(train_x)):
  train_x[i] = extract_features(train_x[i], freqs)

In [14]:
for i in range(len(test_x)):
  test_x[i] = extract_features(test_x[i], freqs)

In [16]:
import tensorflow as tf

train_x = tf.cast(train_x, tf.float32)
test_x = tf.cast(test_x, tf.float32)

##Tensorflow implementation of Logistic Regression

In [17]:
# Import the libraries
from __future__ import absolute_import, division, print_function
import tensorflow as tf
import numpy as np

In [18]:
# Tweets dataset parameters.
num_classes = 2
num_features = 3

# Training parameters.
learning_rate = 0.01
training_steps = 1000
batch_size = 200
display_step = 50

In [19]:
# Use tf.data API to shuffle and batch data.
train_data = tf.data.Dataset.from_tensor_slices((train_x, train_y))
train_data = train_data.repeat().shuffle(8000).batch(batch_size).prefetch(1)

In [20]:
# Weight of shape [784, 10], the 28*28 image features, and a total number of classes.
w = tf.Variable(tf.ones([num_features, num_classes]), name="weight")
# Bias of shape [10], the total number of classes.
b = tf.Variable(tf.zeros([num_classes]), name="bias")

In [21]:
# Logistic regression (Wx + b).
def logistic_regression(x):
  # Apply softmax to normalize the logits to a probability distribution.
  # x = tf.cast(x, tf.float32)
  return tf.nn.softmax(tf.matmul(x, w) + b)

# Cross-Entropy loss function.
def cross_entropy(y_pred, y_true):
  # Encode label to a one hot vector.
  # y_true = tf.one_hot(y_true, depth=num_classes)
  # Clip prediction values to avoid log(0) error.
  y_pred = tf.clip_by_value(y_pred, 1e-7, 1.)
  # Compute cross-entropy.
  return tf.reduce_mean(-tf.reduce_sum(y_true * tf.math.log(y_pred)))

In [22]:
# Accuracy metric.
def accuracy(y_pred, y_true):
  # Predicted class is the index of the highest score in prediction vector (i.e. argmax).
  correct = tf.equal(tf.argmax(y_pred, 1), tf.cast(y_true, tf.int64))
  return tf.reduce_mean(tf.cast(correct, tf.float32))

# SGD Optimizer
optimizer = tf.optimizers.SGD(learning_rate)

In [23]:
# Optimization process. 

def run_optimization(x, y):
  # Wrap computation inside a GradientTape for automatic differentiation.
  with tf.GradientTape() as g:
    pred = logistic_regression(x)
    loss = cross_entropy(pred, y)
  # Compute gradients.
  gradients = g.gradient(loss, [w, b])
  # Update W and b following gradients.
  optimizer.apply_gradients(zip(gradients, [w, b]))

In [24]:
# Run training for the given number of steps.
for step, (batch_x, batch_y) in enumerate(train_data.take(training_steps), 1):
  # Run the optimization to update W and b values.
  batch_x = tf.cast(batch_x, tf.float32)
  batch_y = tf.cast(batch_y, tf.float32)
  run_optimization(batch_x, batch_y)
  if step % display_step == 0:
    #Obtain Predictions       
    pred = logistic_regression(batch_x)
    #Ccompute loss
    loss = cross_entropy(pred, batch_y)
    #Compute Accuracy
    acc = accuracy(pred, batch_y)
    #print accuracy
    print("step : %i, loss: %f, accuracy: %f" % (step, loss, acc))

step : 50, loss: 31607.468750, accuracy: 0.430000
step : 100, loss: 35211.828125, accuracy: 0.365000
step : 150, loss: 27171.335938, accuracy: 0.510000
step : 200, loss: 19962.619141, accuracy: 0.640000
step : 250, loss: 26894.078125, accuracy: 0.515000
step : 300, loss: 33548.281250, accuracy: 0.395000
step : 350, loss: 23844.234375, accuracy: 0.570000
step : 400, loss: 19962.619141, accuracy: 0.640000
step : 450, loss: 27448.593750, accuracy: 0.505000
step : 500, loss: 37152.636719, accuracy: 0.330000
step : 550, loss: 25785.042969, accuracy: 0.535000
step : 600, loss: 22180.685547, accuracy: 0.600000
step : 650, loss: 28003.109375, accuracy: 0.495000
step : 700, loss: 34934.570312, accuracy: 0.370000
step : 750, loss: 28557.625000, accuracy: 0.485000
step : 800, loss: 22180.685547, accuracy: 0.600000
step : 850, loss: 26616.818359, accuracy: 0.520000
step : 900, loss: 34657.312500, accuracy: 0.375000
step : 950, loss: 27725.851562, accuracy: 0.500000
step : 1000, loss: 22735.203125,

In [25]:
# Test model on validation set.
pred = logistic_regression(test_x)
print("Test accuracy : %f" % accuracy(pred, test_y))

Test accuracy : 0.500000
