# Aim:
* Extract features for logistic regression given some text
* Implement logistic regression from scratch
* Apply logistic regression on a natural language processing task
* Test logistic regression

We will be using a data set of tweets.

## Import functions and data

In [1]:
import nltk
from nltk.corpus import twitter_samples 
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

Instructions for updating:
non-resource variables are not supported in the long term


In [2]:

nltk.download('twitter_samples')
nltk.download('stopwords')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
import re
import string
import numpy as np

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

In [4]:
#process_tweet(): cleans the text, tokenizes it into separate words, removes stopwords, and converts words to stems.
def process_tweet(tweet):
    """Process tweet function.
    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet

    """
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')

    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets


    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
      if(word not in stopwords_english and word not in string.punctuation):
        stem_word = stemmer.stem(word)
        tweets_clean.append(stem_word)
            #############################################################
            # 1 remove stopwords
            # 2 remove punctuation
            # 3 stemming word
            # 4 Add it to tweets_clean
    return tweets_clean

In [5]:
#build_freqs counts how often a word in the 'corpus' (the entire set of tweets) was associated with
  # a positive label '1'         or 
  # a negative label '0', 

#then builds the freqs dictionary, where each key is a (word,label) tuple, 

#and the value is the count of its frequency within the corpus of tweets.

def build_freqs(tweets, ys):
    """Build frequencies.
    Input:
        tweets: a list of tweets
        ys: an m x 1 array with the sentiment label of each tweet
            (either 0 or 1)
    Output:
        freqs: a dictionary mapping each (word, sentiment) pair to its
        frequency
    """
    # Convert np array to list since zip needs an iterable.
    # The squeeze is necessary or the list ends up with one element.
    # Also note that this is just a NOP if ys is already a list.
    yslist = np.squeeze(ys).tolist()

    # Start with an empty dictionary and populate it by looping over all tweets
    # and over all processed words in each tweet.
    freqs = {}

    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            
            #############################################################
            #Update the count of pair if present, set it to 1 otherwise
            if pair in freqs:
              freqs[pair] += 1
            else:
              freqs[pair] = 1

    return freqs

### Prepare the data
* The `twitter_samples` contains subsets of 5,000 positive tweets, 5,000 negative tweets, and the full set of 10,000 tweets.  

In [6]:
# select the set of positive and negative tweets
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

* Train test split: 20% will be in the test set, and 80% in the training set.


In [7]:
# split the data into two pieces, one for training and one for testing
#############################################################
test_pos = all_positive_tweets[4000:]
train_pos = all_positive_tweets[:4000]
test_neg = all_negative_tweets[4000:]
train_neg = all_negative_tweets[:4000]
train_x = train_pos + train_neg
test_x = test_pos + test_neg


* Create the numpy array of positive labels and negative labels.

In [8]:
# combine positive and negative labels
train_y = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0)
test_y = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)

Final_data = all_positive_tweets+all_negative_tweets
data =np.append(np.ones((len(all_positive_tweets), 1)), np.zeros((len(all_negative_tweets), 1)), axis=0)
train_x,test_x,train_y,test_y = train_test_split(Final_data,data,test_size=0.25,random_state= 26)


* Create the frequency dictionary using the  `build_freqs()` function.  
    


In [9]:
# create frequency dictionary
#############################################################
freqs = build_freqs(train_x,train_y)

# check the output
print("type(freqs) = " + str(type(freqs)))
print("len(freqs) = " + str(len(freqs.keys())))

type(freqs) = <class 'dict'>
len(freqs) = 10899


* HERE, The `freqs` dictionary is the frequency dictionary that's being built. 
* The key is the tuple (word, label), such as ("happy",1) or ("happy",0).  The value stored for each key is the count of how many times the word "happy" was associated with a positive label, or how many times "happy" was associated with a negative label.

## Extracting the features

* Given a list of tweets, extract the features and store them in a matrix. You will extract two features.
    * The first feature is the number of positive words in a tweet.
    * The second feature is the number of negative words in a tweet. 
* Then train your logistic regression classifier on these features.
* Test the classifier on a validation set. 


In [26]:
def extract_features(tweet, freqs):
    '''
    Input: 
        tweet: a list of words for one tweet
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
    Output: 
        x: a feature vector of dimension (1,3)
    '''
    # tokenizes, stems, and removes stopwords
    #############################################################
    word_l = process_tweet(tweet)
    
    # 3 elements in the form of a 1 x 3 vector
    x = np.zeros((1, 2)) 
    
    #bias term is set to 1
    x[0,0] = 1 
        
    # loop through each word in the list of words
    for word in word_l:
        
        # increment the word count for the positive label 1
        #############################################################
        if((word,1) in freqs):
          x[0,0]+=freqs[word,1]
        # increment the word count for the negative label 0
        #############################################################
          if((word,0) in freqs):
            x[0,1]+=freqs[word,0]
        
    
    assert(x.shape == (1, 2))
    return x[0]

In [34]:
def predict_tweet(tweet):
  with tf.Session() as sess:
      saver.restore(sess,save_path='TSession')
      data_i=[]
      for t in tweet:
        data_i.append(extract_features(t,freqs))
      data_i=np.asarray(data_i)
      return sess.run(tf.nn.sigmoid(tf.add(tf.matmul(a=data_i,b=W,transpose_b=True),bias)))
      print("--Fail--")
      return
  '''
    Input: 
        tweet: a string
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
        theta: (3,1) vector of weights
    Output: 
        y_pred: the probability of a tweet being positive or negative
    '''
    
    # extract the features of the tweet and store it into x
    #############################################################
    #x = extract_features(tweet,freqs)
    
    # make the prediction using x and theta
    #############################################################
    #y_pred = sigmoid(np.dot(x,theta))
    
    
    #return y_pred

In [35]:
bias=tf.Variable(np.random.randn(1),name="Bias")
W=tf.Variable(np.random.randn(1,2),name="Weight")

In [36]:
data=[]
for t in train_x:
  data.append(extract_features(t,freqs))
data=np.asarray(data)

In [37]:
Y_hat = tf.nn.sigmoid(tf.add(tf.matmul(np.asarray(data), W,transpose_b=True), bias)) 
ta=np.asarray(train_y)
Total_cost = tf.nn.sigmoid_cross_entropy_with_logits(logits = Y_hat, labels = ta) 
print(Total_cost)

Tensor("logistic_loss_2:0", shape=(7500, 1), dtype=float64)


In [38]:
# Gradient Descent Optimizer 
optimizer = tf.train.GradientDescentOptimizer(learning_rate = 0.00001 ,name="GradientDescent").minimize(Total_cost) 
# Global Variables Initializer 
init = tf.global_variables_initializer()

In [39]:
saver = tf.train.Saver()
with tf.Session() as sess:
  
  sess.run(init)
  print("Bias",sess.run(bias))
  print("Weight",sess.run(W))
  for epoch in range(1000):
    sess.run(optimizer)
    preds=sess.run(Y_hat)
    acc=((preds==ta).sum())/len(train_y)
    Accuracy=[]
    repoch=False
    if repoch:
      Accuracy.append(acc)
    if epoch % 1000 == 0:
      print("Accuracy",acc)
    saved_path = saver.save(sess, 'TSession')

Bias [-0.36965928]
Weight [[-1.09662566  1.64012829]]
Accuracy 0.0088


In [40]:
preds=predict_tweet(test_x)
print(preds,len(test_y))

INFO:tensorflow:Restoring parameters from TSession
[[1.00000000e+000]
 [0.00000000e+000]
 [2.62829261e-214]
 ...
 [0.00000000e+000]
 [0.00000000e+000]
 [0.00000000e+000]] 2500


In [41]:
def accuracy(x,y):
  return ((x==y).sum())/len(y)

In [42]:
print(accuracy(preds,test_y))

0.0088
