# **Sentiment Analysis**: NLP, Text Embedding, and RNN

Source:  [https://github.com/d-insight/code-bank.git](https://github.com/d-insight/code-bank.git)  
License: [MIT License](https://opensource.org/licenses/MIT). See open source [license](LICENSE) in the Code Bank repository. 

-------------

## Overview

Sentiment analysis is a challenging subject in machine learning. People express their emotions in a way that can be very ambiguous for both humans and computers. In this demo, we analyze sentiments of a set of IMDB movie reviews. The dataset consists of review texts as well as a binary sentiment label (1: positive, 0: negative). 

<img src="http://barnraisersllc.com/wp-content/uploads/2017/01/Sentiment-Analysis.jpg" width="500" height="500" align="center"/>

Image source: http://barnraisersllc.com/wp-content/uploads/2017/01/Sentiment-Analysis.jpg

Dataset source: *Andrew L. Maas, Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew Y. Ng, and Christopher Potts. (2011). "Learning Word Vectors for Sentiment Analysis." The 49th Annual Meeting of the Association for Computational Linguistics (ACL 2011).*

-------------

## **Part 0**: Setup

In [None]:
# Put all import statements at the top of your notebook

# Standard imports
import pandas as pd
import numpy  as np
from bs4 import BeautifulSoup 
import re

# Data science packages
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection         import KFold, GridSearchCV, cross_val_score
from sklearn.ensemble                import RandomForestClassifier
from sklearn.pipeline                import Pipeline
from sklearn.model_selection         import train_test_split

# Neural networks
from tensorflow.keras.models                 import Sequential, load_model
from tensorflow.keras.layers                 import Dense, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.preprocessing.text     import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.utils.vis_utils import model_to_dot

# Text processing packages
import nltk
import nltk.data
from nltk.corpus import stopwords 
from nltk.stem   import SnowballStemmer
import gensim
from gensim.models import doc2vec, word2vec

# Visualization packages
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from IPython.display import SVG

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

%matplotlib inline


In [None]:
# Set constants 

# Seed used as random_state for the random forest pipelines and the train/test split
SEED = 10

# Name of the scoring metric handed to nested_cv (and from there to scikit-learn)
SCORE = 'accuracy'

In [None]:
# Nested cross-validation helper
def nested_cv(X, y, est_pipe, p_grid, p_score, n_splits_inner = 3, n_splits_outer = 3, n_cores = 1, seed = 0):
    """Score an estimator with nested cross-validation and report the result.

    An inner grid search tunes the hyper-parameters in ``p_grid``; an outer
    loop scores the tuned estimator. Both loops use a shuffled (non-stratified)
    ``KFold`` seeded with the same ``seed``.

    Parameters
    ----------
    X, y : feature container and target, passed through to scikit-learn.
    est_pipe : estimator (e.g. a Pipeline) whose parameters are tuned.
    p_grid : parameter grid searched by the inner loop.
    p_score : scoring used by both the inner and the outer loop.
    n_splits_inner, n_splits_outer : number of folds of each loop.
    n_cores : ``n_jobs`` value for the grid search and the outer loop.
    seed : ``random_state`` of both fold splitters.

    Returns
    -------
    tuple
        Mean of the outer-loop scores and 1.96 times their standard deviation.
    """
    # Inner splitter first, then outer, both shuffled with the same seed
    tuning_folds = KFold(n_splits = n_splits_inner, shuffle = True, random_state = seed)
    scoring_folds = KFold(n_splits = n_splits_outer, shuffle = True, random_state = seed)

    # The grid search is itself the estimator evaluated by the outer loop
    tuned_estimator = GridSearchCV(
        estimator = est_pipe,
        param_grid = p_grid,
        cv = tuning_folds,
        scoring = p_score,
        n_jobs = n_cores,
    )
    outer_scores = cross_val_score(
        estimator = tuned_estimator,
        X = X,
        y = y,
        cv = scoring_folds,
        scoring = p_score,
        n_jobs = n_cores,
    )

    mean_score = outer_scores.mean()
    spread = outer_scores.std() * 1.96
    print('Average score: %0.4f (+/- %0.4f)' % (mean_score, spread))

    return mean_score, spread

# Turn one raw review into a cleaned list of word stems
def review_to_wordlist(review):
    """Return the stemmed, stop-word-free tokens of a raw review.

    Steps: strip markup with BeautifulSoup, replace every non-letter by a
    space, lower-case and split on whitespace, drop NLTK English stop words,
    and stem what remains with the English Snowball stemmer.
    """
    plain_text = BeautifulSoup(review).get_text()
    letters_only = re.sub("[^a-zA-Z]"," ", plain_text)
    tokens = letters_only.lower().split()

    # A set makes the membership test below cheap
    stop_set = set(stopwords.words("english"))
    snowball = SnowballStemmer('english')

    return [snowball.stem(token) for token in tokens if token not in stop_set]

# Split a review into sentences, each represented as a cleaned word list
def review_to_sentences(review, tokenizer):
    """Return one cleaned word list per non-empty sentence of ``review``.

    ``tokenizer`` must expose ``tokenize(text)`` returning the raw sentences;
    each of them is cleaned with ``review_to_wordlist``.
    """
    return [
        review_to_wordlist(chunk)
        for chunk in tokenizer.tokenize(review.strip())
        if len(chunk) > 0
    ]

# Function to average all of the word vectors in a given paragraph
def makeFeatureVec(words, model, num_features):
    """Average the vectors of the words of one paragraph.

    Parameters
    ----------
    words : iterable of str
        Tokens of the paragraph.
    model : object exposing ``model.wv.index2word`` (the vocabulary) and
        ``model.wv[word]`` (the vector of a word), e.g. a trained Word2Vec model.
    num_features : int
        Dimensionality of the word vectors.

    Returns
    -------
    numpy.ndarray
        float32 vector of length ``num_features``: the mean of the vectors of
        all words found in the vocabulary, or all zeros when none is found.
    """
    # Start from a zero vector; it is also the result when nothing matches
    featureVec = np.zeros((num_features,), dtype='float32')

    # The vocabulary is turned into a set so membership tests are cheap
    index2word_set = set(model.wv.index2word)

    # Keep only the words the model knows (duplicates count every time)
    known_words = [word for word in words if word in index2word_set]

    # Without this guard the division below would be 0/0 and yield NaNs
    if not known_words:
        return featureVec

    # Look vectors up through model.wv, the same object the vocabulary came from
    for word in known_words:
        featureVec = np.add(featureVec, model.wv[word])

    # Divide the sum by the number of matched words to get the average
    return np.divide(featureVec, float(len(known_words)))

# Build the matrix of averaged word vectors, one row per review
def getAvgFeatureVecs(reviews, model, num_features):
    """Return a 2D float32 array holding the average word vector of each review.

    ``reviews`` is a sequence of word lists; row ``i`` of the result is
    ``makeFeatureVec(reviews[i], model, num_features)``. A progress line is
    printed for every 1000th review, starting with the first one.
    """
    total = len(reviews)

    # Allocate the whole result up front and fill it row by row
    matrix = np.zeros((total, num_features), dtype='float32')

    for row, review in enumerate(reviews):
        if row % 1000 == 0:
            print ('Review %d of %d' % (row, total))
        matrix[row] = makeFeatureVec(review, model, num_features)

    return matrix

## **Part 1**: Data Preprocessing and EDA

In [None]:
# Load data
# The file is tab-separated with a header row. quoting=3 is csv.QUOTE_NONE,
# so quote characters in the file are kept as literal text.
data = pd.read_csv('sentiment_data.tsv', header=0, delimiter='\t', quoting=3)

In [None]:
# Dimensions of the loaded DataFrame as (rows, columns)
data.shape

In [None]:
# Preview the first rows of the DataFrame
data.head()

In [None]:
# Column names of the DataFrame as an array
data.columns.values

In [None]:
# A sample observation: id, sentiment label and raw review text of the row labelled 0
print ('id: \t\t', data['id'][0])
print ('sentiment: \t', data['sentiment'][0], '\n')
print (data['review'][0])

In [None]:
# Distribution of target: histogram of the 'sentiment' column
pd.DataFrame.hist(data,column='sentiment')

In [None]:
# Visualize word distributions using the word cloud library

# Concatenate all rows of the review column into one space-separated string
text = data['review'].str.cat(sep=' ')

# Generate the word cloud image once. Only the cloud built with
# max_font_size=60 is displayed, so no other cloud is generated beforehand.
wordcloud = WordCloud(max_font_size=60).generate(text)

# Display the generated image without axes
plt.figure(figsize=(20, 10))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

In [None]:
# Check for missing values and drop corresponding rows, if any.
# Only `how` is passed: recent pandas versions raise a TypeError when both
# `how` and `thresh` are supplied, even if `thresh` is None.
data.dropna(axis=0, how='any', inplace=True)

# Re-number the rows from 0 so that label-based look-ups such as
# data['review'][0] or data['review'][i] still work after rows were dropped
data.reset_index(drop=True, inplace=True)
print(data.shape)

In [None]:
# Remove HTML tags (demonstrated on the review of the row labelled 0)
# NOTE(review): no parser is named, so BeautifulSoup picks one itself — confirm this is intended
data_no_html = BeautifulSoup(data['review'][0])  

# Show the text before and after the markup is stripped
print('Original:\n{}\n'.format(data['review'][0]))
print('Without HTML tags:\n{}\n'.format(data_no_html.get_text()))

In [None]:
# Keep only alphabetical terms: every character outside a-z / A-Z
# (digits, punctuation, ...) is replaced by a single space
data_no_digits = re.sub('[^a-zA-Z]', ' ', data_no_html.get_text())  
print('No digits:\n{}'.format(data_no_digits))

In [None]:
# Lower case and separate into tokens
lower_case = data_no_digits.lower()        

# split() without an argument collapses runs of whitespace, so the consecutive
# spaces left by the previous step do not produce empty-string tokens
# (this matches the tokenisation used in review_to_words)
words = lower_case.split()
print('Lower-cased and token-separated:\n{}'.format(words))

In [None]:
# Download the stop words dataset of NLTK library (read later via stopwords.words('english'))
nltk.download('stopwords')

In [None]:
# Remove stop words

# Read the stop word list once instead of once per token
english_stops = stopwords.words('english')
print('Stop words in NLTK:\n{}\n'.format(english_stops))

# Membership tests against a set are much faster than against a list
stop_set = set(english_stops)
words_no_stop = [w for w in words if w not in stop_set]
print('Without stop words:\n{}'.format(words_no_stop))

In [None]:
# Stem words

# English Snowball stemmer; this module-level object is also the one
# review_to_words uses when it is called later
stemmer = SnowballStemmer('english')

# A few individual examples of what stemming does to a word
print('grows  --> {}'.format(stemmer.stem('grows')))
print('leaves --> {}'.format(stemmer.stem('leaves')))
print('fairly --> {}\n'.format(stemmer.stem('fairly')))

# Stem every remaining token of the sample review
words_stemmed = [stemmer.stem(w) for w in words_no_stop]
print('Stemmed:\n{}'.format(words_stemmed))

In [None]:
# Convert a raw movie review (one string) into a preprocessed review
# (one string of space-separated word stems)

def review_to_words(raw_review):
    """Return the cleaned review as a single space-separated string.

    Markup is stripped with BeautifulSoup, non-letters become spaces, the text
    is lower-cased and split on whitespace, NLTK English stop words are
    removed and the remaining tokens are stemmed with the module-level
    ``stemmer`` object.
    """
    text_without_markup = BeautifulSoup(raw_review).get_text() 
    alpha_only = re.sub("[^a-zA-Z]", " ", text_without_markup) 
    tokens = alpha_only.lower().split()

    # Set look-ups are much faster than list look-ups
    stop_set = set(stopwords.words("english"))

    # Filter and stem lazily; join consumes the generator in token order
    stems = (stemmer.stem(token) for token in tokens if token not in stop_set)
    return " ".join(stems)

In [None]:
# An example of text preprocessing: run the full cleaning function on the
# review of the row labelled 0 and compare it with the raw text

clean_review = review_to_words(data['review'][0])

print('Original:\n{}\n'.format(data['review'][0]))
print('Cleaned:\n{}'.format(clean_review))

In [None]:
# Clean and parse all movie reviews
num_reviews = data['review'].size
clean_data_reviews = []

# Iterate over the review values themselves rather than looking them up by
# label, so the loop also works when the index is not exactly 0..n-1
# (for instance after rows have been dropped)
for i, raw_review in enumerate(data['review']):
    clean_data_reviews.append(review_to_words(raw_review))
    
    # Every 1000 reviews, print a message to show progress
    if (i+1) % 1000 == 0:
        print('Review {} of {}\t{}%'.format(i+1, num_reviews, round((i+1)/num_reviews*100, 0)))
    
# Sanity check: exactly one cleaned review per row
assert len(clean_data_reviews) == len(data), 'Error: the number of cleaned reviews does not match.'

## **Part 2**: Feature Extraction using TFIDF

In [None]:
# Transform the pre-processed reviews into TF-IDF features,
# keeping at most 5000 terms (max_features)
vectorizer = TfidfVectorizer(max_features = 5000)
features = vectorizer.fit_transform(clean_data_reviews)
# Convert the result of fit_transform into a dense NumPy array
features = features.toarray()
print (features.shape)

In [None]:
# Get feature names (each term is a feature)
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use the new method when the installed version has
# it and fall back to the old one otherwise. Either way vocab is a list.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()
print (vocab[0])

In [None]:
# Print the first 10 vocabulary words and their TF-IDF values summed over all reviews

# Column-wise sum: one total per vocabulary term
dist = np.sum(features, axis=0)

# Slicing the vocabulary limits the output to exactly 10 rows
for tag, count in zip(vocab[:10], dist):
    print(tag.ljust(15), count)

In [None]:
# See row 0 of the feature matrix, i.e. the TF-IDF vector of the first review
features[0]

## **Part 3**: Classification using Random Forest with TFIDF Features

In [None]:
# Define the target: the 'sentiment' column of the dataset

target = data['sentiment']

In [None]:
%%time

# Define pipeline: a single step named 'rf_clf' holding a random forest classifier
estimators = []
estimators.append(('rf_clf', RandomForestClassifier()))
rf_pipe = Pipeline(estimators)
# Fix the classifier's random_state so runs are repeatable
rf_pipe.set_params(rf_clf__random_state = SEED)

# Setup possible values of parameters to optimize over: 10, 20, 30, 40 and 50 trees
p_grid = {"rf_clf__n_estimators": [int(i) for i in np.linspace(10.0, 50.0, 5)]}

# Nested cross-validation on the TF-IDF features; n_cores is forwarded as n_jobs
acc_rf, std_rf = nested_cv(X = features, y = target, est_pipe = rf_pipe, p_grid = p_grid, p_score = SCORE, n_cores = -1)

## **Part 4**: Word Vectors - Word2Vec

A popular vectorization method for words is a technique known as Word2Vec, which is implemented in the `gensim` library. In Word2Vec, each word is assigned a low-dimensional vector that is learned under the assumption that words appearing close to each other in a document are semantically related. Word2Vec can be used as a basis for vectorizing documents in low dimensions. You can read more about it here: https://cs224d.stanford.edu/lectures/CS224d-Lecture2.pdf.

In [None]:
# Download the punkt tokenizer for sentence splitting
nltk.download('punkt')

# Load the English punkt tokenizer; it is passed to review_to_sentences below
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [None]:
# Extract all sentences from all reviews into one flat list;
# each sentence is a list of cleaned word stems
sentences = []

for review in data['review']:
    sentences += review_to_sentences(review, tokenizer)

In [None]:
# Check number of sentences collected from all reviews
print (len(sentences))

In [None]:
# Check the first sentence (a list of word stems)
print (sentences[0])

In [None]:
%%time
# Train word vectors on the sentences extracted above
# NOTE(review): `size=` and `init_sims` look like the gensim 3.x API — confirm the pinned gensim version

# Set values for various parameters
num_features   = 300  # word vector dimensionality (passed as size)
min_word_count = 40   # minimum word count (passed as min_count)
num_workers    = 16   # number of threads to run in parallel (passed as workers)
context        = 10   # context window size (passed as window)

# Initialize and train the model 
print ('Training model...')
w2v_model = word2vec.Word2Vec(sentences, workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context)
print('Done !')

# If you don't plan to train the model any further, calling 
# init_sims will make the model much more memory-efficient
w2v_model.init_sims(replace=True)
 
# Save the model for later use. you can load it later using Word2Vec.load()
# model_name = 'w2v_imdb'
# w2v_model.save(model_name)

In [None]:
# Identify the word which is least similar to the other words in a set
w2v_model.doesnt_match("man woman child kitchen".split())

In [None]:
# Same query with place names
# NOTE(review): the training sentences were stemmed by review_to_wordlist, so
# unstemmed words such as 'france' or 'germany' may be missing from the
# vocabulary — confirm the query words exist in the model
w2v_model.doesnt_match("france england germany berlin".split())

In [None]:
# Identify the words which are most similar to a focal word

w2v_model.most_similar("man")

In [None]:
# Words most similar to "queen"
w2v_model.most_similar("queen")

In [None]:
# Words most similar to "bad"
w2v_model.most_similar("bad")

In [None]:
# Corresponding word vector of an example word (looked up on the model itself)

w2v_model['flower']

### Paragraph vectors - Average Word Vectors

Even though the word vectors show good semantic properties, obtaining the same sort of properties for sentences is not straightforward. The simplest solution is to average the word vectors of a document, which yields a paragraph vector with the same dimensionality as the word vectors.

In [None]:
# Calculate average feature vectors for review data, using the functions we defined above.
# NOTE: this rebinds clean_data_reviews; from here on each entry is a list of
# word stems (review_to_wordlist) instead of one cleaned string
clean_data_reviews = []
for review in data['review']:
    clean_data_reviews.append( review_to_wordlist( review ))

# One averaged word vector (num_features values) per review
w2v_features = getAvgFeatureVecs(clean_data_reviews, w2v_model, num_features)

### Classification using Random Forest

In [None]:
%%time

# Define pipeline: a single step named 'rf_clf' holding a random forest classifier
estimators = []
estimators.append(('rf_clf', RandomForestClassifier()))
rf_pipe = Pipeline(estimators)
# Fix the classifier's random_state so runs are repeatable
rf_pipe.set_params(rf_clf__random_state = SEED)

# Setup possible values of parameters to optimize over: 10, 20, 30, 40 and 50 trees
p_grid = {"rf_clf__n_estimators": [int(i) for i in np.linspace(10.0, 50.0, 5)]}

# Nested cross-validation on the averaged word2vec features; n_cores is forwarded as n_jobs
acc_rfW2V, std_rfW2V = nested_cv(X = w2v_features, y = target, est_pipe = rf_pipe, p_grid = p_grid, p_score = SCORE, n_cores = -1)

## **Part 5**: Document Vectors - Doc2Vec

Another solution is to learn the review vectors directly. The Paragraph Vector method treats each document as if it were a word itself and learns its vector directly. You can read more about it here: https://cs.stanford.edu/~quocle/paragraph_vector.pdf. Again, the `gensim` library implements this method in the `doc2vec` module.

In [None]:
# Doc2Vec needs each review to be tagged with some sort of ids
# Here we tag each review with the 'id' field
# uid[1:-1] drops the first and last character of the id — presumably the
# literal quote characters kept by quoting=3 when the file was read; verify

tagged_clean_data_reviews = []
for uid, review in zip(data['id'], clean_data_reviews):
    tagged_clean_data_reviews.append(gensim.models.doc2vec.TaggedDocument(words=review, tags=['%s' % uid[1:-1]]))

In [None]:
# An example of tagging: the first tagged review (its words and its tag)

tagged_clean_data_reviews[0]

In [None]:
%%time
# Train paragraph vectors on the tagged reviews
# NOTE(review): `size=` and `init_sims` look like the gensim 3.x API — confirm the pinned gensim version

# Set values for various parameters
num_features   = 300  # Word vector dimensionality (passed as size)
min_word_count = 40   # Minimum word count (passed as min_count)
num_workers    = 4    # Number of threads to run in parallel (passed as workers)
context        = 10   # Context window size (passed as window)

# Initialize and train the model (this will take some time)
print('Training model...')
d2v_model = doc2vec.Doc2Vec(tagged_clean_data_reviews, workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context)
print('Done !')

# If you don't plan to train the model any further, calling 
# init_sims will make the model much more memory-efficient.
d2v_model.init_sims(replace=True)
 
# Save the model for later use. You can load it later using Doc2Vec.load().
# d2v_model_name = 'd2v_imdb'
# d2v_model.save(d2v_model_name)

In [None]:
# Corresponding document vector of a review given its id (the tag assigned above)
d2v_model.docvecs['5814_8']

In [None]:
# Create a set of D2V features of all reviews to be used in down-stream predictions 
# NOTE(review): this binds the docvecs container itself rather than a NumPy
# array — confirm scikit-learn accepts it as X in nested_cv
d2v_features = d2v_model.docvecs
# Look up the entry at integer index 0
d2v_features[0]

In [None]:
# Report the similarity of two reviews (based on cosine similarity), identified by their tags
d2v_model.docvecs.similarity(d1='7759_3', d2='5814_8')

### Classification using Random Forest

In [None]:
%%time

# Define pipeline: a single step named 'rf_clf' holding a random forest classifier
estimators = []
estimators.append(('rf_clf', RandomForestClassifier()))
rf_pipe = Pipeline(estimators)
# Fix the classifier's random_state so runs are repeatable
rf_pipe.set_params(rf_clf__random_state = SEED)

# Setup possible values of parameters to optimize over: 10, 20, 30, 40 and 50 trees
p_grid = {"rf_clf__n_estimators": [int(i) for i in np.linspace(10.0, 50.0, 5)]}

# Nested cross-validation on the doc2vec features; n_cores is forwarded as n_jobs
acc_rfD2V, std_rfD2V = nested_cv(X = d2v_features, y = target, est_pipe = rf_pipe, p_grid = p_grid, p_score = SCORE, n_cores = -1)

## **Part 6**: Classification using a type of Recurrent Neural Network (RNN), the LSTM

Long Short-Term Memory (LSTM) networks are a particular type of RNN optimized for learning long-term dependencies in sequential data such as text. 

In [None]:
# Vectorize features into same size vectors, by assigning a numerical id to each term

# Size of the vocabulary handed to the tokenizer (num_words) and, later, to the embedding layer
max_features = 5000
# NOTE: this rebinds `tokenizer`, which previously held the NLTK punkt sentence tokenizer
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(clean_data_reviews)
# Turn each review into a sequence of integer ids, then pad to a common length
X = tokenizer.texts_to_sequences(clean_data_reviews)
X = pad_sequences(X)

# One-hot encoding of target: one column per distinct sentiment value
Y = pd.get_dummies(data['sentiment']).values

In [None]:
# Shapes of the padded input sequences and of the one-hot encoded target
print('X shape: {}'.format(X.shape))
print('Y shape: {}'.format(Y.shape))

In [None]:
# Build recurrent neural network model. Architecture is taken from
# https://www.kaggle.com/ngyptr/lstm-sentiment-analysis-keras

# Output dimension of the embedding layer
embed_dim = 128
# Number of units of the LSTM layer
lstm_out = 196

def rnn_model():
    """Build and compile the LSTM sentiment classifier.

    The network stacks an embedding layer, spatial dropout, one LSTM layer and
    a two-unit softmax output. It reads the module-level names
    ``max_features``, ``embed_dim``, ``lstm_out`` and ``X`` (for the input
    length) and returns the compiled Keras model.
    """
    # Layers are created in the same order in which they are stacked
    layers = [
        Embedding(max_features, embed_dim, input_length = X.shape[1]),
        SpatialDropout1D(0.4),
        LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2),
        Dense(2, activation='softmax'),
    ]

    network = Sequential()
    for layer in layers:
        network.add(layer)

    network.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])

    return network

# Instantiate the network and print its layer summary
model = rnn_model()
model.summary()

In [None]:
# Render the model graph (with tensor shapes) as an SVG image inside the notebook
SVG(model_to_dot(model,show_shapes=True).create(prog='dot', format='svg'))

In [None]:
# Split data into train and test sets: 20% test, seeded, stratified on the target

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = SEED, stratify = Y)
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

In [None]:
%%time
# Train the model (needs approx. 1 hour)
batch_size = 100
epochs     = 5 
# Set TRAIN to True to fit and save the network; otherwise the saved model file is loaded
TRAIN      = False

if TRAIN:
    model.fit(X_train, Y_train, epochs = epochs, batch_size = batch_size)
    model.save('rnn.h5')
else:
    # Replaces the freshly built model with the one stored in 'rnn.h5'
    model = load_model('rnn.h5')

In [None]:
# Evaluate model performance in terms of loss and accuracy on the test split
score, acc_RNN = model.evaluate(X_test, Y_test, batch_size = batch_size)
print("Loss: %.4f" % (score))
print("Accuracy: %.4f" % (acc_RNN))

In [None]:
# Rebuild the tokenizer on the cleaned reviews (same settings as when the
# model inputs were created) so new text can be mapped to the same integer ids
max_features = 5000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(clean_data_reviews)

In [None]:
# Assess sentiment of a new review
rvw = ['a good movie but not excellent.']

# Apply the same cleaning as for the training texts (markup and non-letters
# removed, stop words dropped, words stemmed). The tokenizer was fitted on
# stemmed tokens, so raw words such as 'movie' would otherwise not be found.
rvw = [review_to_wordlist(text) for text in rvw]

# Vectorizing the review
rvw = tokenizer.texts_to_sequences(rvw)

# Pad the review to the sequence length the network expects. The length is
# read from the model instead of being hard-coded; the width of X is the
# fallback when the model does not report a fixed length.
seq_len = model.input_shape[1] or X.shape[1]
rvw = pad_sequences(rvw, maxlen=seq_len, dtype='int32', value=0)

# Predict sentiment of the review: one probability per class
sentiment = model.predict(rvw, batch_size=1, verbose = 2)[0]

print()
print('NEGATIVE - POSITIVE')
print(sentiment)
print()

# Column 0 is the negative class, column 1 the positive class
predicted_class = np.argmax(sentiment)
if predicted_class == 0:
    print("negative")
    
elif predicted_class == 1:
    print("positive")

## **SUMMARY OF ACCURACY SCORES**

In [None]:
# Summary table: one row per model with its accuracy and spread
width       = 50
models      = ['Baseline', 'Random Forest', 'Random Forest + word2vec', 'Random Forest + doc2vec', 'RNN']
result_acc  = [0.5, acc_rf, acc_rfW2V, acc_rfD2V, acc_RNN]
# The RNN was evaluated on a single split, so it has no spread (NaN)
result_std  = [0,   std_rf, std_rfW2V, std_rfD2V, np.nan]

print('', '=' * width, '\n', 'Summary of Accuracy Scores'.center(width), '\n', '=' * width)  

# Walk the three parallel lists in lockstep instead of indexing them
for model_name, accuracy, spread in zip(models, result_acc, result_std):
    print(model_name.center(width-18), '{0:.4f}'.format(accuracy), '+/-{0:.4f}'.format(spread))

## **Part 7**: Discussion

- Considering the TFIDF representation, why can cosine distance be a better measure of document similarity than Euclidean distance?
- Despite the good performance of W2V vectors in detecting similarity and relationships between two words, why are similar methods not as powerful when it comes to detecting similarity and relationships between two documents?