<a href="https://colab.research.google.com/github/fork52/Sentiment-Analyzer/blob/master/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##### Copyright 2020 Yash Bafna

In [None]:
#@title MIT License
#
# Copyright (c) 2020 Yash Bafna
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.

## Download the dataset

In [None]:
# -nc (no-clobber): skip the download when the archive is already present, so re-running the notebook does not fetch it again or leave a ".1" duplicate
!wget -nc http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Apps_for_Android_5.json.gz

## Import Necessary Libraries


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd
import gzip
from sklearn.model_selection import train_test_split
import tensorflow as tf

## Load the data in a dataframe

In [None]:
def parse(path):
  """Yield one record per non-blank line of a gzip-compressed text file.

  Each line is expected to hold a single Python/JSON-style literal
  (the caller treats every yielded value as a dict).

  Args:
    path: location of the ``.gz`` file to read.

  Yields:
    The literal value parsed from each line.

  Raises:
    ValueError / SyntaxError: if a line is not a plain literal.
  """
  # Local import so this cell does not depend on edits to the imports cell.
  import ast

  # Text mode + context manager: lines arrive as str (which literal_eval
  # requires) and the file handle is closed once the generator finishes.
  with gzip.open(path, 'rt', encoding='utf-8') as g:
    for l in g:
      if not l.strip():
        continue  # tolerate blank lines instead of failing on them
      # literal_eval only accepts literals, so downloaded data cannot run
      # arbitrary code the way eval() would allow.
      yield ast.literal_eval(l)

def getDF(path):
  """Collect every record yielded by ``parse(path)`` into a DataFrame.

  Records are keyed by their 0-based position in the file, and that
  position becomes the row index of the returned frame.
  """
  numbered_records = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(numbered_records, orient='index')

# Read the downloaded review archive into `df` and report how many rows it holds
df = getDF('reviews_Apps_for_Android_5.json.gz')
print('No of records:', df.shape[0])

In [None]:
# Preview the first rows of the loaded frame (shown as this cell's output)
df.head()

## Taking a look at a few reviews.
Change the index to see different reviews and their ratings.

In [None]:
# Position of the review to display -- feel free to change it
index = 41170
print('Review:  ', df['reviewText'].iloc[index])
print('Summary:', df['summary'].iloc[index])
print('Ratings:', df['overall'].iloc[index])

## Get Sentences and Get Labels


In [None]:
from pprint import pprint
def get_data(df):
    """Extract review texts and their shifted ratings from ``df``.

    Rows whose ``reviewText`` is not a ``str`` are dropped.

    Args:
        df: frame with a ``reviewText`` column and a numeric ``overall`` column.

    Returns:
        A two-element list ``[texts, labels]`` of equal-length tuples, where
        each label is ``int(overall) - 1`` (presumably so ratings become
        zero-based class indices). When no row survives the filter the
        result is ``[(), ()]`` so callers can still unpack two values.
    """
    reviews = df['reviewText'].tolist()
    labels = [int(score) - 1 for score in df['overall'].tolist()]
    kept = [
        (text, label)
        for text, label in zip(reviews, labels)
        if isinstance(text, str)
    ]
    if not kept:
        # zip(*[]) would yield an empty list and break two-name unpacking.
        return [(), ()]
    return list(zip(*kept))

# Separate sentences and ratings.
# The train-test split cell reads `sentences` and `ratings`, so bind those names here;
# without them a fresh-kernel run fails with NameError.
sentences, ratings = get_data(df)
fsentences, fratings = sentences, ratings  # original names, kept as aliases

## Train-Test split

In [None]:
# Split the dataset into training and testing sets: 5% is held out for testing,
# and the fixed random_state keeps the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    sentences,
    ratings,
    test_size=0.05,
    random_state=42,
)

print('No. of sentences in Training Set:', len(X_train))
print('No. of sentences in Testing Set:', len(X_test))

## Tokenize the string and perform padding on the string


In [None]:
# Hyperparameters shared by the tokenizer, the padding step and the model
vocab_size = 20000      # given to the Tokenizer as num_words and to the Embedding layer
embedding_dim = 32      # second argument of the Embedding layer
max_length = 220        # maxlen used when padding/truncating sequences
padding_type = 'post'   # `padding` argument of pad_sequences
trunc_type = 'post'     # `truncating` argument of pad_sequences
oov_tok = "<OOV>"       # token handed to the Tokenizer as oov_token

# Build the vocabulary from the training sentences only
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

## Convert the input sentences to padded index sequences of length `max_length` each.

In [None]:
def prepare_input(X):
    '''Turn raw text into padded integer sequences for the model.

    Args:
        X: a collection of strings, or a single string (treated as a
           one-element collection).

    Returns:
        The result of ``pad_sequences``: the tokenized texts padded or
        truncated to ``max_length`` using ``padding_type`` / ``trunc_type``.
    '''
    if isinstance(X, str):
        # A bare string would be iterated character by character by
        # texts_to_sequences, giving one bogus "sentence" per character.
        X = [X]
    sequences = tokenizer.texts_to_sequences(X)
    return pad_sequences(
        sequences,
        maxlen=max_length,
        truncating=trunc_type,
        padding=padding_type,
    )

# Prepare the training and testing data
train_sent, testing_sent = prepare_input(X_train), prepare_input(X_test)

## Defining a model

In [None]:
# Embedding -> two stacked bidirectional LSTMs -> dense head with a 5-way softmax
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        input_length=max_length,
        name='Embed_Layer',
    ),
    # return_sequences=True so the next recurrent layer receives the whole sequence
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(64, return_sequences=True),
        name='BiLSTM_Layer1',
    ),
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(32),
        name='BiLSTM_Layer2',
    ),
    tf.keras.layers.Dense(128, activation='relu', name='Dense_Layer1'),
    tf.keras.layers.Dropout(0.3, name='Dropout_layer'),
    tf.keras.layers.Dense(5, activation='softmax', name='Output_Layer'),
])

# Sparse loss: the labels are fed as integer arrays, not one-hot vectors
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

## Training the model

Training may take a while, depending on the number of epochs.


In [None]:
num_epochs = 5

# Fit on the padded training sequences; the held-out split is passed as validation data
model.fit(
    train_sent,
    np.array(y_train),
    validation_data=(testing_sent, np.array(y_test)),
    epochs=num_epochs,
    batch_size=1000,
)

## Try the model on new sentences

In [None]:
# Score a hand-written sentence with the trained model
new_sent = ['The product pretty good and I love it.']
new_data = prepare_input(new_sent)
# argmax picks the most probable class index; +1 undoes the -1 applied to ratings in get_data
rating = model.predict(new_data).argmax() + 1
print('Your sentence:', new_sent[0])
print('Rating given by model is:', rating)