# Sentiment analysis with TensorFlow


## [TensorFlow 2 in Action ](https://www.manning.com/books/tensorflow-in-action)

* Link: https://www.manning.com/books/tensorflow-in-action

### Topics covered:

* All popular deep learning models like CNNs, GRUs, LSTMs and Transformers
* Computer vision tasks like image classification and segmentation
* NLP tasks like sentiment analysis, language modelling, machine translation
* Creating complex data pipelines
* MLOps: Monitoring and deploying your models

<table align="center">
    <td>
        <img src="book.png" />
    </td>
</table>

Use the discount code **twitgane40** to get 40% off TensorFlow in Action

In [1]:
import tensorflow as tf
#import tensorflow_hub as hub
import requests
print(tf.__version__)
import zipfile
import os
import time
import pandas as pd
import random
import shutil
import os
import tensorflow.keras.backend as K
import numpy as np
import pickle
from tensorflow.keras.models import load_model, Model
import matplotlib.pyplot as plt
from functools import partial
import nltk

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except:
        print("Couldn't set memory_growth")
        pass
    
    
def fix_random_seed(seed):
    """ Setting the random seed of various libraries """
    try:
        np.random.seed(seed)
    except NameError:
        print("Warning: Numpy is not imported. Setting the seed for Numpy failed.")
    try:
        tf.random.set_seed(seed)
    except NameError:
        print("Warning: TensorFlow is not imported. Setting the seed for TensorFlow failed.")
    try:
        random.seed(seed)
    except NameError:
        print("Warning: random module is not imported. Setting the seed for random failed.")

# Fixing the random seed
random_seed=4321
fix_random_seed(random_seed)

2.2.1


## Downloading data

In [2]:
# Downloading the data
# http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Video_Games_5.json.gz

import os
import requests
import gzip
import shutil

# Retrieve the data
if not os.path.exists(os.path.join('data','Video_Games_5.json.gz')):
    url = "http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Video_Games_5.json.gz"
    # Get the file from web
    r = requests.get(url)

    if not os.path.exists('data'):
        os.mkdir('data')
    
    # Write to a file
    with open(os.path.join('data','Video_Games_5.json.gz'), 'wb') as f:
        f.write(r.content)
else:
    print("The tar file already exists.")
    
if not os.path.exists(os.path.join('data', 'Video_Games_5.json')):
    with gzip.open(os.path.join('data','Video_Games_5.json.gz'), 'rb') as f_in:
        with open(os.path.join('data','Video_Games_5.json'), 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
else:
    print("The extracted data already exists")


The tar file already exists.
The extracted data already exists


## Loading the review data

In [3]:
import pandas as pd

# Read the JSON file
review_df = pd.read_json(os.path.join('data', 'Video_Games_5.json'), lines=True, orient='records')
# Select on the columns we're interested in 
review_df = review_df[["overall", "verified", "reviewTime", "reviewText"]]
review_df.head()

Unnamed: 0,overall,verified,reviewTime,reviewText
0,5,True,"10 17, 2015","This game is a bit hard to get the hang of, bu..."
1,4,False,"07 27, 2015",I played it a while but it was alright. The st...
2,3,True,"02 23, 2015",ok game.
3,2,True,"02 20, 2015","found the game a bit too complicated, not what..."
4,5,True,"12 25, 2014","great game, I love it and have played it since..."


## Clearning data

In [4]:
print("Before cleaning up: {}".format(review_df.shape))
review_df = review_df[~review_df["reviewText"].isna()]
review_df = review_df[review_df["reviewText"].str.strip().str.len()>0]
print("After cleaning up: {}".format(review_df.shape))

Before cleaning up: (497577, 4)
After cleaning up: (497419, 4)


# EDA

## Verified reviews vs unverified

In [5]:
review_df["verified"].value_counts()

True     332504
False    164915
Name: verified, dtype: int64

## Star counts

In [6]:
verified_df = review_df.loc[review_df["verified"], :]
verified_df["overall"].value_counts()

5    222335
4     54878
3     27973
1     15200
2     12118
Name: overall, dtype: int64

## Mapping stars to positive and negative labels

In [7]:
# Use pandas map function to map different star ratings to 0/1
verified_df["label"]=verified_df["overall"].map({5:1, 4:1, 3:0, 2:0, 1:0})
verified_df["label"].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


1    277213
0     55291
Name: label, dtype: int64

## Shuffle the data

In [8]:
# We are sampling 100% of the data in a random fashion, leading to a shuffled dataset
verified_df = verified_df.sample(frac=1.0, random_state=random_seed)

# Splint the data to inputs (inputs) and targets (labels)
inputs, labels = verified_df["reviewText"], verified_df["label"]

## Preprocessing the text


In [9]:
import nltk
# We need to download several nltk artefacts to perform the preprocessing
nltk.download('stopwords', download_dir='nltk')
nltk.download('punkt', download_dir='nltk')
nltk.data.path.append(os.path.abspath('nltk'))

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import string

# Define the English stopwords
EN_STOPWORDS = set(stopwords.words('english')) - {'not', 'no'}

# Apply the transformation to the full text
# this is time consuming
print("\nProcessing all the review data ...")
inputs = inputs.str.lower()

# Remove punctuation 
inputs = inputs.str.replace('[{}]'.format(string.punctuation), '')
inputs = inputs.str.replace("n\'t ", " not ")
inputs = inputs.str.replace(r"(?:\'ll |\'re |\'d |\'ve )", " ")
inputs = inputs.str.replace(r"/d+","")

stopwords_regex = "(?:" + " | ".join(EN_STOPWORDS) + ")"
inputs = inputs.str.replace(stopwords_regex, " ")
inputs = inputs.str.split()

print("Sample data")
print(inputs.head(n=25))

[nltk_data] Downloading package stopwords to nltk...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to nltk...
[nltk_data]   Package punkt is already up-to-date!



Processing all the review data ...
Sample data
122143    [worked, perfectly, wii, gamecube, no, issues,...
444818    [loved, game, the, collectibles, came, it, wel...
79331     [its, okay, game, be, honest, am, bad, these, ...
97250                       [excellent, product, described]
324411    [the, level, detail, great, can, feel, love, c...
427782           [buy, game, buy, vita, buy, game, amazing]
200896    [excellent, product, very, useful, 100, recomm...
394648    [these, great, amiibos, add, your, collection,...
480002    [first, zenses, game, bought, got, other, two,...
194630                                         [nice, game]
122215    [awesome, memory, card, worked, great, both, w...
278172    [ive, started, playing, believe, is, great, ga...
233269    [this, game, cool, alot, like, remembered, som...
299538    [this, might, seem, interesting, you, bought, ...
229458    [short, had, movement, issues, not, bad, could...
34481     [i, dont, actually, believe, writing, revi

## Print some data

In [10]:
for actual, clean in zip(verified_df["reviewText"].iloc[:5], inputs.iloc[:5]):
    print("Actual: {}".format(actual))
    print("Clean: {}".format(clean))
    print('\n')

Actual: Worked perfectly on Wii and Gamecube.
No issues with compatibility or loss of memory.
Clean: ['worked', 'perfectly', 'wii', 'gamecube', 'no', 'issues', 'compatibility', 'loss', 'memory']


Actual: Loved the game and the other collectibles that came with it are well made.  The mask is big and it almost fits my face so that was impressive.
Clean: ['loved', 'game', 'the', 'collectibles', 'came', 'it', 'well', 'made', 'mask', 'big', 'it', 'almost', 'fits', 'face', 'that', 'impressive']


Actual: It's an okay game, to be honest, I am very bad at these types of games and to me-- it's very difficult! I am always dying, which depresses me. Maybe if I had more skill I would enjoy this game more!
Clean: ['its', 'okay', 'game', 'be', 'honest', 'am', 'bad', 'these', 'types', 'games', 'to', 'its', 'difficult', 'am', 'always', 'dying', 'depresses', 'maybe', 'i', 'more', 'skill', 'would', 'enjoy', 'game', 'more']


Actual: Excellent product as described
Clean: ['excellent', 'product', 'descri

## Splitting training/validation/testing data

- Refer presentation for the detailed explanation

In [11]:
def train_valid_test_split(inputs, labels, train_fraction=0.8):
    """ Splits a given dataset into three sets; training, validation and test """    
    
    # Separate indices of negative and positive data points
    neg_indices = pd.Series(labels.loc[(labels==0)].index)
    pos_indices = pd.Series(labels.loc[(labels==1)].index)
    
    n_valid = int(min([len(neg_indices), len(pos_indices)]) * ((1-train_fraction)/2.0))
    n_test = n_valid
    
    neg_test_inds = neg_indices.sample(n=n_test, random_state=random_seed)
    neg_valid_inds = neg_indices.loc[~neg_indices.isin(neg_test_inds)].sample(n=n_test, random_state=random_seed)
    neg_train_inds = neg_indices.loc[~neg_indices.isin(neg_test_inds.tolist()+neg_valid_inds.tolist())]
    
    pos_test_inds = pos_indices.sample(n=n_test, random_state=random_seed)
    pos_valid_inds = pos_indices.loc[~pos_indices.isin(pos_test_inds)].sample(n=n_test, random_state=random_seed)
    pos_train_inds = pos_indices.loc[
        ~pos_indices.isin(pos_test_inds.tolist()+pos_valid_inds.tolist())
    ]
    
    tr_x = inputs.loc[neg_train_inds.tolist() + pos_train_inds.tolist()].sample(frac=1.0, random_state=random_seed)
    tr_y = labels.loc[neg_train_inds.tolist() + pos_train_inds.tolist()].sample(frac=1.0, random_state=random_seed)
    v_x = inputs.loc[neg_valid_inds.tolist() + pos_valid_inds.tolist()].sample(frac=1.0, random_state=random_seed)
    v_y = labels.loc[neg_valid_inds.tolist() + pos_valid_inds.tolist()].sample(frac=1.0, random_state=random_seed)
    ts_x = inputs.loc[neg_test_inds.tolist() + pos_test_inds.tolist()].sample(frac=1.0, random_state=random_seed)
    ts_y = labels.loc[neg_test_inds.tolist() + pos_test_inds.tolist()].sample(frac=1.0, random_state=random_seed)
    
    print('Training data: {}'.format(len(tr_x)))
    print('Validation data: {}'.format(len(v_x)))
    print('Test data: {}'.format(len(ts_x)))
    
    return (tr_x, tr_y), (v_x, v_y), (ts_x, ts_y)
    
(tr_x, tr_y), (v_x, v_y), (ts_x, ts_y) = train_valid_test_split(inputs, labels)


Training data: 310388
Validation data: 11058
Test data: 11058


## Analysing the vocabulary

In [12]:
from collections import Counter
# Create a large list which contains all the words in all the reviews
data_list = [w for doc in tr_x for w in doc]

# Create a Counter object from that list
# Counter returns a dictionary, where key is a word and the value is the frequency
cnt = Counter(data_list)

# Convert the result to a pd.Series 
freq_df = pd.Series(list(cnt.values()), index=list(cnt.keys())).sort_values(ascending=False)
# Print most common words
print(freq_df.head(n=10))

# Print summary statistics
print(freq_df.describe())

the      497674
game     315080
a        221368
i        181830
it       133993
you      114603
this     114250
not      114052
great     96708
like      92532
dtype: int64
count    151379.000000
mean         84.342458
std        2093.623140
min           1.000000
25%           1.000000
50%           1.000000
75%           4.000000
max      497674.000000
dtype: float64


## Vocabulary size

In [13]:
n_vocab = (freq_df >= 25).sum()
print("Using a vocabulary of size: {}".format(n_vocab))

Using a vocabulary of size: 14513


## Analysing the sequence length

In [14]:
# Create a pd.Series, which contain the sequence length for each review
seq_length_ser = tr_x.str.len()

# Get the median as well as summary statistics of the sequence length
print("\nSome summary statistics")
print("Median length: {}\n".format(seq_length_ser.median()))
print(seq_length_ser.describe())

print("\nComputing the statistics between the 10% and 90% quantiles (to ignore outliers)")
p_10 = seq_length_ser.quantile(0.1)
p_90 = seq_length_ser.quantile(0.9)

print(seq_length_ser[(seq_length_ser >= p_10) & (seq_length_ser < p_90)].describe())


Some summary statistics
Median length: 15.0

count    310388.000000
mean         41.134570
std          92.247816
min           0.000000
25%           4.000000
50%          15.000000
75%          38.000000
max        3723.000000
Name: reviewText, dtype: float64

Computing the statistics between the 10% and 90% quantiles (to ignore outliers)
count    254829.000000
mean         21.354069
std          20.845726
min           2.000000
25%           5.000000
50%          15.000000
75%          29.000000
max          94.000000
Name: reviewText, dtype: float64


## Sequence length

In [15]:
n_seq_mid = 15
n_seq_max = 30

## [TensorFlow 2 in Action ](https://www.manning.com/books/tensorflow-in-action)

* Link: https://www.manning.com/books/tensorflow-in-action

### Topics covered:

* All popular deep learning models like CNNs, GRUs, LSTMs and Transformers
* Computer vision tasks like image classification and segmentation
* NLP tasks like sentiment analysis, language modelling, machine translation
* Creating complex data pipelines
* MLOps: Monitoring and deploying your models

<table align="center">
    <td>
        <img src="book.png" />
    </td>
</table>

Use the discount code **twitgane40** to get 40% off TensorFlow in Action

## Defining a Tokenizer

In [16]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Define a tokenizer that will convert words to IDs
# words that are less frequent will be replaced by 'unk'
tokenizer = Tokenizer(num_words=n_vocab, oov_token='unk', lower=False)

# Fit the tokenizer on the data
tokenizer.fit_on_texts(tr_x.tolist())


## Checking the attributes of the tokenizer

In [17]:
# Checking the attributes of the tokenizer
word = "game"
wid = tokenizer.word_index[word]
print("The word id for \"{}\" is: {}".format(word, wid))
wid = 4
word = tokenizer.index_word[wid]
print("The word for id {} is: {}".format(wid, word))

The word id for "game" is: 3
The word for id 4 is: a


## Let's convert some text

In [18]:
# Convert words to IDs
test_text = [
    ['work', 'perfectly', 'wii', 'gamecube', 'issue', 'compatibility', 'loss', 'memory'],
    ['loved', 'game', 'collectible', 'come', 'well', 'make', 'mask', 'big', 'almost', 'fit', 'face', 'impressive'],
    ["'s", 'okay', 'game', 'honest', 'bad', 'type', 'game', '--', "'s", 'difficult', 'always', 'die', 'depresses', 'maybe', 'skill', 'would', 'enjoy', 'game'],
    ['excellent', 'product', 'describe'],
    ['level', 'detail', 'great', 'feel', 'love', 'car', 'game']
]

test_seq = tokenizer.texts_to_sequences(test_text)

for text, seq in zip(test_text, test_seq):
    print("Text: {}".format(text))
    print("Sequence: {}".format(seq))
    print("\n")

Text: ['work', 'perfectly', 'wii', 'gamecube', 'issue', 'compatibility', 'loss', 'memory']
Sequence: [87, 359, 115, 692, 389, 2355, 2830, 537]


Text: ['loved', 'game', 'collectible', 'come', 'well', 'make', 'mask', 'big', 'almost', 'fit', 'face', 'impressive']
Sequence: [213, 3, 4419, 219, 30, 72, 3237, 189, 255, 324, 922, 1321]


Text: ["'s", 'okay', 'game', 'honest', 'bad', 'type', 'game', '--', "'s", 'difficult', 'always', 'die', 'depresses', 'maybe', 'skill', 'would', 'enjoy', 'game']
Sequence: [1, 690, 3, 1463, 137, 400, 3, 1, 1, 335, 184, 716, 1, 354, 898, 26, 185, 3]


Text: ['excellent', 'product', 'describe']
Sequence: [140, 84, 2330]


Text: ['level', 'detail', 'great', 'feel', 'love', 'car', 'game']
Sequence: [167, 897, 10, 97, 31, 644, 3]




## Convert training/validation/test data to word IDs

In [19]:
tr_x_short = tr_x[(tr_x.str.len()<n_seq_mid)]
tr_x_long = tr_x[(tr_x.str.len()>=n_seq_mid)]

# Training labels separated to short and long
tr_y_short = tr_y[(tr_x.str.len()<n_seq_mid)]
tr_y_long = tr_y[(tr_x.str.len()>=n_seq_mid)]

# Convert all of train/validation/test data to sequences of IDs
tr_x_short_seq = tokenizer.texts_to_sequences(tr_x_short.tolist())
tr_x_long_seq = tokenizer.texts_to_sequences(tr_x_long.tolist())

v_x_seq = tokenizer.texts_to_sequences(v_x.tolist())
ts_x_seq = tokenizer.texts_to_sequences(ts_x.tolist())



## Padding to a fixed length

- Refer to the presentation for more details

In [20]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

tr_x_short_pad = pad_sequences(
    tr_x_short_seq, maxlen=n_seq_mid, padding='pre',truncating='post'
)
print('Training (Short): {}'.format(tr_x_short_pad.shape))

tr_x_long_pad = pad_sequences(
    tr_x_long_seq, maxlen=n_seq_max, padding='pre', truncating='post'
)
print('Training (Long): {}'.format(tr_x_long_pad.shape))

print('Sample training data')
print(tr_x_short_pad[:5])

v_x_pad = pad_sequences(v_x_seq, maxlen=n_seq_max, padding='pre', truncating='post')
print('\nValid: {}'.format(v_x_pad.shape))

ts_x_pad = pad_sequences(ts_x_seq, maxlen=n_seq_max, padding='pre', truncating='post')
print('Test: {}'.format(ts_x_pad.shape))


Training (Short): (148470, 15)
Training (Long): (161918, 30)
Sample training data
[[  0   0   0   0   0   0   0   0   0   0  63  64 416  25 800]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0  15]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0  15]
 [  0   0   0   0   0  92  14 225 155  54 104  14 132   1 175]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0  10]]

Valid: (11058, 30)
Test: (11058, 30)


## Defining the model

- Refer the presentation for detailed visualizations of the model and its components

In [21]:
import tensorflow.keras.backend as K

K.clear_session()

model = tf.keras.models.Sequential([
    # Create a mask to mask out zero inputs
    #tf.keras.layers.Masking(mask_value=0.0, input_shape=(None,)),    
    # Adding an Embedding layer
    tf.keras.layers.Embedding(input_dim=n_vocab+1, output_dim=128, 
                              mask_zero=True, 
                              input_shape=(None,)),
    # Defining an LSTM layer
    tf.keras.layers.LSTM(128, return_state=False, return_sequences=False),
    # Defining Dense layers
    tf.keras.layers.Dense(512, activation='relu'),
    # Defining a dropout layer
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 128)         1857792   
_________________________________________________________________
lstm (LSTM)                  (None, 128)               131584    
_________________________________________________________________
dense (Dense)                (None, 512)               66048     
_________________________________________________________________
dropout (Dropout)            (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 513       
Total params: 2,055,937
Trainable params: 2,055,937
Non-trainable params: 0
_________________________________________________________________


## Training the model

In [84]:
neg_weight = (tr_y==1).sum()/(tr_y==0).sum()
print("Will be using a weight of {} for negative samples".format(neg_weight))

Will be using a weight of 6.017113919471887 for negative samples


In [85]:
epochs = 3
batch_size = 128

n_short, n_long = tr_x_short_pad.shape[0], tr_x_long_pad.shape[0]

short_ratio = n_short/(n_short + n_long)

for e in range(epochs):
    
    print("Epoch {}".format(e+1))
    n_iter = (n_short + n_long)//batch_size
    
    for i in range(n_iter):
        
        if np.random.uniform(0,1) <= short_ratio:
            # pick a short batch
            batch_ids = np.random.randint(0, n_short, size=[batch_size])
            x = tr_x_short_pad[batch_ids, :]
            y = tr_y_short.iloc[batch_ids]
        else:
            # pick a long batch
            batch_ids = np.random.randint(0, n_long, size=[batch_size])
            x = tr_x_long_pad[batch_ids, :]
            y = tr_y_long.iloc[batch_ids]
            
        model.train_on_batch(x, y, class_weight={0:neg_weight, 1:1.0})
    
    print("\tEvaluating...")
    res = model.evaluate(v_x_pad, v_y)


Epoch 1
	Evaluating...
Epoch 2
	Evaluating...
Epoch 3
	Evaluating...


## Testing the model

## [TensorFlow 2 in Action ](https://www.manning.com/books/tensorflow-in-action)

* Link: https://www.manning.com/books/tensorflow-in-action

### Topics covered:

* All popular deep learning models like CNNs, GRUs, LSTMs and Transformers
* Computer vision tasks like image classification and segmentation
* NLP tasks like sentiment analysis, language modelling, machine translation
* Creating complex data pipelines
* MLOps: Monitoring and deploying your models



<table align="center">
    <td>
        <img src="book.png" />
    </td>
</table>

Use the discount code **twitgane40** to get 40% off TensorFlow in Action