# Case Study -- NLP Amazon Reviews (code)

### Version 0.1
#### Simon Yang
last update: 25th September 2021

### Import libraries and dependant data

In [1]:
import nltk
import sys
import json
from matplotlib import pyplot as plt
import time
import pandas as pd
import numpy as np
import string
import fasttext
import contractions
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import ne_chunk, pos_tag
from nltk.tree import Tree
import tensorflow as tf
from transformers import DistilBertTokenizerFast
from transformers import TFDistilBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.utils import shuffle
import pickle

### Download Amaazon review data (optional)
For this script to be fully self-contained, the data can be pulled from the web, here. 
For the purpose of this exercise, the data was pulled with wget and uncompressed to './data/'.

In [2]:
# import urllib
# todownload = 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Apps_for_Android_5.json.gz'
# urllib.urlretrieve("todownload", "reviews_Apps_for_Android_5.json.gz")

### Read data

In [3]:
amzr = []
for line in open('./data/reviews_Apps_for_Android_5.json', 'r'):
    amzr.append(json.loads(line))

### Repackage data into pandas dataframe

##### First we retrieve all possible unique user key to define our table's columns

In [4]:
columns = set([i for r in amzr for i in r.keys()])
# Create dictionarry -- this will be used to create our DataFrame
amzr_df = {c:[] for c in columns}

##### Let's fill our table with data

In [5]:
for row in amzr: # loop through data and append to dict
    for key in columns:
        amzr_df[key].append(row.get(key))
amzr_df = pd.DataFrame(amzr_df)

### Clean-up data:
1. remove empty review texts
2. remove dupplicates

In [6]:
amzr_clean = amzr_df[~amzr_df['reviewText'].isnull()]
amzr_clean.drop_duplicates(subset=['reviewerID', 'asin', 'unixReviewTime'],inplace=True)
amzr_clean = amzr_clean.reset_index()

### Retreive helpfullness and bin into categories:
- categories are from 1 to 5, i.e, from least helpfull to most helpfull
- "None" catogory indicates no helpfulness rating

In [7]:
amzr_clean['helpful_num'] = amzr_clean['helpful'].apply(lambda x: x[0])
amzr_clean['helpful_den'] = amzr_clean['helpful'].apply(lambda x: x[1])
amzr_clean['helpful_pct'] = np.where(amzr_clean['helpful_den'] > 0,
                                  amzr_clean['helpful_num'] / amzr_clean['helpful_den'], -1)
amzr_clean['helpfulness'] = pd.cut(x=amzr_clean['helpful_pct'], bins=[-1, 0, 0.2, 0.4, 0.6, 0.8, 1.0],
                                         labels=['None', '1', '2', '3', '4', '5'], include_lowest=True)
amzr_clean['sentiment'] = pd.cut(x=amzr_clean['overall'], bins=[-1, 1.5, 3.5, 6],
                                         labels=[0,1,2], include_lowest=True)
amzr_clean = amzr_clean.drop(columns=['helpful_num','helpful_pct'])

We now have the follwing table: 

In [8]:
display(amzr_clean)

Unnamed: 0,index,asin,reviewerID,unixReviewTime,summary,reviewTime,reviewerName,reviewText,helpful,overall,helpful_den,helpfulness,sentiment
0,0,B004A9SDD8,A1N4O8VOJZTDVB,1383350400,Really cute,"11 2, 2013",Annette Yancey,"Loves the song, so he really couldn't wait to ...","[1, 1]",3.0,1,5,1
1,1,B004A9SDD8,A2HQWU6HUKIEC7,1323043200,2-year-old loves it,"12 5, 2011","Audiobook lover ""Kathy""","Oh, how my little grandson loves this app. He'...","[0, 0]",5.0,0,,2
2,2,B004A9SDD8,A1SXASF6GYG96I,1337558400,Fun game,"05 21, 2012",Barbara Gibbs,I found this at a perfect time since my daught...,"[0, 0]",5.0,0,,2
3,3,B004A9SDD8,A2B54P9ZDYH167,1354752000,We love our Monkeys!,"12 6, 2012","Brooke Greenstreet ""Babylove""",My 1 year old goes back to this game over and ...,"[3, 4]",5.0,4,4,2
4,4,B004A9SDD8,AFOFZDTX5UC6D,1391212800,This is my granddaughters favorite app on my K...,"02 1, 2014",C. Galindo,There are three different versions of the song...,"[1, 1]",5.0,1,5,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
752932,752932,B00LUEMK44,AZJ11YS0E52AI,1405814400,Five Stars,"07 20, 2014",K. Perna,I love it!!!!!!!!!! really keeps your attenti...,"[0, 0]",5.0,0,,2
752933,752933,B00LUEMK44,A2550XGZEFDH2Y,1405900800,... are so many free ones that are so much bet...,"07 21, 2014","Melanie G. Nihart ""Grammy""",Okay but there are so many free ones that are ...,"[0, 0]",3.0,0,,1
752934,752934,B00LUEMK44,A1KNDB16TG5QXD,1405900800,Enjoyable,"07 21, 2014",P. O'Reilly,Another great jewels game that just keeps you ...,"[0, 0]",4.0,0,,2
752935,752935,B00LUEMK44,A1IHFHA5LI9SGI,1405814400,entertaining,"07 20, 2014",redhatflusher,I find this the best jewels star ever. There s...,"[0, 0]",5.0,0,,2


### Text processing
We create a function that preprocesses the reviews with the following steps:
- remove contractions
- make lower-case
- remove punctuations
- lemmatization
We do not remove stop-words since the list contains many negative sentiment indicators. 

In [9]:
def process_text(df_in, cols=['reviewText','summary']):
    df =  df_in.copy()
    # Columns to drop (intermediate processed strings) 
    cols_ext_todrop = ['nocontract','nocontract_str','tokenized','lower','no_punc','pos_tags','wordnet_pos']
    stop_words = set(stopwords.words('english')) - {'not','no','nor',"aren't","isn't"}
    # Remove contractions
    for c in cols:
        df['nocontract'] = df[c].apply(lambda x: [contractions.fix(word) for word in x.split()])
        df['nocontract_str'] = [' '.join(map(str, l)) for l in df['nocontract']]
        # Tokenize
        df['tokenized'] = df['nocontract_str'].apply(word_tokenize)
        # make lower-case
        df['lower'] = df['tokenized'].apply(lambda x: [word.lower() for word in x])
        # remove punctuation
        punc = string.punctuation
        df['no_punc'] = df['lower'].apply(lambda x: [word for word in x if word not in punc])
#         # remove stop-words
#         df['stopwords_removed'] = df['no_punc'].apply(lambda x: [word for word in x if word not in stop_words])
        # tag words
        df['pos_tags'] = df['no_punc'].apply(nltk.tag.pos_tag)
        # lemmatize
        def get_wordnet_pos(tag):
            if tag.startswith('J'):
                return wordnet.ADJ
            elif tag.startswith('V'):
                return wordnet.VERB
            elif tag.startswith('N'):
                return wordnet.NOUN
            elif tag.startswith('R'):
                return wordnet.ADV
            else:
                return wordnet.NOUN
        df['wordnet_pos'] = df['pos_tags'].apply(lambda x: [(word, get_wordnet_pos(pos_tag)) for (word, pos_tag) in x])
        wnl = WordNetLemmatizer()
        df[c+'_processed'] = df['wordnet_pos'].apply(lambda x: " ".join([wnl.lemmatize(word, tag) for word, tag in x]))
        # drop intermediate columns
        df=df.drop(columns=cols_ext_todrop)
    return df

### Inspect Helpfulness

### Using helpulness to filter the training data
1. use the data as is
2. filter for helpful reviews where helfulness has been assesed by at least 5 people

In [11]:
nfilt = 5 # number of ratings threshold
amzr_keep_helful = amzr_clean[(amzr_clean['helpfulness'].isin(['3','4','5'])) & (amzr_clean['helpful_den'].astype(int) >= nfilt)]

### Sample data for training and testing (balanced and imbalanced)
Because of ressource limitation we sample our data. 

We sample the data without replacement to create a training set and a testing set.

Note that the text processing is performed here with the function defined above

In [13]:
n_train=100 # training sample
n_test=100 # testing sample

Case 1 -- all the data

In [15]:
amzr_clean_sample = process_text(amzr_clean.sample(n=n_train + n_test, replace=False, random_state=1))
amzr_test_sample = amzr_clean_sample.iloc[n_train:]
amzr_clean_sample = amzr_clean_sample.iloc[:n_train]


Case 2 -- only sample helpfull reviews

In [17]:
amzr_keep_helful[~amzr_keep_helful.isin(amzr_test_sample)].dropna() # remove rows from test sample

amzr_keep_helful_sample = process_text(amzr_keep_helful.sample(n=n_train + n_test, replace=False, random_state=1))
amzr_keep_helful_test_sample = amzr_keep_helful_sample.iloc[n_train:]
amzr_keep_helful_sample = amzr_keep_helful_sample.iloc[:n_train]

Case 3 -- all the data (balanced)

In [18]:
amzr_clean[~amzr_clean.isin(amzr_test_sample)].dropna() # remove rows from test sample

g = amzr_clean.groupby('sentiment')
amzr_clean_sample_balanced=shuffle(process_text(pd.DataFrame(g.apply(lambda x: x.sample(int((n_train + n_test)/3),replace=False))))).reset_index(drop=True)
amzr_test_sample_balanced = amzr_clean_sample_balanced.iloc[n_train:]
amzr_clean_sample_balanced = amzr_clean_sample_balanced.iloc[:n_train]

Case 4 -- only sample helpfull reviews (balanced)

In [19]:
amzr_keep_helful[~amzr_keep_helful.isin(amzr_test_sample)].dropna() # remove rows from test sample

g = amzr_keep_helful.groupby('sentiment')
amzr_keep_helful_sample_balanced=shuffle(process_text(pd.DataFrame(g.apply(lambda x: x.sample(int((n_train + n_test)/3),replace=False))))).reset_index(drop=True)
amzr_keep_helful_test_sample_balanced = amzr_keep_helful_sample_balanced.iloc[n_train:]
amzr_keep_helful_sample_balanced = amzr_keep_helful_sample_balanced.iloc[:n_train]

#### Display example training data (Case 1)

In [20]:
amzr_clean_sample_balanced

Unnamed: 0,index,asin,reviewerID,unixReviewTime,summary,reviewTime,reviewerName,reviewText,helpful,overall,helpful_den,helpfulness,sentiment,reviewText_processed,summary_processed
0,321789,B008AZGWYK,A3992OP6NR7AS,1390262400,Not very good,"01 21, 2014",goalie,"Expecting so much more,As with most apps, it i...","[0, 0]",2.0,0,,1,expect so much more a with most apps it be ber...,not very good
1,641856,B00EBQRSTK,A3EPDWEO3OAOGL,1377475200,eh...,"08 26, 2013",emmazmom,Not very challenging. The first levels are ver...,"[0, 0]",3.0,0,,1,not very challenge the first level be very eas...,eh ...
2,449579,B00A3NQNUU,A1VJ6K6ZD3RKMW,1372982400,This game rocks!,"07 5, 2013",Laxdyn,"I love this game, and so does my entire family...","[0, 0]",5.0,0,,2,i love this game and so do my entire family we...,this game rock
3,751949,B00KWZ88N4,A23LAZFDYHSWEO,1403395200,annoying,"06 22, 2014",boreing and hard to control,Its really fun but everytime i try to change m...,"[6, 8]",3.0,8,4,1,it really fun but everytime i try to change my...,annoy
4,224141,B007A2CLGC,A2PL4EQDMGGK7X,1340928000,I used to be a non believer too,"06 29, 2012",MadisonLastrega,I used to think &quot; hey internet access wha...,"[9, 10]",2.0,10,5,1,i use to think quot hey internet access what c...,i use to be a non believer too
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,363041,B008P4XLG8,ALSF01BYJMVUF,1395532800,Fun but frustrating too,"03 23, 2014",JoshDavis,I think to get the most of this game it needs ...,"[0, 0]",3.0,0,,1,i think to get the most of this game it need t...,fun but frustrate too
96,599397,B00D2XD5JU,AFOBKJ9G6NVMG,1378684800,fruit quest,"09 9, 2013",nancy jordan,I cannot figure out how to play this game. The...,"[0, 0]",1.0,0,,0,i can not figure out how to play this game the...,fruit quest
97,570742,B00CHSPB8Y,A3BDWPYY0LNVG7,1368057600,FE MAN 3,"05 9, 2013",Mr Longname,It's an alright app but laggy and awkward cont...,"[17, 18]",4.0,18,5,2,it be an alright app but laggy and awkward con...,fe man 3
98,501656,B00AUTXEL8,AE5Z69043W4MI,1384387200,flag logo quiz,"11 14, 2013","David. Corner ""M D CORNER""",i dont like this game as you have to spell the...,"[0, 1]",1.0,1,,0,i do not like this game a you have to spell th...,flag logo quiz


### Train model

- split data into training and validation set
- encode text reviews with BERT tokenizer
- set-up model as a three-class clasiffhyperparametersier
- set model hyperparameters

In [21]:
def set_up_model(df,features ='reviewText_processed',labels = 'sentiment',max_length=256):
    reviews = df[features].values.tolist()
    labels = df[labels].tolist()
    training_sentences, validation_sentences, training_labels, validation_labels = train_test_split(reviews, labels, test_size=.2)
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
    train_encodings = tokenizer(training_sentences,
                                truncation=True,
                                padding=True,
                                max_length=max_length)
    val_encodings = tokenizer(validation_sentences,
                                truncation=True,
                                padding=True,
                                max_length=max_length)
    train_dataset = tf.data.Dataset.from_tensor_slices((
                                dict(train_encodings),
                                training_labels
                                ))
    val_dataset = tf.data.Dataset.from_tensor_slices((
                                dict(val_encodings),
                                validation_labels
                                ))
    callback = tf.keras.callbacks.EarlyStopping(monitor='loss',patience = 3)
    model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased',num_labels=3)
    optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5, epsilon=1e-08)
    model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])
    return model, train_dataset, val_dataset

In [22]:
def compute_test_accuracy(df,model, features = 'reviewText_processed',labels='sentiment',max_length=256):
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
    test_sentences = df[features].values.tolist()
    test_labels = df[labels].tolist()
    tf_batch = [tokenizer(i, padding=True, truncation=True, max_length=max_length, return_tensors='tf') for i in test_sentences]
    tf_outputs = [model(i) for i in tf_batch]
    tf_predictions = [tf.nn.softmax(i[0], axis=-1) for i in tf_outputs]
    pred_test_labels = [tf.argmax(i, axis=1).numpy()[0] for i in tf_predictions]
    metric = tf.keras.metrics.Accuracy()
    metric.update_state(test_labels,pred_test_labels)
    return metric.result().numpy(),test_labels,pred_test_labels

In [23]:
def read_write_pkl(ppath,oobj=None, opt = 'r'):
        if opt == "w":
            if oobj is None:
                print("Please specify an object to save (oobj=object2save")
                return
            # Saving the objects:
            with open(ppath, 'wb') as f:  
                pickle.dump(oobj, f)
        elif opt == "r":
            with open(ppath,'rb') as f: 
                oobj = pickle.load(f) 
            return oobj

#### Train 24 instances of the model with various features and parameters and evaluate performance

In [None]:
train_cases = {'unbalanced':amzr_clean_sample, 'balanced':amzr_clean_sample_balanced,'helpful_balanced':amzr_keep_helful_sample_balanced}
feature_cases = ['reviewText','reviewText_processed','summary','summary_processed']
max_length = [256,512]
batch_size = 4
model_results = {}
for l in max_length:
    for name,t in train_cases.items():
        for f in feature_cases:
            print('running '+'run_'+name+"_"+f+"_"+str(l))
            if f in ['summary','summary_processed']:
                max_length = 64
                batch_size=8
            model, train_dataset, val_dataset= set_up_model(t,max_length=l)
            history = model.fit(train_dataset.shuffle(100).batch(batch_size),
                  epochs=3,
                  batch_size=batch_size,
                  validation_data=val_dataset.shuffle(100).batch(batch_size))
            acc,test_labels,pred_test_labels = compute_test_accuracy(amzr_test_sample,model,max_length=l)
            model_results['run_'+name+"_"+f+"_"+"_"+str(l)]  = {'parameters':{'max_length':l,'batch_size':batch_size,'Case':name},'test_acc':acc,'history_acc':history.history['accuracy'],'history_valacc':history.history['val_accuracy'],'test_labels':test_labels,"pred_test_labels":pred_test_labels}
            model.save_pretrained("./sentiment_run_"+name+"_"+f+"_"+"_"+str(l))
            with open('./out/model_results.pkl', 'wb') as f:  
                pickle.dump(model_results, f)          

running run_unbalanced_reviewText_256


2021-09-27 01:58:54.802632: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2021-09-27 01:58:54.886167: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-09-27 01:58:54.886610: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1561] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: NVIDIA GeForce GTX 1660 computeCapability: 7.5
coreClock: 1.86GHz coreCount: 22 deviceMemorySize: 5.80GiB deviceMemoryBandwidth: 178.86GiB/s
2021-09-27 01:58:54.886669: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2021-09-27 01:58:54.886731: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10
2021-09-27 01:58:54.886774: I tensorflow/stream_executor/plat

Epoch 1/3
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Epoch 2/3
Epoch 3/3
running run_unbalanced_reviewText_processed_256


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_layer_norm', 'vocab_transform', 'vocab_projector', 'activation_13']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier', 'dropout_39', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

Epoch 1/3
Epoch 2/3
Epoch 3/3
running run_unbalanced_summary_256


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_layer_norm', 'vocab_transform', 'vocab_projector', 'activation_13']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier', 'dropout_59', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

Epoch 1/3
Epoch 2/3
Epoch 3/3
running run_unbalanced_summary_processed_256


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_layer_norm', 'vocab_transform', 'vocab_projector', 'activation_13']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier', 'dropout_79', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

Epoch 1/3
