# Predicting Amazon reviews of Android apps

### Version 1.0
#### Simon Yang
last update: 25th September 2021

### Import libraries

In [8]:
import nltk
import sys
import json
from matplotlib import pyplot as plt
import time
# from fuzzysearch import find_near_matches
import pandas as pd
import numpy as np
import string
import fasttext
import contractions
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import ne_chunk, pos_tag
from nltk.tree import Tree

import ntlk data for text clean-up

In [103]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/yangsim/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/yangsim/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/yangsim/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/yangsim/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /home/yangsim/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/yangsim/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

### Download data (optional)
For this script to be fully self-contained, the data would be pulled here. 
For the purpose of this exercise, the data was pulled with wget and uncompressed to './data/'.

In [6]:
# import urllib
# todownload = 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Apps_for_Android_5.json.gz'
# urllib.urlretrieve("todownload", "reviews_Apps_for_Android_5.json.gz")

### Read data (Amazon reviews)

In [22]:
amzr = []
for line in open('./data/reviews_Apps_for_Android_5.json', 'r'):
    amzr.append(json.loads(line))

### repackage data into pandas dataframe

##### First we retrieve all possible unique user key to define our table's columns

In [63]:
columns = set([i for r in amzr for i in r.keys()])
# Create dictionarry -- this will be used to create our DataFrame
amzr_df = {c:[] for c in columns}

##### Let's fill our table with data

In [64]:
for row in amzr: # loop through data and append to dict
    for key in columns:
        amzr_df[key].append(row.get(key))
amzr_df = pd.DataFrame(amzr_df)

In [96]:
amzr_df.head(5)

Unnamed: 0,reviewerID,overall,unixReviewTime,reviewText,asin,reviewTime,reviewerName,helpful,summary
0,A1N4O8VOJZTDVB,3.0,1383350400,"Loves the song, so he really couldn't wait to ...",B004A9SDD8,"11 2, 2013",Annette Yancey,"[1, 1]",Really cute
1,A2HQWU6HUKIEC7,5.0,1323043200,"Oh, how my little grandson loves this app. He'...",B004A9SDD8,"12 5, 2011","Audiobook lover ""Kathy""","[0, 0]",2-year-old loves it
2,A1SXASF6GYG96I,5.0,1337558400,I found this at a perfect time since my daught...,B004A9SDD8,"05 21, 2012",Barbara Gibbs,"[0, 0]",Fun game
3,A2B54P9ZDYH167,5.0,1354752000,My 1 year old goes back to this game over and ...,B004A9SDD8,"12 6, 2012","Brooke Greenstreet ""Babylove""","[3, 4]",We love our Monkeys!
4,AFOFZDTX5UC6D,5.0,1391212800,There are three different versions of the song...,B004A9SDD8,"02 1, 2014",C. Galindo,"[1, 1]",This is my granddaughters favorite app on my K...


### Clean-up data:
#### (1) remove empty review texts
#### (2) remove dupplicates

In [115]:
amzr_clean = amzr_df[~amzr_df['reviewText'].isnull()]
amzr_clean.drop_duplicates(subset=['reviewerID', 'asin', 'unixReviewTime'],inplace=True)
amzr_clean.reset_index()
amzr_clean.head(5)

Unnamed: 0,reviewerID,overall,unixReviewTime,reviewText,asin,reviewTime,reviewerName,helpful,summary
0,A1N4O8VOJZTDVB,3.0,1383350400,"Loves the song, so he really couldn't wait to ...",B004A9SDD8,"11 2, 2013",Annette Yancey,"[1, 1]",Really cute
1,A2HQWU6HUKIEC7,5.0,1323043200,"Oh, how my little grandson loves this app. He'...",B004A9SDD8,"12 5, 2011","Audiobook lover ""Kathy""","[0, 0]",2-year-old loves it
2,A1SXASF6GYG96I,5.0,1337558400,I found this at a perfect time since my daught...,B004A9SDD8,"05 21, 2012",Barbara Gibbs,"[0, 0]",Fun game
3,A2B54P9ZDYH167,5.0,1354752000,My 1 year old goes back to this game over and ...,B004A9SDD8,"12 6, 2012","Brooke Greenstreet ""Babylove""","[3, 4]",We love our Monkeys!
4,AFOFZDTX5UC6D,5.0,1391212800,There are three different versions of the song...,B004A9SDD8,"02 1, 2014",C. Galindo,"[1, 1]",This is my granddaughters favorite app on my K...


### Retreive helpfullness and bin into categories:
- categories are from 1 to 5, i.e, from least helpfull to most helpfull
- "None" catogory indicates no helpfulness rating

In [132]:
amzr_clean['helpful_num'] = amzr_clean['helpful'].apply(lambda x: x[0])
amzr_clean['helpful_den'] = amzr_clean['helpful'].apply(lambda x: x[1])
amzr_clean['helpful_pct'] = np.where(amzr_clean['helpful_den'] > 0,
                                  amzr_clean['helpful_num'] / amzr_clean['helpful_den'], -1)
amzr_clean['helpfulness'] = pd.cut(x=amzr_clean['helpful_pct'], bins=[-1, 0, 0.2, 0.4, 0.6, 0.8, 1.0],
                                         labels=['None', '1', '2', '3', '4', '5'], include_lowest=True)
amzr_clean['sentiment'] = pd.cut(x=amzr_clean['overall'], bins=[-1, 1.5, 3.5, 6],
                                         labels=[1,2,3], include_lowest=True)
amzr_clean = amzr_clean.drop(columns=['helpful_num','helpful_den','helpful_pct'])

In [133]:
amzr_clean.head(5)

Unnamed: 0,reviewerID,overall,unixReviewTime,reviewText,asin,reviewTime,reviewerName,helpful,summary,helpfulness,sentiment
0,A1N4O8VOJZTDVB,3.0,1383350400,"Loves the song, so he really couldn't wait to ...",B004A9SDD8,"11 2, 2013",Annette Yancey,"[1, 1]",Really cute,5.0,2
1,A2HQWU6HUKIEC7,5.0,1323043200,"Oh, how my little grandson loves this app. He'...",B004A9SDD8,"12 5, 2011","Audiobook lover ""Kathy""","[0, 0]",2-year-old loves it,,3
2,A1SXASF6GYG96I,5.0,1337558400,I found this at a perfect time since my daught...,B004A9SDD8,"05 21, 2012",Barbara Gibbs,"[0, 0]",Fun game,,3
3,A2B54P9ZDYH167,5.0,1354752000,My 1 year old goes back to this game over and ...,B004A9SDD8,"12 6, 2012","Brooke Greenstreet ""Babylove""","[3, 4]",We love our Monkeys!,4.0,3
4,AFOFZDTX5UC6D,5.0,1391212800,There are three different versions of the song...,B004A9SDD8,"02 1, 2014",C. Galindo,"[1, 1]",This is my granddaughters favorite app on my K...,5.0,3


### Text processing
- remove contractions
- make lower-case
- remove punctuations
- remove stop words
- lemmatization

In [136]:
def process_text(df_in, cols):
    df =  df_in.copy()
    # Columns to drop (intermediate processed strings) 
    cols_ext_todrop = ['nocontract','nocontract_str','tokenized','lower','no_punc','stopwords_removed','pos_tags','wordnet_pos']
    stop_words = set(stopwords.words('english')) - {'not','no','nor',"aren't","isn't"}
    # Remove contractions
    for c in cols:
        df['nocontract'] = df[c].apply(lambda x: [contractions.fix(word) for word in x.split()])
        df['nocontract_str'] = [' '.join(map(str, l)) for l in df['nocontract']]
        # Tokenize
        df['tokenized'] = df['nocontract_str'].apply(word_tokenize)
        # make lower-case
        df['lower'] = df['tokenized'].apply(lambda x: [word.lower() for word in x])
        # remove punctuation
        punc = string.punctuation
        df['no_punc'] = df['lower'].apply(lambda x: [word for word in x if word not in punc])
        # remove stop-words
        df['stopwords_removed'] = df['no_punc'].apply(lambda x: [word for word in x if word not in stop_words])
        # tag words
        df['pos_tags'] = df['stopwords_removed'].apply(nltk.tag.pos_tag)
        # lemmatize
        def get_wordnet_pos(tag):
            if tag.startswith('J'):
                return wordnet.ADJ
            elif tag.startswith('V'):
                return wordnet.VERB
            elif tag.startswith('N'):
                return wordnet.NOUN
            elif tag.startswith('R'):
                return wordnet.ADV
            else:
                return wordnet.NOUN
        df['wordnet_pos'] = df['pos_tags'].apply(lambda x: [(word, get_wordnet_pos(pos_tag)) for (word, pos_tag) in x])
        wnl = WordNetLemmatizer()
        df[c+'_processed'] = df['wordnet_pos'].apply(lambda x: " ".join([wnl.lemmatize(word, tag) for word, tag in x]))
        # drop intermediate columns
        df=df.drop(columns=cols_ext_todrop)
    return df

### Sample 20% of the data randomly and preprocess reviews and their summaries

In [137]:
amzr_clean_sample = amzr_clean.sample(frac=0.2, replace=False, random_state=1)
start = time.time()
amzr_clean_sample = process_text(amzr_clean_sample, ['reviewText','summary'])
end = time.time()
print(end - start)

208.74141097068787


In [138]:
amzr_clean_sample.head(5)

Unnamed: 0,reviewerID,overall,unixReviewTime,reviewText,asin,reviewTime,reviewerName,helpful,summary,helpfulness,sentiment,reviewText_processed,summary_processed
471454,A1O4ZL0WU2GAGV,5.0,1369526400,"I love this game. It's colorful, imaginative, ...",B00AEJPK0C,"05 26, 2013",Merri Lockerby,"[1, 1]",Very addicting,5.0,3,love game colorful imaginative great replay va...,addict
139042,AXW0GZEQUPBN2,4.0,1358035200,I'm particularly happy to be able to replay ga...,B0063IH60K,"01 13, 2013",,"[0, 0]",Very good game,,3,particularly happy able replay game find corre...,good game
204419,A1Q91YOUFTME7K,2.0,1364169600,"Bought this for my 5 year old son, and he did ...",B006VJQ14I,"03 25, 2013",Cassandra Edwards,"[0, 1]",Not great for kids,,2,bought 5 year old son not enjoy way difficult ...,not great kid
679356,A2I1J6RNUV5RQZ,5.0,1391644800,My one and a half year old mastered this in li...,B00FL4EUZG,"02 6, 2014",Heather M Pederson,"[1, 1]",OMG! So fun!,5.0,3,one half year old master like 15 minute impres...,omg fun
105679,A31UAATZBRM9GG,5.0,1402012800,I LOVE the game!!! I love the sounds the fruit...,B005HSL626,"06 6, 2014",vbp36,"[2, 2]",BEST GAME EVER,5.0,3,love game love sound fruit make slice love cho...,best game ever


### Split data into training and validation set

In [140]:
reviews = amzr_clean_sample['reviewText'].values.tolist()
labels = amzr_clean_sample['sentiment'].tolist()

In [142]:
from sklearn.model_selection import train_test_split
training_sentences, validation_sentences, training_labels, validation_labels = train_test_split(reviews, labels, test_size=.2)

In [147]:
from transformers import DistilBertTokenizerFast
from transformers import TFDistilBertForSequenceClassification
#Assign tokenizer object to the tokenizer class
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

ImportError: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html

In [None]:
train_encodings = tokenizer(training_sentences,
                            truncation=True,
                            padding=True)
val_encodings = tokenizer(validation_sentences,
                            truncation=True,
                            padding=True)
train_dataset = tf.data.Dataset.from_tensor_slices((
                            dict(train_encodings),
                            training_labels
                            ))
val_dataset = tf.data.Dataset.from_tensor_slices((
                            dict(val_encodings),
                            validation_labels
                            ))

In [None]:
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased',num_labels=3)

In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5, epsilon=1e-08)
model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])
model.fit(train_dataset.shuffle(100).batch(16),
          epochs=2,
          batch_size=16,
          validation_data=val_dataset.shuffle(100).batch(16))

In [None]:
model.save_pretrained("./sentiment")
loaded_model = TFDistilBertForSequenceClassification.from_pretrained("./sentiment")

In [122]:
df = amzr_clean.head(10000).copy()
cols = ['reviewText','summary']
cols_ext_todrop = ['nocontract','nocontract_str','tokenized','lower','no_punc','stopwords_removed','pos_tags','wordnet_pos']
# Remove contractions
for c in cols:
    df['nocontract'] = df[c].apply(lambda x: [contractions.fix(word) for word in x.split()])
    df['nocontract_str'] = [' '.join(map(str, l)) for l in df['nocontract']]
    # Tokenize
    df['tokenized'] = df['nocontract_str'].apply(word_tokenize)
    # make lower-case
    df['lower'] = df['tokenized'].apply(lambda x: [word.lower() for word in x])
    # remove punctuation
    punc = string.punctuation
    df['no_punc'] = df['lower'].apply(lambda x: [word for word in x if word not in punc])
    # remove stop-words
    stop_words = set(stopwords.words('english'))
    df['stopwords_removed'] = df['no_punc'].apply(lambda x: [word for word in x if word not in stop_words])
    # tag words
    df['pos_tags'] = df['stopwords_removed'].apply(nltk.tag.pos_tag)
    # lemmatize
    def get_wordnet_pos(tag):
        if tag.startswith('J'):
            return wordnet.ADJ
        elif tag.startswith('V'):
            return wordnet.VERB
        elif tag.startswith('N'):
            return wordnet.NOUN
        elif tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN
    df['wordnet_pos'] = df['pos_tags'].apply(lambda x: [(word, get_wordnet_pos(pos_tag)) for (word, pos_tag) in x])
    wnl = WordNetLemmatizer()
    df[c+'_processed'] = df['wordnet_pos'].apply(lambda x: " ".join([wnl.lemmatize(word, tag) for word, tag in x]))
    # drop intermediate columns
    df=df.drop(columns=cols_ext_todrop)

Unnamed: 0,reviewerID,overall,unixReviewTime,reviewText,asin,reviewTime,reviewerName,helpful,summary,helpfulness,reviewText_processed,summary_processed
0,A1N4O8VOJZTDVB,3.0,1383350400,"Loves the song, so he really couldn't wait to ...",B004A9SDD8,"11 2, 2013",Annette Yancey,"[1, 1]",Really cute,5,Loves song really could wait play A little les...,Really cute
1,A2HQWU6HUKIEC7,5.0,1323043200,"Oh, how my little grandson loves this app. He'...",B004A9SDD8,"12 5, 2011","Audiobook lover ""Kathy""","[0, 0]",2-year-old loves it,,Oh little grandson love app always ask `` Monk...,2-year-old love
2,A1SXASF6GYG96I,5.0,1337558400,I found this at a perfect time since my daught...,B004A9SDD8,"05 21, 2012",Barbara Gibbs,"[0, 0]",Fun game,,I find perfect time since daughter 's favorite...,Fun game
3,A2B54P9ZDYH167,5.0,1354752000,My 1 year old goes back to this game over and ...,B004A9SDD8,"12 6, 2012","Brooke Greenstreet ""Babylove""","[3, 4]",We love our Monkeys!,4,My 1 year old go back game It simple easy todd...,We love Monkeys
4,AFOFZDTX5UC6D,5.0,1391212800,There are three different versions of the song...,B004A9SDD8,"02 1, 2014",C. Galindo,"[1, 1]",This is my granddaughters favorite app on my K...,5,There three different version song The game ke...,This granddaughters favorite app Kindle
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,AQWEAK4VJU5ZR,5.0,1344124800,This is a great app for logging your food inta...,B004H6WTJI,"08 5, 2012",,"[0, 0]",Great App!!,,This great app log food intake exercise It hel...,Great App
9996,A3V3UMS79UTFFG,4.0,1344556800,This is a great app for tracking food. That's ...,B004H6WTJI,"08 10, 2012",,"[0, 0]",Food tracking solves migrain trigger,,This great app track food something doctor try...,Food track solves migrain trigger
9997,A13SC4R3K4C8P2,5.0,1329955200,This is a great app. I downloaded it in Januar...,B004H6WTJI,"02 23, 2012",,"[0, 0]",Best APP Really helps with calorie awareness,,This great app I download January I become awa...,Best APP Really help calorie awareness
9998,AQ2Q9S0Y9YAHA,5.0,1342656000,This is a great app! I highly recommend this....,B004H6WTJI,"07 19, 2012",,"[0, 0]",Great!,,This great app I highly recommend If honest pu...,Great


In [None]:
amzr_clean.head(5)

In [67]:
### checka

In [70]:
len(set(amzr_df['reviewerID']))

87271

In [71]:
len(set(amzr_df['asin']))

13209