# Predicting the Helpfulness of Amazon Reviews - Electronics Category

## Import Libraries

In [1]:
# load packages
import gzip
import json
import os
import wget

import random
import string

import numpy as np
import pandas as pd

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer
# this nltk download may be needed. download stopwords, punkt
# nltk.download()

import sklearn
from sklearn import preprocessing
from sklearn.feature_extraction.text import *
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Load and Prepare Dataset

In [2]:
# make sure the review archive is available locally; it is fetched from the
# source URL into the data directory only when the expected file is missing

url = "http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz"
output_dir = "data"
file_name = "data/reviews_Electronics_5.json.gz"

# exist_ok=True makes this a no-op when the directory is already present
os.makedirs(output_dir, exist_ok=True)

if not os.path.isfile(file_name):
    # file_name is rebound to whatever path wget.download reports
    file_name = wget.download(url, out=output_dir)

In [3]:
# helper functions to parse data from compressed json into pandas DF
def parse(path):
    """Yield one record per non-blank line of a gzip-compressed text file.

    Each line is expected to hold a single Python/JSON-style literal
    (for this notebook: one dict per review).

    Args:
        path: location of the ``.gz`` file to read.

    Yields:
        The object described by each line, in file order.

    Raises:
        ValueError, SyntaxError: if a line is not a plain literal.
    """
    # Local import so the helper works without touching the notebook's import cell.
    import ast

    # The file comes from the network, so lines are parsed with
    # ast.literal_eval (literals only) rather than eval, which would execute
    # arbitrary expressions found in the data.
    # The with-block closes the gzip handle once the generator finishes.
    with gzip.open(path, 'rt', encoding='utf-8') as g:
        for l in g:
            record = l.strip()
            if not record:
                # tolerate blank lines instead of failing on them
                continue
            yield ast.literal_eval(record)

def get_dataframe(path):
    """Load every record produced by ``parse(path)`` into a DataFrame.

    Args:
        path: location of the gzip-compressed review file.

    Returns:
        A pandas DataFrame with one row per record, in file order, labelled
        0..n-1; keys missing from a record become NaN in that row.
    """
    # Rows are handed to the constructor as a list of dicts. The previous
    # dict-of-dicts + from_dict(orient='index') route re-nested the whole
    # dataset by column before building the frame, which roughly doubles
    # peak memory on a file this large.
    return pd.DataFrame(list(parse(path)))


# helper function to pull out total helpful votes
def get_helpful_votes(helpful):
    """Return the first element of a two-item ``[helpful, total]`` pair."""
    helpful_count, _total_count = helpful
    return helpful_count


# helper function to pull out total votes (helpful and unhelpful)
def get_total_votes(helpful):
    """Return the second element of a two-item ``[helpful, total]`` pair."""
    _helpful_count, total_count = helpful
    return total_count
    
    
# helper function to calculate helpfulness percentage 
def calculate_helpful_perc(helpful):
    """Return helpful votes divided by total votes for a ``[helpful, total]`` pair.

    A pair with zero total votes yields 0 rather than dividing by zero.
    """
    upvotes, votes_cast = helpful
    return 0 if votes_cast == 0 else upvotes / votes_cast

In [4]:
# create dataframe from the downloaded archive and report how many rows it has
# NOTE: the recorded output of this cell is a MemoryError, i.e. the load did
# not complete in the saved run
df = get_dataframe(file_name)

# number of rows = length of the row index
df_length = len(df.index)

print(df_length)

MemoryError: 

In [None]:
# create dataframe (same load as the previous cell)
df = get_dataframe(file_name)

# parse helpful column into new columns of helpful_votes, total_votes, helpful_perc
# each helper unpacks the two-element value stored in 'helpful'
df['helpful_votes'] = df['helpful'].apply(get_helpful_votes)
df['total_votes'] = df['helpful'].apply(get_total_votes)
# helpful_perc is helpful/total as a fraction; 0 when a review has no votes
df['helpful_perc'] = df['helpful'].apply(calculate_helpful_perc)

## Exploratory Data Analysis

### Data Insights

In [None]:
# take a look at the shape of the data: (number of rows, number of columns)
df.shape

In [None]:
# take a look at five randomly chosen rows
# (no random_state is passed, so the rows differ between runs)
df.sample(5)

In [None]:
# see if there are any missing values by feature
# df.isnull() is a True/False frame with the same shape as df; it is drawn as a
# heatmap with one column per feature, no colour bar and no row labels
# missing values show up in yellow
sns.heatmap(df.isnull(),cbar=False,yticklabels=False,cmap = 'viridis')

In [None]:
# count the null entries in every column and show the counts as a one-column table
null_counts = df.isnull().sum()
missing_df = null_counts.to_frame()
missing_df

### Summary Statistics

In [None]:
# take a look at summary statistics of dataset
# (describe() with default arguments, displayed as the cell output)
df.describe()

### Examining the Target Variable

In [None]:
# histogram of helpful_perc over the full dataframe, bin edges chosen automatically
fig1, ax1 = plt.subplots()
n, bins, patches = ax1.hist(df['helpful_perc'], bins='auto')

# axis labels first, title last so the cell output matches the title call
ax1.set_ylabel('Frequency')
ax1.set_xlabel('Helpful Perc')
ax1.set_title('Histogram of Helpful Percentages')

### Data Cleaning

In [None]:
# subset df of reviews that have at least three votes
df_three = df[df.total_votes >= 3]

# subset df of reviews that have exactly two votes and both votes agree
# (helpful_perc is 1 when both voters found it helpful, 0 when neither did)
df_two = df[(df.total_votes == 2) & df.helpful_perc.isin([0, 1])]

# combine the dfs back together with a fresh 0..n-1 index
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0; row order (df_three first, then df_two) is unchanged
df = pd.concat([df_three, df_two], ignore_index=True)
df.shape

In [None]:
# reduce size of df to at most 80000 randomly chosen rows
# - random_state reuses the notebook's seed (12345) so the same subset, and
#   therefore the same downstream scores, come back on every run
# - min(...) keeps the cell from raising when fewer than 80000 rows survive the
#   filtering above (sampling without replacement cannot exceed the population)
df = df.sample(n=min(80000, len(df)), random_state=12345)

In [None]:
# summary statistics again, now for the filtered and down-sampled dataframe
df.describe()

In [None]:
# histogram of helpful_perc for the dataframe as it stands after cleaning
fig2, ax2 = plt.subplots()
n, bins, patches = ax2.hist(df['helpful_perc'], bins='auto')

# axis labels first, title last so the cell output matches the title call
ax2.set_ylabel('Frequency')
ax2.set_xlabel('Helpful Perc')
ax2.set_title('Histogram of Helpful Percentages')

## Model 1: Helpful Reviews Are Determined Randomly

In [None]:
# fixed seed so the random baseline can be repeated
random.seed(12345)

# one uniform draw from [0, 1) per review stands in for a "prediction"
total_reviews = len(df)
helpful_perc = df['helpful_perc']
random_helpful = [random.random() for _ in range(total_reviews)]

# score: correlation between the true helpfulness fraction and the random draws
np.corrcoef(helpful_perc, random_helpful)[1, 0]

## Model 2: Naive Bayes Bag of Words Model

### Data Pre-Processing

In [None]:
# ensure there are no missing values in the review text, the summary text or the
# helpfulness percentage: True means dropping rows with a null in any of those
# three columns would not remove anything
len(df) == len(df.dropna(subset=['reviewText', 'summary', 'helpful_perc']))

In [None]:
# English stop-word list as a set, plus a tokenizer that filters those words out

stop_words = set(stopwords.words('english'))

def tokenize(text):
    """Split ``text`` with ``word_tokenize`` and keep the tokens not in ``stop_words``."""
    return [token for token in word_tokenize(text) if token not in stop_words]

In [None]:
# Build a single lowercase "document" per review: the summary, a ". " separator,
# then the review text
df['combinedText'] = df['summary'].str.lower() + ". " +  df['reviewText'].str.lower()

# Now tokenize these and remove stop words (no stemming happens in this cell)
df['processedText'] = df['combinedText'].apply(tokenize)

In [None]:
# preprocess the data: keep only the combined text and the target column
data = df[['combinedText', 'helpful_perc']]

# fit_transform is applied to each of the two columns in turn, so both the text
# and helpful_perc are replaced by integer codes
# NOTE(review): after this step the model below sees one integer per review
# instead of its words, and the target is a code rather than the original
# fraction. The section heading says "Naive Bayes Bag of Words" but the model
# instantiated further down is LinearRegression - confirm this is intended.
le = preprocessing.LabelEncoder()
preprocessed_data = data.apply(le.fit_transform)

### Split into Train/Test

In [None]:
# hold out 20% of the encoded rows for testing; features are the encoded
# combinedText column and the target is the encoded helpful_perc column
Train_X, Test_X, Train_Y, Test_Y = train_test_split(preprocessed_data['combinedText'], 
                                                    preprocessed_data['helpful_perc'],
                                                    random_state = 12345, # reproduce results
                                                    test_size=0.2)

### Instantiate and Train the Model

In [None]:
# ordinary least-squares regression on the single encoded-text feature
regr = linear_model.LinearRegression()

# the estimator expects a 2-D feature matrix, so the 1-D column is reshaped to (n, 1)
regr.fit(Train_X.values.reshape(-1, 1), Train_Y)

### Predict Test Labels and Score

In [None]:
# predictions for the held-out rows (feature reshaped to (n, 1) as in training)
pred = regr.predict(Test_X.values.reshape(-1, 1))

# score: correlation coefficient between test targets and predictions
np.corrcoef(Test_Y, pred)[1, 0]

## Model 3: TF-IDF and Linear Regression

### Split Into Train/Test

In [None]:
# split into train/test sets (80% / 20%), keeping text and target together
data = df[['combinedText', 'helpful_perc']]

# same seed as the other splits in this notebook
df_train, df_test = train_test_split(data, test_size = 0.2, random_state = 12345)

### Data Pre-Processing

In [None]:
# Porter-stem combinedText so that word variants collapse and the corpus shrinks
porter = PorterStemmer()


def stem_text(df):
    """Return one stemmed string per row of ``df['combinedText']``.

    Each document is split on whitespace, every piece is passed through the
    module-level ``porter`` stemmer, and the pieces are re-joined with single
    spaces. The output list is in the same order as the input rows.
    """
    return [
        ' '.join(porter.stem(word) for word in document.split())
        for document in df['combinedText'].tolist()
    ]


text_list_stem = stem_text(data)

In [None]:
# vectorize text: TF-IDF over single words, keeping at most 100 features
vectorizer = TfidfVectorizer(
                max_features=100,
                ngram_range=(1,1)
                )

# fit on the stemmed text and show the resulting dense matrix as cell output
# (the matrix is not stored; only the fitted vectorizer is reused below)
# NOTE(review): text_list_stem was built from `data`, i.e. every row including
# those later placed in df_test, so the vectorizer is fitted on test text as
# well - consider fitting on the training text only.
vectorizer.fit_transform(text_list_stem).toarray()

In [None]:
# vectorize train dataset: stem the training text, then map it through the
# already-fitted vectorizer and convert the result to a dense array
train_text_stem = stem_text(df_train)
train_vectorized = vectorizer.transform(train_text_stem).toarray()

print('Shape:', train_vectorized.shape)

In [None]:
# vectorize test dataset: same stemming and transform as for the training text,
# using the vectorizer without refitting it
test_text_stem = stem_text(df_test)
test_vectorized = vectorizer.transform(test_text_stem).toarray()

print('Shape:', test_vectorized.shape)

In [None]:
# create array of labels to use in linear regression
# (the helpful_perc column of each split, as NumPy arrays)
df_train_labels = np.array(df_train['helpful_perc'])
df_test_labels = np.array(df_test['helpful_perc'])

### Instantiate and Train the Model

In [None]:
# ordinary least-squares regression on the TF-IDF features
regr_2 = linear_model.LinearRegression()

# train_vectorized is already a dense array, so it is passed to fit directly
regr_2.fit(train_vectorized, df_train_labels)

### Predict Test Labels and Score

In [None]:
# predictions for the held-out TF-IDF rows
pred_2 = regr_2.predict(test_vectorized)

# score: correlation coefficient between test labels and predictions
np.corrcoef(df_test_labels, pred_2)[1, 0]

## Model 4: BERT