# Helpfulness Prediction - DEMO

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import gzip

import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize, sent_tokenize 
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import string

import dask.dataframe as dd
from dask.multiprocessing import get
import textstat
import swifter

import ast
import math

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

import xgboost as xgb

In [2]:
# GET ALL THE DATAFRAMES :-
# 1.NORMAL REVIEWS
# 2.REVIEW TEXT Characteristics
# 3.USER Charecteristics
# 4.REVIEW METADATA Characteristics
# 5.PRODUCT METADATA

review_df = pd.read_csv('Amazon_Latest_Data.csv')
text_df = pd.read_csv('Text_Parameters.csv')
user_df = pd.read_csv('User_DF.csv')
meta_df = pd.read_csv('Review_Meta_Data.csv')
product_data_df = pd.read_csv('meta_data_latest.csv')

In [3]:
df = text_df[['flesch_reading_ease', 
       'smog_index', 'flesch_kincaid_grade', 'coleman_liau_index',
       'automated_readability_index', 'dale_chall_readability_score',
       'difficult_words', 'linsear_write_formula', 'gunning_fog',
       'sentence_count', 'wps', 'review_length', 'pos_no', 'neg_no']]

df=df.join(user_df[['user_deviation','user_delay', 'no_of_reviews', 'reviewer_days']])

df = df.join(meta_df[['stem_sim_length','lem_sim_length','overall']])

df = df.join(review_df['helpfulness_score'])

df.columns

Index(['flesch_reading_ease', 'smog_index', 'flesch_kincaid_grade',
       'coleman_liau_index', 'automated_readability_index',
       'dale_chall_readability_score', 'difficult_words',
       'linsear_write_formula', 'gunning_fog', 'sentence_count', 'wps',
       'review_length', 'pos_no', 'neg_no', 'user_deviation', 'user_delay',
       'no_of_reviews', 'reviewer_days', 'stem_sim_length', 'lem_sim_length',
       'overall', 'helpfulness_score'],
      dtype='object')

In [4]:
X = df.sample(n=1000)

In [5]:
y = X['helpfulness_score']
X.drop(['helpfulness_score'],axis=1, inplace=True)

In [6]:
X.head()

Unnamed: 0,flesch_reading_ease,smog_index,flesch_kincaid_grade,coleman_liau_index,automated_readability_index,dale_chall_readability_score,difficult_words,linsear_write_formula,gunning_fog,sentence_count,...,review_length,pos_no,neg_no,user_deviation,user_delay,no_of_reviews,reviewer_days,stem_sim_length,lem_sim_length,overall
19122,-69.28,0.0,59.4,10.64,73.8,13.61,25,7.0,67.6,1,...,148,4,2,1.11189,22953600.0,1,1147.0,3,3,2.0
15521,56.59,0.0,11.1,10.51,12.4,7.52,4,6.75,18.16,1,...,23,2,1,0.84195,21681000.0,7,2444.0,1,1,2.0
48266,85.69,8.3,4.0,5.61,3.6,6.34,9,5.333333,11.85,6,...,66,3,0,0.791677,30672000.0,0,0.0,1,1,5.0
24443,-9.05,0.0,38.4,9.31,47.2,10.04,20,6.125,44.57,2,...,192,5,3,0.872324,0.0,1,0.0,0,0,2.0
77600,91.0,7.2,4.1,6.08,5.1,6.01,12,6.714286,11.89,8,...,115,0,4,1.854145,0.0,1,0.0,2,2,1.0


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=101)

## LINEAR REGRESSION


In [8]:
lm = LinearRegression()

lm.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [9]:
predictions = lm.predict(X_test)

In [10]:
# MSE : Mean Squared Error
mse=mean_squared_error(y_test,predictions)

# RMSE : Root Mean Squared Error
rmse=math.sqrt(mean_squared_error(y_test,predictions))

# MAE : Mean Absolute Error
mae=mean_absolute_error(y_test,predictions)

print('Mean Squared Error (MSE):      ',mse)
print('Root Mean Squared Error (RMSE):',rmse)
print('Mean Absolute Error (MAE):     ',mae)

Mean Squared Error (MSE):       0.04147901640781651
Root Mean Squared Error (RMSE): 0.20366397916130508
Mean Absolute Error (MAE):      0.1401628869724623


## RANDOM FORESTS REGRESSION

In [11]:
#RANDOM FOREST

regressor = RandomForestRegressor(n_estimators=500, random_state=42)

In [12]:
regressor.fit(X_train,y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [13]:
preds_2 = regressor.predict(X_test)

In [15]:
# MSE : Mean Squared Error
mse=mean_squared_error(y_test,preds_2)

# RMSE : Root Mean Squared Error
rmse=math.sqrt(mean_squared_error(y_test,preds_2))

# MAE : Mean Absolute Error
mae=mean_absolute_error(y_test,preds_2)

print('Mean Squared Error (MSE):      ',mse)
print('Root Mean Squared Error (RMSE):',rmse)
print('Mean Absolute Error (MAE):     ',mae)

Mean Squared Error (MSE):       0.041183058024052116
Root Mean Squared Error (RMSE): 0.20293609344828759
Mean Absolute Error (MAE):      0.14035895388769914


## Extreme Gradient Boosting - XGBoost

In [21]:
xgb_reg = xgb.XGBRegressor()

In [22]:
xgb_reg.fit(X_train,y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [23]:
predictions_xgb=xgb_reg.predict(X_test)

In [24]:
# MSE : Mean Squared Error
mse=mean_squared_error(y_test,preds_2)

# RMSE : Root Mean Squared Error
rmse=math.sqrt(mean_squared_error(y_test,preds_2))

# MAE : Mean Absolute Error
mae=mean_absolute_error(y_test,preds_2)

print('Mean Squared Error (MSE):      ',mse)
print('Root Mean Squared Error (RMSE):',rmse)
print('Mean Absolute Error (MAE):     ',mae)

Mean Squared Error (MSE):       0.041183058024052116
Root Mean Squared Error (RMSE): 0.20293609344828759
Mean Absolute Error (MAE):      0.14035895388769914


In [25]:
params = {
    'learning_rate': [0.01,0.1,0.9], 
    'n_estimators': [200,300,400], 
    'subsample': [0.3, 0.9, 1]}

In [27]:
result_df = pd.DataFrame(columns=['learning_rate','n_estimators', 'subsample', 'rmse', 'mae'])

In [28]:
best_rmse=1000
best_mae=1000
for rate in params['learning_rate'] :
    for estimator in params['n_estimators'] :
        for subsample in params['subsample'] :
            xgb_model = xgb.XGBRegressor(learning_rate= rate, n_estimators = estimator, subsample=subsample)
            xgb_model.fit(X_train,y_train)
            predictions=xgb_model.predict(X_test)
            rmse=math.sqrt(mean_squared_error(y_test,predictions))
            mae=mean_absolute_error(y_test,predictions)
            
            res_dict = {'learning_rate': rate, 'n_estimators': estimator, 'subsample': subsample, 'rmse' : rmse, 'mae' : mae }
            result_df.append(res_dict, ignore_index=True)

            print(res_dict)
            
            if(rmse<=best_rmse) :
                if(mae < best_mae) :
                    best_rmse = rmse
                    best_mae = mae
                    best_learning_rate = rate
                    best_n_estimators = estimator
                    best_subsample = subsample
            

{'learning_rate': 0.01, 'n_estimators': 200, 'subsample': 0.3, 'rmse': 0.20512789595982872, 'mae': 0.15723240020805554}
{'learning_rate': 0.01, 'n_estimators': 200, 'subsample': 0.9, 'rmse': 0.20708866259044995, 'mae': 0.15948471976949066}
{'learning_rate': 0.01, 'n_estimators': 200, 'subsample': 1, 'rmse': 0.20764850876155144, 'mae': 0.16021415668251293}
{'learning_rate': 0.01, 'n_estimators': 300, 'subsample': 0.3, 'rmse': 0.20136746259997856, 'mae': 0.14293331747968407}
{'learning_rate': 0.01, 'n_estimators': 300, 'subsample': 0.9, 'rmse': 0.2039217081252845, 'mae': 0.14603221274112446}
{'learning_rate': 0.01, 'n_estimators': 300, 'subsample': 1, 'rmse': 0.2048034518254528, 'mae': 0.14775969914137263}
{'learning_rate': 0.01, 'n_estimators': 400, 'subsample': 0.3, 'rmse': 0.20249919700440724, 'mae': 0.13953698300673614}
{'learning_rate': 0.01, 'n_estimators': 400, 'subsample': 0.9, 'rmse': 0.20539744575365454, 'mae': 0.14305194878849783}
{'learning_rate': 0.01, 'n_estimators': 400, '

In [31]:
print("Best RMSE: ",best_rmse)
print("Best MAE: ",best_mae)
print("Parameters:-")
print("Learning Rate: {} , No. of Estimators: {} , Subsample Size: {}".format(best_learning_rate, best_n_estimators, best_subsample))

Best RMSE:  0.20136746259997856
Best MAE:  0.14293331747968407
Parameters:-
Learning Rate: 0.01 , No. of Estimators: 300 , Subsample Size: 0.3


## STEM SIM LENGTH

In [32]:
def get_desc_words(text):
    nopunc= [char for char in text if char not in string.punctuation]
    nopunc = ''.join(nopunc)

    words = [word.lower() for word in nopunc.split() if word.lower() not in stopwords.words('english')]
    
    
    tagged = nltk.pos_tag(words) 
    
    nouns = []
    for word in tagged:
        if word[1]=='NN':
            nouns.append(word[0])

    
    ps = PorterStemmer()
    stemmed_words = []
    for w in nouns:
        stemmed_words.append(ps.stem(w))

    return stemmed_words
    

In [33]:
get_desc_words("Hello this is great. I like my new TV. Visualization is super. Low cost Installation.")

['tv', 'visual', 'cost', 'instal']

In [34]:
# get the stemmed words of  review text 
def get_review_words_stem(text):
    nopunc= [char for char in text if char not in string.punctuation]
    nopunc = ''.join(nopunc)

    words = [word.lower() for word in nopunc.split() if word.lower() not in stopwords.words('english')]
    
    ps = PorterStemmer()
    stemmed_words = []
    for w in words:
        stemmed_words.append(ps.stem(w))

    return stemmed_words

In [35]:
get_review_words_stem("This was easy to be installed.")

['easi', 'instal']

In [36]:
def get_stem_sim_words(desc_stem, review_stem) :
    sim_words=list(set(desc_stem).intersection(review_stem))
    # return the words
    return sim_words

In [47]:
description = "This is a TV. It's installation is easy. There is also a USB cord."
text = "I installed the TV with easily. But the USB cord given was broken"

In [48]:
desc_stem = get_desc_words(description)
review_stem = get_review_words_stem(text)

In [49]:
desc_stem

['tv', 'instal', 'easi', 'cord']

In [50]:
review_stem

['instal', 'tv', 'easili', 'usb', 'cord', 'given', 'broken']

In [51]:
get_stem_sim_words(desc_stem,review_stem)

['tv', 'instal', 'cord']