In [47]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import gzip

import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize, sent_tokenize 
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import string

import dask.dataframe as dd
from dask.multiprocessing import get
import textstat
import swifter

import ast
import math

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb


In [3]:
# Load every input table used in this notebook:
# 1. Normal reviews (supplies the `helpfulness_score` target used below)
# 2. Review-text characteristics (readability / length / sentiment-count columns)
# 3. User characteristics
# 4. Review-metadata characteristics
# 5. Product metadata (not referenced by the cells visible in this notebook)

review_df = pd.read_csv('Amazon_Latest_Data.csv')
text_df = pd.read_csv('Text_Parameters.csv')
user_df = pd.read_csv('User_DF.csv')
meta_df = pd.read_csv('Review_Meta_Data.csv')
product_data_df = pd.read_csv('meta_data_latest.csv')

In [5]:
# Feature columns, grouped by the table they come from.
text_feature_cols = [
    'flesch_reading_ease', 'smog_index', 'flesch_kincaid_grade',
    'coleman_liau_index', 'automated_readability_index',
    'dale_chall_readability_score', 'difficult_words',
    'linsear_write_formula', 'gunning_fog', 'sentence_count', 'wps',
    'review_length', 'pos_no', 'neg_no',
]
user_feature_cols = ['user_deviation', 'user_delay']
meta_feature_cols = ['stem_sim_length', 'lem_sim_length', 'overall']

# DataFrame.join aligns on the index, so the three tables are combined
# row-by-row through their index labels.
# NOTE(review): this assumes all CSVs list the reviews in the same order - confirm.
X = (
    text_df[text_feature_cols]
    .join(user_df[user_feature_cols])
    .join(meta_df[meta_feature_cols])
)

X.columns

Index(['flesch_reading_ease', 'smog_index', 'flesch_kincaid_grade',
       'coleman_liau_index', 'automated_readability_index',
       'dale_chall_readability_score', 'difficult_words',
       'linsear_write_formula', 'gunning_fog', 'sentence_count', 'wps',
       'review_length', 'pos_no', 'neg_no', 'user_deviation', 'user_delay',
       'stem_sim_length', 'lem_sim_length', 'overall'],
      dtype='object')

In [6]:
# Preview the first rows of the assembled feature matrix.
X.head()

Unnamed: 0,flesch_reading_ease,smog_index,flesch_kincaid_grade,coleman_liau_index,automated_readability_index,dale_chall_readability_score,difficult_words,linsear_write_formula,gunning_fog,sentence_count,wps,review_length,pos_no,neg_no,user_deviation,user_delay,stem_sim_length,lem_sim_length,overall
0,-336.56,0.0,166.3,7.69,211.1,26.44,44,14.0,176.92,1,427.0,427,2,4,1.399189,10152000.0,15,13,1.0
1,-340.96,0.0,165.9,9.72,210.8,26.37,94,8.666667,175.64,2,423.0,846,18,8,0.624437,10886400.0,24,23,3.0
2,85.52,7.3,6.2,5.92,7.2,5.87,36,7.666667,13.01,23,19.521739,449,7,6,0.510375,7516800.0,17,16,2.0
3,66.94,10.8,7.1,8.05,6.3,6.99,11,7.2,14.0,5,12.8,64,2,0,0.442171,2678400.0,2,2,5.0
4,-43.22,0.0,53.6,8.09,67.2,12.2,15,4.4,61.55,1,138.0,138,1,3,1.616074,7603200.0,2,2,1.0


In [9]:
# Regression target: the helpfulness score stored with each review.
y = review_df['helpfulness_score']

# Hold out 40% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=101,
)

In [10]:
# Baseline model: XGBoost regressor constructed with no arguments (library defaults).
xgb_reg = xgb.XGBRegressor()

In [11]:
# Train the baseline on the training split; the fitted estimator is the cell output.
xgb_reg.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [12]:
# Baseline predictions for the held-out rows.
predictions = xgb_reg.predict(X_test)

In [13]:
# Score the baseline predictions on the test split.
mse = mean_squared_error(y_test, predictions)    # MSE : Mean Squared Error
rmse = math.sqrt(mse)                            # RMSE: Root Mean Squared Error (square root of the MSE above)
mae = mean_absolute_error(y_test, predictions)   # MAE : Mean Absolute Error

# Labels are padded to equal width so the printed values line up.
for metric_label, metric_value in (
    ('Mean Squared Error (MSE):      ', mse),
    ('Root Mean Squared Error (RMSE):', rmse),
    ('Mean Absolute Error (MAE):     ', mae),
):
    print(metric_label, metric_value)

Mean Squared Error (MSE):       0.03734678330053756
Root Mean Squared Error (RMSE): 0.19325315857842418
Mean Absolute Error (MAE):      0.13325926594083998


In [15]:
# Second model: same defaults as the baseline except n_estimators=300
# (the baseline's repr above shows n_estimators=100).
xgb_reg2 = xgb.XGBRegressor(n_estimators=300)

In [16]:
# Train the 300-estimator model on the same training split.
xgb_reg2.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=300,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [17]:
# Test-split predictions from the 300-estimator model.
predictions_2 = xgb_reg2.predict(X_test)

In [18]:
# Score the 300-estimator model on the test split.
# NOTE: this overwrites the `mse`, `rmse` and `mae` values from the earlier metrics cell.
mse = mean_squared_error(y_test, predictions_2)    # MSE : Mean Squared Error
rmse = math.sqrt(mse)                              # RMSE: Root Mean Squared Error (square root of the MSE above)
mae = mean_absolute_error(y_test, predictions_2)   # MAE : Mean Absolute Error

for metric_label, metric_value in (
    ('Mean Squared Error (MSE):      ', mse),
    ('Root Mean Squared Error (RMSE):', rmse),
    ('Mean Absolute Error (MAE):     ', mae),
):
    print(metric_label, metric_value)

Mean Squared Error (MSE):       0.03729544118492501
Root Mean Squared Error (RMSE): 0.19312027647278524
Mean Absolute Error (MAE):      0.13301010700640992


In [20]:
# List the feature names currently in X.
print(X.columns)

Index(['flesch_reading_ease', 'smog_index', 'flesch_kincaid_grade',
       'coleman_liau_index', 'automated_readability_index',
       'dale_chall_readability_score', 'difficult_words',
       'linsear_write_formula', 'gunning_fog', 'sentence_count', 'wps',
       'review_length', 'pos_no', 'neg_no', 'user_deviation', 'user_delay',
       'stem_sim_length', 'lem_sim_length', 'overall'],
      dtype='object')


In [10]:
# EXPERIMENT - Using XGB Grid Search
# PARAMETERS : ALL
# Rebuild the full feature matrix and target so this experiment does not rely
# on X / y left over from earlier cells.
readability_cols = [
    'flesch_reading_ease',
    'smog_index', 'flesch_kincaid_grade', 'coleman_liau_index',
    'automated_readability_index', 'dale_chall_readability_score',
    'difficult_words', 'linsear_write_formula', 'gunning_fog',
    'sentence_count', 'wps', 'review_length', 'pos_no', 'neg_no',
]
X = text_df[readability_cols]

# Append the user-level and review-metadata columns, in that order.
# DataFrame.join matches rows by index label.
for extra_df, extra_cols in (
    (user_df, ['user_deviation', 'user_delay']),
    (meta_df, ['stem_sim_length', 'lem_sim_length', 'overall']),
):
    X = X.join(extra_df[extra_cols])

y = review_df['helpfulness_score']

In [None]:
# Same 60/40 split and seed as the baseline experiment.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=101,
)

In [31]:
# Search space for the grid search: 6 learning rates x 4 estimator counts
# x 4 subsample fractions = 96 candidate settings.
gbm_param_grid = dict(
    learning_rate=[0.01, 0.09, 0.1, 0.2, 0.5, 0.9],
    n_estimators=[200, 300, 400, 500],
    subsample=[0.3, 0.5, 0.9, 1],
)

In [13]:
# Single-candidate grid (learning_rate fixed at 0.1).
# NOTE(review): not referenced by any other cell visible in this notebook.
gbm_param_grid_2 = {'learning_rate': [0.1]}

In [32]:
# Default-configured regressor used as the base estimator for the grid search below.
gbm = xgb.XGBRegressor()

In [34]:
# Exhaustive search over `gbm_param_grid`, ranking candidates by negative MSE.
# `cv=3` is pinned explicitly: the recorded run used 3 folds, but when `cv` is
# left unset the fold count depends on the installed scikit-learn version,
# which would change both the runtime and the selected parameters.
grid_mse = GridSearchCV(
    estimator=gbm,
    param_grid=gbm_param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=3,
)


In [35]:
# Run the grid search on the training split.
# The recorded run reports 288 fits and about 95 minutes on a single job.
grid_mse.fit(X_train, y_train)

Fitting 3 folds for each of 96 candidates, totalling 288 fits
[CV] learning_rate=0.01, n_estimators=200, subsample=0.3 .............
[CV]  learning_rate=0.01, n_estimators=200, subsample=0.3, score=-0.040026338881381796, total=   7.0s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.2s remaining:    0.0s


[CV] learning_rate=0.01, n_estimators=200, subsample=0.3 .............
[CV]  learning_rate=0.01, n_estimators=200, subsample=0.3, score=-0.040852204020579076, total=   6.7s


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   14.2s remaining:    0.0s


[CV] learning_rate=0.01, n_estimators=200, subsample=0.3 .............
[CV]  learning_rate=0.01, n_estimators=200, subsample=0.3, score=-0.039909483150939146, total=   6.7s
[CV] learning_rate=0.01, n_estimators=200, subsample=0.5 .............
[CV]  learning_rate=0.01, n_estimators=200, subsample=0.5, score=-0.040105247873757376, total=   7.8s
[CV] learning_rate=0.01, n_estimators=200, subsample=0.5 .............
[CV]  learning_rate=0.01, n_estimators=200, subsample=0.5, score=-0.04093653271932638, total=   8.3s
[CV] learning_rate=0.01, n_estimators=200, subsample=0.5 .............
[CV]  learning_rate=0.01, n_estimators=200, subsample=0.5, score=-0.0399916977857508, total=   7.9s
[CV] learning_rate=0.01, n_estimators=200, subsample=0.9 .............
[CV]  learning_rate=0.01, n_estimators=200, subsample=0.9, score=-0.04022534563526472, total=   7.3s
[CV] learning_rate=0.01, n_estimators=200, subsample=0.9 .............
[CV]  learning_rate=0.01, n_estimators=200, subsample=0.9, score=-0.

[CV]  learning_rate=0.09, n_estimators=200, subsample=0.3, score=-0.03799831901490319, total=   6.7s
[CV] learning_rate=0.09, n_estimators=200, subsample=0.3 .............
[CV]  learning_rate=0.09, n_estimators=200, subsample=0.3, score=-0.03716036499011589, total=   6.8s
[CV] learning_rate=0.09, n_estimators=200, subsample=0.5 .............
[CV]  learning_rate=0.09, n_estimators=200, subsample=0.5, score=-0.036843216589253415, total=   7.7s
[CV] learning_rate=0.09, n_estimators=200, subsample=0.5 .............
[CV]  learning_rate=0.09, n_estimators=200, subsample=0.5, score=-0.03789747021327108, total=   7.8s
[CV] learning_rate=0.09, n_estimators=200, subsample=0.5 .............
[CV]  learning_rate=0.09, n_estimators=200, subsample=0.5, score=-0.037096314163439545, total=   7.8s
[CV] learning_rate=0.09, n_estimators=200, subsample=0.9 .............
[CV]  learning_rate=0.09, n_estimators=200, subsample=0.9, score=-0.03688846453169451, total=   7.6s
[CV] learning_rate=0.09, n_estimators

[CV] learning_rate=0.1, n_estimators=200, subsample=0.3 ..............
[CV]  learning_rate=0.1, n_estimators=200, subsample=0.3, score=-0.03807052710000134, total=  15.2s
[CV] learning_rate=0.1, n_estimators=200, subsample=0.3 ..............
[CV]  learning_rate=0.1, n_estimators=200, subsample=0.3, score=-0.03728413358445416, total=  14.3s
[CV] learning_rate=0.1, n_estimators=200, subsample=0.5 ..............
[CV]  learning_rate=0.1, n_estimators=200, subsample=0.5, score=-0.03699815011449566, total=  16.7s
[CV] learning_rate=0.1, n_estimators=200, subsample=0.5 ..............
[CV]  learning_rate=0.1, n_estimators=200, subsample=0.5, score=-0.03791231919948638, total=  16.9s
[CV] learning_rate=0.1, n_estimators=200, subsample=0.5 ..............
[CV]  learning_rate=0.1, n_estimators=200, subsample=0.5, score=-0.0370054359214847, total=  16.8s
[CV] learning_rate=0.1, n_estimators=200, subsample=0.9 ..............
[CV]  learning_rate=0.1, n_estimators=200, subsample=0.9, score=-0.03692523

[CV] learning_rate=0.2, n_estimators=200, subsample=0.3 ..............
[CV]  learning_rate=0.2, n_estimators=200, subsample=0.3, score=-0.03899094033069113, total=  14.2s
[CV] learning_rate=0.2, n_estimators=200, subsample=0.3 ..............
[CV]  learning_rate=0.2, n_estimators=200, subsample=0.3, score=-0.03811253389021847, total=  14.1s
[CV] learning_rate=0.2, n_estimators=200, subsample=0.5 ..............
[CV]  learning_rate=0.2, n_estimators=200, subsample=0.5, score=-0.03758389137826154, total=  17.0s
[CV] learning_rate=0.2, n_estimators=200, subsample=0.5 ..............
[CV]  learning_rate=0.2, n_estimators=200, subsample=0.5, score=-0.038472747185366446, total=  16.7s
[CV] learning_rate=0.2, n_estimators=200, subsample=0.5 ..............
[CV]  learning_rate=0.2, n_estimators=200, subsample=0.5, score=-0.037595069655922006, total=  16.8s
[CV] learning_rate=0.2, n_estimators=200, subsample=0.9 ..............
[CV]  learning_rate=0.2, n_estimators=200, subsample=0.9, score=-0.03730

[CV]  learning_rate=0.5, n_estimators=200, subsample=0.3, score=-0.04246030819838963, total=   7.4s
[CV] learning_rate=0.5, n_estimators=200, subsample=0.3 ..............
[CV]  learning_rate=0.5, n_estimators=200, subsample=0.3, score=-0.043204127956869745, total=   7.4s
[CV] learning_rate=0.5, n_estimators=200, subsample=0.3 ..............
[CV]  learning_rate=0.5, n_estimators=200, subsample=0.3, score=-0.042750784502054905, total=   7.1s
[CV] learning_rate=0.5, n_estimators=200, subsample=0.5 ..............
[CV]  learning_rate=0.5, n_estimators=200, subsample=0.5, score=-0.040526775055601066, total=   8.7s
[CV] learning_rate=0.5, n_estimators=200, subsample=0.5 ..............
[CV]  learning_rate=0.5, n_estimators=200, subsample=0.5, score=-0.04177570195442108, total=   8.3s
[CV] learning_rate=0.5, n_estimators=200, subsample=0.5 ..............
[CV]  learning_rate=0.5, n_estimators=200, subsample=0.5, score=-0.04088560276483174, total=   8.2s
[CV] learning_rate=0.5, n_estimators=200, 

[CV] learning_rate=0.9, n_estimators=200, subsample=0.3 ..............
[CV]  learning_rate=0.9, n_estimators=200, subsample=0.3, score=-0.05642660379298668, total=   7.1s
[CV] learning_rate=0.9, n_estimators=200, subsample=0.3 ..............
[CV]  learning_rate=0.9, n_estimators=200, subsample=0.3, score=-0.06076806000671347, total=   6.8s
[CV] learning_rate=0.9, n_estimators=200, subsample=0.3 ..............
[CV]  learning_rate=0.9, n_estimators=200, subsample=0.3, score=-0.06252745885815401, total=   7.6s
[CV] learning_rate=0.9, n_estimators=200, subsample=0.5 ..............
[CV]  learning_rate=0.9, n_estimators=200, subsample=0.5, score=-0.046506480924872785, total=   8.9s
[CV] learning_rate=0.9, n_estimators=200, subsample=0.5 ..............
[CV]  learning_rate=0.9, n_estimators=200, subsample=0.5, score=-0.048291204061816376, total=   9.0s
[CV] learning_rate=0.9, n_estimators=200, subsample=0.5 ..............
[CV]  learning_rate=0.9, n_estimators=200, subsample=0.5, score=-0.04779

[Parallel(n_jobs=1)]: Done 288 out of 288 | elapsed: 94.9min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'learning_rate': [0.01, 0.09, 0.1, 0.2, 0.5, 0.9], 'n_estimators': [200, 300, 400, 500], 'subsample': [0.3, 0.5, 0.9, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=3)

In [36]:
# Parameter combination with the best cross-validated score.
grid_mse.best_params_

{'learning_rate': 0.09, 'n_estimators': 200, 'subsample': 1}

In [37]:
# Predict the test split through the fitted search object.
# The search was created with refit=True (see its repr), so this uses the best estimator.
preds = grid_mse.predict(X_test)

In [38]:
# Score the grid-search model on the test split.
# NOTE: this overwrites the `mse`, `rmse` and `mae` values from earlier metrics cells.
mse = mean_squared_error(y_test, preds)    # MSE : Mean Squared Error
rmse = math.sqrt(mse)                      # RMSE: Root Mean Squared Error (square root of the MSE above)
mae = mean_absolute_error(y_test, preds)   # MAE : Mean Absolute Error

for metric_label, metric_value in (
    ('Mean Squared Error (MSE):      ', mse),
    ('Root Mean Squared Error (RMSE):', rmse),
    ('Mean Absolute Error (MAE):     ', mae),
):
    print(metric_label, metric_value)

Mean Squared Error (MSE):       0.037275719683514515
Root Mean Squared Error (RMSE): 0.19306920956878265
Mean Absolute Error (MAE):      0.13301938358256804


In [None]:
# For the parameter grid:
#     'learning_rate': [0.01,0.1,0.5,0.9],
#     'n_estimators': [200],
#     'subsample': [0.3, 0.5, 0.9]
#
# Results (the run that produced these numbers is not shown in this notebook):
# Mean Squared Error (MSE):       0.03727120568638051
# Root Mean Squared Error (RMSE): 0.1930575191138136
# Mean Absolute Error (MAE):      0.13302424716098063

In [None]:
# For the parameter grid:
#     'learning_rate': [0.01,0.09,0.1,0.2,0.5,0.9],
#     'n_estimators': [200,300,400,500],
#     'subsample': [0.3, 0.5, 0.9, 1]
#
# Best parameters found by the grid search:
# {'learning_rate': 0.09, 'n_estimators': 200, 'subsample': 1}
#
# Results (test set):
# Mean Squared Error (MSE):       0.037275719683514515
# Root Mean Squared Error (RMSE): 0.19306920956878265
# Mean Absolute Error (MAE):      0.13301938358256804
    

In [None]:
# For the fixed parameters (the xgb_reg2 model above):
#     'learning_rate': 0.1,
#     'n_estimators': 300,
#     'subsample': 1
#
# Results (test set):
# Mean Squared Error (MSE):       0.03729544118492501
# Root Mean Squared Error (RMSE): 0.19312027647278524
# Mean Absolute Error (MAE):      0.13301010700640992
    

In [43]:
# Rebuild the model directly with the learning_rate and n_estimators chosen by
# the grid search; subsample stays at its default of 1, which matches best_params_.
gbm = xgb.XGBRegressor(learning_rate=0.09, n_estimators=200)

In [44]:
# Train the tuned configuration on the training split.
gbm.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.09, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=200,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [45]:
# Test-split predictions from the tuned model.
preds2 = gbm.predict(X_test)

In [46]:
# Score the tuned model on the test split.
# NOTE: this overwrites the `mse`, `rmse` and `mae` values from earlier metrics cells.
mse = mean_squared_error(y_test, preds2)    # MSE : Mean Squared Error
rmse = math.sqrt(mse)                       # RMSE: Root Mean Squared Error (square root of the MSE above)
mae = mean_absolute_error(y_test, preds2)   # MAE : Mean Absolute Error

for metric_label, metric_value in (
    ('Mean Squared Error (MSE):      ', mse),
    ('Root Mean Squared Error (RMSE):', rmse),
    ('Mean Absolute Error (MAE):     ', mae),
):
    print(metric_label, metric_value)

Mean Squared Error (MSE):       0.037275719683514515
Root Mean Squared Error (RMSE): 0.19306920956878265
Mean Absolute Error (MAE):      0.13301938358256804


In [None]:
#EXPERIMENT : RANDOMIZED SEARCH based on the params of Grid Search


In [None]:
# Candidate values for the randomized search: 20 evenly spaced points from
# 0.05 to 1.0 for both learning_rate and subsample, n_estimators fixed at 200.
#
# Two fixes relative to the pasted original:
#  * the stray `...:` IPython continuation prompts are removed (they made the
#    cell a SyntaxError);
#  * np.linspace with an explicit point count replaces float-step np.arange,
#    whose length and last value depend on floating-point rounding, so the
#    upper bound is exactly 1.0 and never overshoots.
# Rounding to 2 decimals drops float noise such as 0.15000000000000002.
gbm_param_grid = {
    'learning_rate': np.round(np.linspace(0.05, 1.0, 20), 2),
    'n_estimators': [200],
    'subsample': np.round(np.linspace(0.05, 1.0, 20), 2),
}

In [None]:
# Narrowed search space around the grid-search optimum (learning_rate 0.09,
# n_estimators 200): learning rates 0.07, 0.075, ..., 0.105 and estimator
# counts from 200 to 300 in steps of 25.
#
# np.linspace with an explicit point count replaces float-step np.arange,
# whose length and last value depend on floating-point rounding. The 8 points
# are the ones the half-open range 0.07 to 0.11 (step 0.005) is meant to
# produce, so 0.11 itself stays excluded.
gbm_param_grid = {
    'learning_rate': np.round(np.linspace(0.07, 0.105, 8), 3),
    'n_estimators': [200, 225, 250, 275, 300],
}

In [59]:
# Scratch check of float-step np.arange: the recorded output below includes 1.05
# even though the stop value is meant to be excluded, so a float step cannot be
# relied on for the upper bound of a parameter range.
np.arange(0.7,1.05,0.05)

array([0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95, 1.  , 1.05])