In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [2]:
gr_metadata = pd.read_csv('official_goodreads_metadata.csv')

In [3]:
am_metadata = pd.read_csv('official_amazon_metadata.csv')

In [4]:
gr_metadata.loc[gr_metadata['asin'] == '0205739415', 'total_text_reviews_count'] = 1
gr_metadata.loc[gr_metadata['asin'] == '0300084323', 'total_text_reviews_count'] = 1

In [5]:
am_gr_metadata = pd.merge(gr_metadata[['asin', 'average_rating', 'total_ratings_count', 'total_text_reviews_count', 
                                       'publication_year', 'publication_month', 'publication_day', 'format', 
                                       'cleaned_description', 'gr_countDes_before', 'gr_countDes_after', 'cleaned_genres',
                                       'gr_countText_before', 'gr_countText_after']], 
                          am_metadata, how='inner', on='asin')
am_gr_metadata

Unnamed: 0,asin,average_rating,total_ratings_count,total_text_reviews_count,publication_year,publication_month,publication_day,format,cleaned_description,gr_countDes_before,...,gr_countText_after,average,rating_count,text_reviews_count,genres,rank,verifiedTrue_count,Format,am_countText_before,am_countText_after
0,000100039X,4.23,220088,8847,2010.0,1.0,1.0,Paperback,tahsil vibrants masterpiece prophet one belove...,106.0,...,17834,4.64,1453,1453,"Literature & Fiction, Poetry",1810945,1130,,69909,31772
1,0001053655,4.08,676,85,1997.0,,,Hardcover,,,...,75,4.48,50,50,Humor & Entertainment,9799161,43,"Kindle Edition, Paperback, Hardcover, Audi...",4888,2240
2,0001061240,4.62,221,36,1959.0,12.0,1.0,Hardcover,,,...,18,4.87,45,45,"Childrens Books, Literature & Fiction",321557,30,Hardcover,3085,1326
3,000161102X,3.86,2929,75,,,,,snobby girl fashionable board school ridicule ...,47.0,...,61,4.35,17,17,Literature & Fiction,1542999,13,,788,399
4,0001711296,4.29,738,65,,,,,,,...,117,4.44,107,107,Literature & Fiction,2884610,69,"Library Binding, VHS Tape, Paperback, Hard...",5667,2574
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37228,0312953240,3.80,87,8,1995.0,7.0,15.0,Paperback,recount search trial serial killer target offr...,41.0,...,94,3.69,13,13,"Biographies & Memoirs, True Crime",443719,4,"Mass Market Paperback, Hardcover",2599,1216
37229,0312955138,3.38,29,4,1995.0,9.0,15.0,Paperback,jill coit voluptuous darkskinned beauty sultry...,156.0,...,52,3.58,12,12,"Biographies & Memoirs, True Crime",3470182,6,"Kindle Edition, Hardcover",1489,668
37230,0312955154,3.36,56,5,1995.0,10.0,15.0,Paperback,paleontologist cameron alone discover yearly m...,48.0,...,184,3.29,14,14,"Literature & Fiction, Fiction",3412599,4,"Kindle Edition, Paperback",1456,683
37231,0312956878,3.78,59,4,1995.0,10.0,15.0,Paperback,killer without redemption broad daylight backw...,187.0,...,76,3.33,15,15,"Biographies & Memoirs, True Crime",2606128,9,"Mass Market Paperback, Hardcover",968,450


In [6]:
am_gr_metadata = am_gr_metadata.rename(columns={'average_rating':'gr_average', 'total_ratings_count':'gr_ratings_count',
                                                'total_text_reviews_count':'gr_reviews_count', 
                                                'publication_year':'gr_pub_yr', 'publication_month':'gr_pub_mo', 
                                                'publication_day':'gr_pub_day', 'format':'gr_format',
                                                'cleaned_description':'gr_description', 'cleaned_genres':'gr_genres',
                                                'average':'am_average', 'rating_count':'am_ratings_count', 
                                                'text_reviews_count':'am_reviews_count', 'genres':'am_genres', 
                                                'rank':'am_rank', 'verifiedTrue_count':'am_verifiedTrue_count', 
                                                'Format':'am_format'})

In [7]:
am_gr_metadata['rating_diff'] = am_gr_metadata['am_average']- am_gr_metadata['gr_average']

In [8]:
am_gr_metadata['gr_genres']= am_gr_metadata['gr_genres'].str.replace(' ','')

In [9]:
am_gr_metadata_numeric = am_gr_metadata[['asin', 'gr_ratings_count', 'gr_reviews_count', 'gr_countText_before', 
                                         'gr_countText_after', 'am_ratings_count', 'am_reviews_count', 'am_rank', 
                                         'am_verifiedTrue_count', 'am_countText_before', 'am_countText_after', 
                                         'rating_diff']]

In [10]:
am_gr_metadata_numeric_all = am_gr_metadata[['asin', 'gr_ratings_count', 'gr_reviews_count', 'gr_pub_yr', 'gr_pub_mo', 
                                             'gr_pub_day', 'gr_countDes_before', 'gr_countDes_after', 
                                             'gr_countText_before', 'gr_countText_after', 'am_ratings_count', 
                                             'am_reviews_count', 'am_rank', 'am_verifiedTrue_count',
                                             'am_countText_before', 'am_countText_after', 'rating_diff']]

In [11]:
am_gr_metadata_numeric_no_null = am_gr_metadata[['asin', 'gr_ratings_count', 'gr_reviews_count', 'gr_pub_yr', 'gr_pub_mo', 
                                                 'gr_pub_day', 'gr_countDes_before', 'gr_countDes_after', 
                                                 'gr_countText_before', 'gr_countText_after', 'am_ratings_count', 
                                                 'am_reviews_count', 'am_rank', 'am_verifiedTrue_count',
                                                 'am_countText_before', 'am_countText_after', 'rating_diff']]

In [12]:
am_gr_metadata_numeric_no_null = am_gr_metadata_numeric_no_null.dropna()

In [13]:
df = pd.get_dummies(am_gr_metadata['gr_genres'].str.get_dummies(sep=','))
df = pd.concat([df, am_gr_metadata['asin']], axis=1)
df = pd.merge(df, am_gr_metadata_numeric, how='right', on='asin')
df

Unnamed: 0,biography,children,comics,crime,fantasy,fiction,graphic,historicalfiction,history,mystery,...,gr_reviews_count,gr_countText_before,gr_countText_after,am_ratings_count,am_reviews_count,am_rank,am_verifiedTrue_count,am_countText_before,am_countText_after,rating_diff
0,0,0,0,0,0,1,0,0,0,0,...,8847,42320,17834,1453,1453,1810945,1130,69909,31772,0.41
1,1,0,1,0,0,1,1,1,1,0,...,85,158,75,50,50,9799161,43,4888,2240,0.40
2,0,1,0,0,0,0,0,0,0,0,...,36,49,18,45,45,321557,30,3085,1326,0.25
3,1,1,0,0,0,1,0,1,1,0,...,75,130,61,17,17,1542999,13,788,399,0.49
4,0,1,0,0,1,1,0,0,0,0,...,65,257,117,107,107,2884610,69,5667,2574,0.15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37228,0,0,0,1,0,0,0,0,0,1,...,8,219,94,13,13,443719,4,2599,1216,-0.11
37229,0,0,0,1,0,0,0,0,0,1,...,4,125,52,12,12,3470182,6,1489,668,0.20
37230,0,0,0,1,0,1,0,0,0,1,...,5,362,184,14,14,3412599,4,1456,683,-0.07
37231,0,0,0,1,0,0,0,0,0,1,...,4,152,76,15,15,2606128,9,968,450,-0.45


In [14]:
y = np.array(df['rating_diff'])
x = df.drop('asin', axis=1).drop('rating_diff', axis=1)

# Saving feature names for later use
features = list(x.columns)

x = np.array(x)

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 0)


## TRAIN MODEL

In [15]:
rf = RandomForestRegressor(n_estimators = 1000, random_state = 0)


# Train the model on training data
rf.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=1000,
                      n_jobs=None, oob_score=False, random_state=0, verbose=0,
                      warm_start=False)

In [16]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
#max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_leaf_nodes = [int(x) for x in np.linspace(start = 1000, stop = 10000, num = 10)]
max_depth = [30,40,50]
#max_depth.append(None)
# Minimum number of samples required to split a node
#min_samples_split = [2, 5, 10]
min_samples_split = [int(x) for x in np.linspace(start = 5, stop = 50, num = 10)]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2,3,4,5,6,7,8,9,10]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_leaf_nodes':max_leaf_nodes,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [100, 311, 522, 733, 944, 1155, 1366, 1577, 1788, 2000], 'max_features': ['auto', 'sqrt'], 'max_leaf_nodes': [1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000], 'max_depth': [30, 40, 50], 'min_samples_split': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50], 'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'bootstrap': [True, False]}


In [17]:
ran = RandomForestRegressor()
rg = RandomizedSearchCV(estimator = ran, param_distributions = random_grid, n_iter = 10, cv = 3, verbose=2, 
                            random_state=42)

In [18]:
rg.fit(x_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] n_estimators=1155, min_samples_split=50, min_samples_leaf=8, max_leaf_nodes=6000, max_features=sqrt, max_depth=30, bootstrap=True 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  n_estimators=1155, min_samples_split=50, min_samples_leaf=8, max_leaf_nodes=6000, max_features=sqrt, max_depth=30, bootstrap=True, total=  32.7s
[CV] n_estimators=1155, min_samples_split=50, min_samples_leaf=8, max_leaf_nodes=6000, max_features=sqrt, max_depth=30, bootstrap=True 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   32.7s remaining:    0.0s


[CV]  n_estimators=1155, min_samples_split=50, min_samples_leaf=8, max_leaf_nodes=6000, max_features=sqrt, max_depth=30, bootstrap=True, total=  32.9s
[CV] n_estimators=1155, min_samples_split=50, min_samples_leaf=8, max_leaf_nodes=6000, max_features=sqrt, max_depth=30, bootstrap=True 
[CV]  n_estimators=1155, min_samples_split=50, min_samples_leaf=8, max_leaf_nodes=6000, max_features=sqrt, max_depth=30, bootstrap=True, total=  33.3s
[CV] n_estimators=100, min_samples_split=35, min_samples_leaf=9, max_leaf_nodes=1000, max_features=auto, max_depth=30, bootstrap=True 
[CV]  n_estimators=100, min_samples_split=35, min_samples_leaf=9, max_leaf_nodes=1000, max_features=auto, max_depth=30, bootstrap=True, total=  13.1s
[CV] n_estimators=100, min_samples_split=35, min_samples_leaf=9, max_leaf_nodes=1000, max_features=auto, max_depth=30, bootstrap=True 
[CV]  n_estimators=100, min_samples_split=35, min_samples_leaf=9, max_leaf_nodes=1000, max_features=auto, max_depth=30, bootstrap=True, total=

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed: 48.4min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators='warn',
                                                   n_jobs=None, oob_score=False,
                                                   random_sta...


In [19]:
rg.best_params_

{'n_estimators': 1155,
 'min_samples_split': 35,
 'min_samples_leaf': 3,
 'max_leaf_nodes': 7000,
 'max_features': 'auto',
 'max_depth': 30,
 'bootstrap': True}

In [15]:
rf = RandomForestRegressor(n_estimators = 1155, min_samples_split = 35, min_samples_leaf = 3, max_leaf_nodes = 7000,
                           max_features = 'auto', max_depth = 30, bootstrap = True)
rf.fit(x_train, y_train)
 

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=30,
                      max_features='auto', max_leaf_nodes=7000,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=3, min_samples_split=35,
                      min_weight_fraction_leaf=0.0, n_estimators=1155,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [37]:
predictions = rf.predict(x_train)
print('Mean Absolute Error:', mean_absolute_error(y_train, predictions))
print('Mean Squared Error:', mean_squared_error(y_train, predictions))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_train, predictions)))
print('R2 Score:', rf.score(x_train, y_train))

# Use the forest's predict method on the test data
predictions = rf.predict(x_test)
print('Mean Absolute Error:', mean_absolute_error(y_test, predictions))
print('Mean Squared Error:', mean_squared_error(y_test, predictions))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, predictions)))
print('R2 Score:', rf.score(x_test, y_test))

Mean Absolute Error: 0.1783616199895895
Mean Squared Error: 0.06046753773668129
Root Mean Squared Error: 0.24590147973666462
R2 Score: 0.46158431302684333
Mean Absolute Error: 0.22629821303037823
Mean Squared Error: 0.09633397718874938
Root Mean Squared Error: 0.3103771531359056
R2 Score: 0.14564155263246403


In [16]:
predictions = rf.predict(x_train)
print('Mean Absolute Error:', mean_absolute_error(y_train, predictions))
print('Mean Squared Error:', mean_squared_error(y_train, predictions))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_train, predictions)))
r2 = rf.score(x_train, y_train)
n = x_train.shape[0]
p = x_train.shape[1]
adjusted_r2 = 1-(1-r2)*(n-1)/(n-p-1)
print('R2 adjusted Score:',adjusted_r2 )

# Use the forest's predict method on the test data
predictions = rf.predict(x_test)
print('Mean Absolute Error:', mean_absolute_error(y_test, predictions))
print('Mean Squared Error:', mean_squared_error(y_test, predictions))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, predictions)))
r2 = rf.score(x_test, y_test)
n = x_test.shape[0]
p = x_test.shape[1]
adjusted_r2 = 1-(1-r2)*(n-1)/(n-p-1)
print('R2 adjusted Score:', adjusted_r2)

Mean Absolute Error: 0.17895624868700255
Mean Squared Error: 0.0608758948538634
Root Mean Squared Error: 0.24673040926051942
R2 adjusted Score: 0.4574430230934279
Mean Absolute Error: 0.22659475012656033
Mean Squared Error: 0.09682168230542396
Root Mean Squared Error: 0.3111618265556107
R2 adjusted Score: 0.13891095901235384
