In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Load Data

In [2]:
gr = pd.read_csv('official_goodreads_metadata.csv')
am = pd.read_csv('official_amazon_metadata.csv')
am_gr = pd.merge(gr[['asin', 'average_rating', 'total_ratings_count', 'total_reviews_count', 'total_text_reviews_count',
                    'publication_year', 'publication_month', 'publication_day', 'num_pages', 'format', 'gr_countDes_before',
                    'gr_countDes_after', 'cleaned_genres', 'gr_countText_before', 'gr_countText_after']],
                 am[['asin', 'average', 'rating_count', 'text_reviews_count', 'rank', 'verifiedTrue_count', 'Format',
                    'am_countText_before', 'am_countText_after']], how='inner', on='asin')
am_gr = am_gr.rename(columns={'average_rating':'gr_rating', 'total_ratings_count':'gr_ratings_count', 
                              'total_reviews_count':'gr_reviews_count', 'total_text_reviews_count':'gr_text_reviews_count',
                              'publication_year':'gr_pub_yr', 'publication_month':'gr_pub_mo', 'publication_day':'gr_pub_day',
                              'num_pages':'gr_num_pages', 'format':'gr_format', 'cleaned_genres':'gr_genres', 
                              'average':'am_rating', 'rating_count':'am_ratings_count', 
                              'text_reviews_count':'am_text_reviews_count', 'rank':'am_rank',
                              'verifiedTrue_count':'am_verifiedTrue_count', 'Format':'am_format'})
am_gr['rating_diff'] = am_gr['gr_rating'] - am_gr['am_rating']
am_gr['ratings_count'] = am_gr['gr_ratings_count'] + am_gr['am_ratings_count']
am_gr['text_reviews_count'] = am_gr['gr_text_reviews_count'] + am_gr['am_text_reviews_count']
am_gr = am_gr.drop('gr_ratings_count', axis=1)
am_gr = am_gr.drop('gr_reviews_count', axis=1)
am_gr = am_gr.drop('gr_text_reviews_count', axis=1)
am_gr = am_gr.drop('am_ratings_count', axis=1)
am_gr = am_gr.drop('am_text_reviews_count', axis=1)
am_gr = am_gr.drop('gr_rating', axis=1)
am_gr = am_gr.drop('am_rating', axis=1)
am_gr

Unnamed: 0,asin,gr_pub_yr,gr_pub_mo,gr_pub_day,gr_num_pages,gr_format,gr_countDes_before,gr_countDes_after,gr_genres,gr_countText_before,gr_countText_after,am_rank,am_verifiedTrue_count,am_format,am_countText_before,am_countText_after,rating_diff,ratings_count,text_reviews_count
0,000100039X,2010.0,1.0,1.0,127.0,Paperback,106.0,66.0,"poetry, fiction, non-fiction",42320,17834,1810945,1130,,69909,31772,-0.41,221541,10300
1,0001053655,1997.0,,,268.0,Hardcover,,,"history, historical fiction, biography, non-fi...",158,75,9799161,43,"Kindle Edition, Paperback, Hardcover, Audi...",4888,2240,-0.40,726,135
2,0001061240,1959.0,12.0,1.0,324.0,Hardcover,,,"poetry, children",49,18,321557,30,Hardcover,3085,1326,-0.25,266,81
3,000161102X,,,,190.0,,47.0,25.0,"children, fiction, young-adult, history, histo...",130,61,1542999,13,,788,399,-0.49,2946,92
4,0001711296,,,,63.0,,,,"children, fiction, poetry, fantasy, paranormal",257,117,2884610,69,"Library Binding, VHS Tape, Paperback, Hard...",5667,2574,-0.15,845,172
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37228,0312953240,1995.0,7.0,15.0,570.0,Paperback,41.0,20.0,"mystery, thriller, crime, non-fiction",219,94,443719,4,"Mass Market Paperback, Hardcover",2599,1216,0.11,100,21
37229,0312955138,1995.0,9.0,15.0,320.0,Paperback,156.0,82.0,"mystery, thriller, crime, non-fiction",125,52,3470182,6,"Kindle Edition, Hardcover",1489,668,-0.20,41,16
37230,0312955154,1995.0,10.0,15.0,,Paperback,48.0,33.0,"mystery, thriller, crime, fiction",362,184,3412599,4,"Kindle Edition, Paperback",1456,683,0.07,70,19
37231,0312956878,1995.0,10.0,15.0,608.0,Paperback,187.0,103.0,"mystery, thriller, crime, non-fiction",152,76,2606128,9,"Mass Market Paperback, Hardcover",968,450,0.45,74,19


# Numeric Features

In [3]:
am_gr.isnull().sum()

asin                        0
gr_pub_yr                1445
gr_pub_mo                2012
gr_pub_day               2212
gr_num_pages             1630
gr_format                1535
gr_countDes_before       1027
gr_countDes_after        1748
gr_genres                 199
gr_countText_before         0
gr_countText_after          0
am_rank                     0
am_verifiedTrue_count       0
am_format                  49
am_countText_before         0
am_countText_after          0
rating_diff                 0
ratings_count               0
text_reviews_count          0
dtype: int64

In [4]:
X =  am_gr[['gr_countText_before', 'gr_countText_after', 'am_rank', 'am_verifiedTrue_count', 'am_countText_before',
            'am_countText_after', 'ratings_count', 'text_reviews_count']]
Y = am_gr['rating_diff']
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=.25, random_state=0)
# normalize features
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
# fit model no training data
model = MLPRegressor(hidden_layer_sizes=(14,14,14),max_iter=500)
model.fit(x_train, y_train)
print(model)
# make predictions for train data
y_pred = model.predict(x_train)
print('TRAIN DATA:')
print('Mean Absolute Error:', mean_absolute_error(y_train, y_pred))
print('Mean Squared Error:', mean_squared_error(y_train, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_train, y_pred)))
print('R2 Score:', r2_score(y_train, y_pred))
# make predictions for test data
y_pred = model.predict(x_test)
print('TEST DATA:')
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R2 Score:', r2_score(y_test, y_pred))

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(14, 14, 14), learning_rate='constant',
             learning_rate_init=0.001, max_iter=500, momentum=0.9,
             n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
             random_state=None, shuffle=True, solver='adam', tol=0.0001,
             validation_fraction=0.1, verbose=False, warm_start=False)
TRAIN DATA:
Mean Absolute Error: 0.23786310198771948
Mean Squared Error: 0.1045902287749011
Root Mean Squared Error: 0.32340412609442865
R2 Score: 0.06870658233604654
TEST DATA:
Mean Absolute Error: 0.24009930378277686
Mean Squared Error: 0.10611681199561204
Root Mean Squared Error: 0.32575575512277916
R2 Score: 0.05888039317085303


### All numeric features w/ null rows removed

In [5]:
am_gr_numeric_all = am_gr[['asin', 'gr_pub_yr', 'gr_pub_mo', 'gr_pub_day', 'gr_countDes_before', 'gr_countDes_after', 
                           'gr_countText_before', 'gr_countText_after', 'am_rank', 'am_verifiedTrue_count', 
                           'am_countText_before', 'am_countText_after', 'rating_diff', 'ratings_count', 'text_reviews_count']]

In [6]:
am_gr_numeric_no_null = am_gr_numeric_all.dropna()

In [7]:
X =  am_gr_numeric_no_null.drop('asin', axis=1).drop('rating_diff', axis=1)
Y = am_gr_numeric_no_null['rating_diff']
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=.25, random_state=0)
# normalize features
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
# fit model no training data
model = MLPRegressor(hidden_layer_sizes=(14,14,14),max_iter=500)
model.fit(x_train, y_train)
print(model)
# make predictions for train data
y_pred = model.predict(x_train)
print('TRAIN DATA:')
print('Mean Absolute Error:', mean_absolute_error(y_train, y_pred))
print('Mean Squared Error:', mean_squared_error(y_train, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_train, y_pred)))
print('R2 Score:', r2_score(y_train, y_pred))
# make predictions for test data
y_pred = model.predict(x_test)
print('TEST DATA:')
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R2 Score:', r2_score(y_test, y_pred))

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(14, 14, 14), learning_rate='constant',
             learning_rate_init=0.001, max_iter=500, momentum=0.9,
             n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
             random_state=None, shuffle=True, solver='adam', tol=0.0001,
             validation_fraction=0.1, verbose=False, warm_start=False)
TRAIN DATA:
Mean Absolute Error: 0.23645375430125162
Mean Squared Error: 0.10254021322182508
Root Mean Squared Error: 0.32021900821441734
R2 Score: 0.0815204910077223
TEST DATA:
Mean Absolute Error: 0.2374347419066162
Mean Squared Error: 0.10499411636769387
Root Mean Squared Error: 0.3240279561514621
R2 Score: 0.07273423383488054


Verdict: Not much better than the last one

### All numeric features w/ null values replaced by mean

In [8]:
am_gr_numeric_mean = am_gr_numeric_all

In [9]:
am_gr_numeric_mean.isnull().sum()

asin                        0
gr_pub_yr                1445
gr_pub_mo                2012
gr_pub_day               2212
gr_countDes_before       1027
gr_countDes_after        1748
gr_countText_before         0
gr_countText_after          0
am_rank                     0
am_verifiedTrue_count       0
am_countText_before         0
am_countText_after          0
rating_diff                 0
ratings_count               0
text_reviews_count          0
dtype: int64

In [10]:
# Fill null values with mean
am_gr_numeric_mean['gr_pub_yr'].fillna(am_gr_numeric_mean['gr_pub_yr'].mean(), inplace=True)
am_gr_numeric_mean['gr_pub_mo'].fillna(am_gr_numeric_mean['gr_pub_mo'].mean(), inplace=True)
am_gr_numeric_mean['gr_pub_day'].fillna(am_gr_numeric_mean['gr_pub_day'].mean(), inplace=True)
am_gr_numeric_mean['gr_countDes_before'].fillna(am_gr_numeric_mean['gr_countDes_before'].mean(), inplace=True)
# If gr_countDes_after value is null, copy the value from gr_countDes_before
am_gr_numeric_mean['gr_countDes_after'] = np.where(am_gr_numeric_mean['gr_countDes_after'].isnull(), am_gr_numeric_mean['gr_countDes_before'], am_gr_numeric_mean['gr_countDes_after'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [11]:
am_gr_numeric_mean.isnull().sum()

asin                     0
gr_pub_yr                0
gr_pub_mo                0
gr_pub_day               0
gr_countDes_before       0
gr_countDes_after        0
gr_countText_before      0
gr_countText_after       0
am_rank                  0
am_verifiedTrue_count    0
am_countText_before      0
am_countText_after       0
rating_diff              0
ratings_count            0
text_reviews_count       0
dtype: int64

In [12]:
X =  am_gr_numeric_mean.drop('asin', axis=1).drop('rating_diff', axis=1)
Y = am_gr_numeric_mean['rating_diff']
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=.25, random_state=0)
# normalize features
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
# fit model no training data
model = MLPRegressor(hidden_layer_sizes=(14,14,14),max_iter=500)
model.fit(x_train, y_train)
print(model)
# make predictions for train data
y_pred = model.predict(x_train)
print('TRAIN DATA:')
print('Mean Absolute Error:', mean_absolute_error(y_train, y_pred))
print('Mean Squared Error:', mean_squared_error(y_train, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_train, y_pred)))
print('R2 Score:', r2_score(y_train, y_pred))
# make predictions for test data
y_pred = model.predict(x_test)
print('TEST DATA:')
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R2 Score:', r2_score(y_test, y_pred))

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(14, 14, 14), learning_rate='constant',
             learning_rate_init=0.001, max_iter=500, momentum=0.9,
             n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
             random_state=None, shuffle=True, solver='adam', tol=0.0001,
             validation_fraction=0.1, verbose=False, warm_start=False)
TRAIN DATA:
Mean Absolute Error: 0.23613797208911066
Mean Squared Error: 0.10261757471601843
Root Mean Squared Error: 0.3203397801023445
R2 Score: 0.08627150940317596
TEST DATA:
Mean Absolute Error: 0.23946998499136268
Mean Squared Error: 0.1050815229979144
Root Mean Squared Error: 0.3241628032299733
R2 Score: 0.06806207471729864


Even better!!!

### All numeric features w/ null values replaced by median

In [13]:
am_gr_numeric_median = am_gr_numeric_all

In [14]:
# Fill null values with mean
am_gr_numeric_median['gr_pub_yr'].fillna(am_gr_numeric_median['gr_pub_yr'].median(), inplace=True)
am_gr_numeric_median['gr_pub_mo'].fillna(am_gr_numeric_median['gr_pub_mo'].median(), inplace=True)
am_gr_numeric_median['gr_pub_day'].fillna(am_gr_numeric_median['gr_pub_day'].median(), inplace=True)
am_gr_numeric_median['gr_countDes_before'].fillna(am_gr_numeric_median['gr_countDes_before'].median(), inplace=True)
# If gr_countDes_after value is null, copy the value from gr_countDes_before
am_gr_numeric_median['gr_countDes_after'] = np.where(am_gr_numeric_median['gr_countDes_after'].isnull(), am_gr_numeric_median['gr_countDes_before'], am_gr_numeric_median['gr_countDes_after'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [15]:
X =  am_gr_numeric_median.drop('asin', axis=1).drop('rating_diff', axis=1)
Y = am_gr_numeric_median['rating_diff']
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=.25, random_state=0)
# normalize features
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
# fit model no training data
model = MLPRegressor(hidden_layer_sizes=(14,14,14),max_iter=500)
model.fit(x_train, y_train)
print(model)
# make predictions for train data
y_pred = model.predict(x_train)
print('TRAIN DATA:')
print('Mean Absolute Error:', mean_absolute_error(y_train, y_pred))
print('Mean Squared Error:', mean_squared_error(y_train, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_train, y_pred)))
print('R2 Score:', r2_score(y_train, y_pred))
# make predictions for test data
y_pred = model.predict(x_test)
print('TEST DATA:')
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R2 Score:', r2_score(y_test, y_pred))

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(14, 14, 14), learning_rate='constant',
             learning_rate_init=0.001, max_iter=500, momentum=0.9,
             n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
             random_state=None, shuffle=True, solver='adam', tol=0.0001,
             validation_fraction=0.1, verbose=False, warm_start=False)
TRAIN DATA:
Mean Absolute Error: 0.23757282787773895
Mean Squared Error: 0.10329496598234364
Root Mean Squared Error: 0.32139534219142574
R2 Score: 0.08023987494839868
TEST DATA:
Mean Absolute Error: 0.24081286042323347
Mean Squared Error: 0.10595214944481854
Root Mean Squared Error: 0.3255029177208993
R2 Score: 0.06034073816377161


Verdict: Not as good as replacing it with the mean, about the same as removing null rows.

# Adding Categorical Variables

### Adding gr_format

In [16]:
df = pd.get_dummies(am_gr['gr_format'].str.strip().str.lower())
df = pd.concat([df, am_gr['asin']], axis=1)
df = pd.merge(df, am_gr_numeric_mean, how='right', on='asin')
df

Unnamed: 0,audible audio,audio,audio cassette,audio cd,audio cd (unabridged),audiobook,b,big book,board book,boxed set - hardcover,...,gr_countDes_after,gr_countText_before,gr_countText_after,am_rank,am_verifiedTrue_count,am_countText_before,am_countText_after,rating_diff,ratings_count,text_reviews_count
0,0,0,0,0,0,0,0,0,0,0,...,66.000000,42320,17834,1810945,1130,69909,31772,-0.41,221541,10300
1,0,0,0,0,0,0,0,0,0,0,...,160.689085,158,75,9799161,43,4888,2240,-0.40,726,135
2,0,0,0,0,0,0,0,0,0,0,...,160.689085,49,18,321557,30,3085,1326,-0.25,266,81
3,0,0,0,0,0,0,0,0,0,0,...,25.000000,130,61,1542999,13,788,399,-0.49,2946,92
4,0,0,0,0,0,0,0,0,0,0,...,160.689085,257,117,2884610,69,5667,2574,-0.15,845,172
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37228,0,0,0,0,0,0,0,0,0,0,...,20.000000,219,94,443719,4,2599,1216,0.11,100,21
37229,0,0,0,0,0,0,0,0,0,0,...,82.000000,125,52,3470182,6,1489,668,-0.20,41,16
37230,0,0,0,0,0,0,0,0,0,0,...,33.000000,362,184,3412599,4,1456,683,0.07,70,19
37231,0,0,0,0,0,0,0,0,0,0,...,103.000000,152,76,2606128,9,968,450,0.45,74,19


In [17]:
X =  df.drop('asin', axis=1).drop('rating_diff', axis=1)
Y = df['rating_diff']
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=.25, random_state=0)
# normalize features
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
# fit model no training data
model = MLPRegressor(hidden_layer_sizes=(14,14,14),max_iter=500)
model.fit(x_train, y_train)
print(model)
# make predictions for train data
y_pred = model.predict(x_train)
print('TRAIN DATA:')
print('Mean Absolute Error:', mean_absolute_error(y_train, y_pred))
print('Mean Squared Error:', mean_squared_error(y_train, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_train, y_pred)))
print('R2 Score:', r2_score(y_train, y_pred))
# make predictions for test data
y_pred = model.predict(x_test)
print('TEST DATA:')
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R2 Score:', r2_score(y_test, y_pred))

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(14, 14, 14), learning_rate='constant',
             learning_rate_init=0.001, max_iter=500, momentum=0.9,
             n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
             random_state=None, shuffle=True, solver='adam', tol=0.0001,
             validation_fraction=0.1, verbose=False, warm_start=False)
TRAIN DATA:
Mean Absolute Error: 0.23774729390137228
Mean Squared Error: 0.10318032542033881
Root Mean Squared Error: 0.3212169444788659
R2 Score: 0.08126065864916066
TEST DATA:
Mean Absolute Error: 0.24165933919654553
Mean Squared Error: 0.1059840972019457
Root Mean Squared Error: 0.32555198847794753
R2 Score: 0.0600574026577273


Not as good

### Adding gr_genres

In [18]:
am_gr['gr_genres'] = am_gr['gr_genres'].str.replace(' ','')

In [19]:
df = pd.get_dummies(am_gr['gr_genres'].str.get_dummies(sep=','))
df = pd.concat([df, am_gr['asin']], axis=1)
df = pd.merge(df, am_gr_numeric_mean, how='right', on='asin')
df

Unnamed: 0,biography,children,comics,crime,fantasy,fiction,graphic,historicalfiction,history,mystery,...,gr_countDes_after,gr_countText_before,gr_countText_after,am_rank,am_verifiedTrue_count,am_countText_before,am_countText_after,rating_diff,ratings_count,text_reviews_count
0,0,0,0,0,0,1,0,0,0,0,...,66.000000,42320,17834,1810945,1130,69909,31772,-0.41,221541,10300
1,1,0,1,0,0,1,1,1,1,0,...,160.689085,158,75,9799161,43,4888,2240,-0.40,726,135
2,0,1,0,0,0,0,0,0,0,0,...,160.689085,49,18,321557,30,3085,1326,-0.25,266,81
3,1,1,0,0,0,1,0,1,1,0,...,25.000000,130,61,1542999,13,788,399,-0.49,2946,92
4,0,1,0,0,1,1,0,0,0,0,...,160.689085,257,117,2884610,69,5667,2574,-0.15,845,172
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37228,0,0,0,1,0,0,0,0,0,1,...,20.000000,219,94,443719,4,2599,1216,0.11,100,21
37229,0,0,0,1,0,0,0,0,0,1,...,82.000000,125,52,3470182,6,1489,668,-0.20,41,16
37230,0,0,0,1,0,1,0,0,0,1,...,33.000000,362,184,3412599,4,1456,683,0.07,70,19
37231,0,0,0,1,0,0,0,0,0,1,...,103.000000,152,76,2606128,9,968,450,0.45,74,19


In [20]:
df.columns

Index(['biography', 'children', 'comics', 'crime', 'fantasy', 'fiction',
       'graphic', 'historicalfiction', 'history', 'mystery', 'non-fiction',
       'paranormal', 'poetry', 'romance', 'thriller', 'young-adult', 'asin',
       'gr_pub_yr', 'gr_pub_mo', 'gr_pub_day', 'gr_countDes_before',
       'gr_countDes_after', 'gr_countText_before', 'gr_countText_after',
       'am_rank', 'am_verifiedTrue_count', 'am_countText_before',
       'am_countText_after', 'rating_diff', 'ratings_count',
       'text_reviews_count'],
      dtype='object')

In [21]:
X =  df.drop('asin', axis=1).drop('rating_diff', axis=1)
Y = df['rating_diff']
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=.25, random_state=0)
# normalize features
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
# fit model no training data
model = MLPRegressor(hidden_layer_sizes=(14,14,14),max_iter=500)
model.fit(x_train, y_train)
print(model)
# make predictions for train data
y_pred = model.predict(x_train)
print('TRAIN DATA:')
print('Mean Absolute Error:', mean_absolute_error(y_train, y_pred))
print('Mean Squared Error:', mean_squared_error(y_train, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_train, y_pred)))
print('R2 Score:', r2_score(y_train, y_pred))
# make predictions for test data
y_pred = model.predict(x_test)
print('TEST DATA:')
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R2 Score:', r2_score(y_test, y_pred))

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(14, 14, 14), learning_rate='constant',
             learning_rate_init=0.001, max_iter=500, momentum=0.9,
             n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
             random_state=None, shuffle=True, solver='adam', tol=0.0001,
             validation_fraction=0.1, verbose=False, warm_start=False)
TRAIN DATA:
Mean Absolute Error: 0.22962216629932292
Mean Squared Error: 0.0982336306814491
Root Mean Squared Error: 0.31342244763489596
R2 Score: 0.1253070700920088
TEST DATA:
Mean Absolute Error: 0.23635039109720085
Mean Squared Error: 0.10368137446814875
Root Mean Squared Error: 0.3219959230613778
R2 Score: 0.08047959093414425


Best one so far

### Adding am_format

In [22]:
am_gr['am_format'] = am_gr['am_format'].str.replace(' ','')

In [23]:
df = pd.get_dummies(am_gr['am_format'].str.get_dummies(sep=','))
df = pd.concat([df, am_gr['asin']], axis=1)
df = pd.merge(df, am_gr_numeric_mean, how='right', on='asin')
df

Unnamed: 0,Accessory,AmazonVideo,AudibleAudiobook,AudioCD,AudioCDLibraryBinding,AudioCassette,BargainBook,BathBook,Blu-ray,Boardbook,...,gr_countDes_after,gr_countText_before,gr_countText_after,am_rank,am_verifiedTrue_count,am_countText_before,am_countText_after,rating_diff,ratings_count,text_reviews_count
0,0,0,0,0,0,0,0,0,0,0,...,66.000000,42320,17834,1810945,1130,69909,31772,-0.41,221541,10300
1,0,0,0,0,0,1,0,0,0,0,...,160.689085,158,75,9799161,43,4888,2240,-0.40,726,135
2,0,0,0,0,0,0,0,0,0,0,...,160.689085,49,18,321557,30,3085,1326,-0.25,266,81
3,0,0,0,0,0,0,0,0,0,0,...,25.000000,130,61,1542999,13,788,399,-0.49,2946,92
4,0,0,0,0,0,0,0,0,0,0,...,160.689085,257,117,2884610,69,5667,2574,-0.15,845,172
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37228,0,0,0,0,0,0,0,0,0,0,...,20.000000,219,94,443719,4,2599,1216,0.11,100,21
37229,0,0,0,0,0,0,0,0,0,0,...,82.000000,125,52,3470182,6,1489,668,-0.20,41,16
37230,0,0,0,0,0,0,0,0,0,0,...,33.000000,362,184,3412599,4,1456,683,0.07,70,19
37231,0,0,0,0,0,0,0,0,0,0,...,103.000000,152,76,2606128,9,968,450,0.45,74,19


In [24]:
X =  df.drop('asin', axis=1).drop('rating_diff', axis=1)
Y = df['rating_diff']
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=.25, random_state=0)
# normalize features
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
# fit model no training data
model = MLPRegressor(hidden_layer_sizes=(14,14,14),max_iter=500)
model.fit(x_train, y_train)
print(model)
# make predictions for train data
y_pred = model.predict(x_train)
print('TRAIN DATA:')
print('Mean Absolute Error:', mean_absolute_error(y_train, y_pred))
print('Mean Squared Error:', mean_squared_error(y_train, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_train, y_pred)))
print('R2 Score:', r2_score(y_train, y_pred))
# make predictions for test data
y_pred = model.predict(x_test)
print('TEST DATA:')
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R2 Score:', r2_score(y_test, y_pred))

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(14, 14, 14), learning_rate='constant',
             learning_rate_init=0.001, max_iter=500, momentum=0.9,
             n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
             random_state=None, shuffle=True, solver='adam', tol=0.0001,
             validation_fraction=0.1, verbose=False, warm_start=False)
TRAIN DATA:
Mean Absolute Error: 0.23337744133590396
Mean Squared Error: 0.10010175664045723
Root Mean Squared Error: 0.3163886164836801
R2 Score: 0.10867288323373381
TEST DATA:
Mean Absolute Error: 0.2413723116327046
Mean Squared Error: 0.10689032297862434
Root Mean Squared Error: 0.3269408554748463
R2 Score: 0.05202034584639459


Not worth

# Hyperparameter Tuning

In [25]:
am_gr['gr_genres'] = am_gr['gr_genres'].str.replace(' ','')

In [26]:
df = pd.get_dummies(am_gr['gr_genres'].str.get_dummies(sep=','))
df = pd.concat([df, am_gr['asin']], axis=1)
df = pd.merge(df, am_gr_numeric_mean, how='right', on='asin')
df

Unnamed: 0,biography,children,comics,crime,fantasy,fiction,graphic,historicalfiction,history,mystery,...,gr_countDes_after,gr_countText_before,gr_countText_after,am_rank,am_verifiedTrue_count,am_countText_before,am_countText_after,rating_diff,ratings_count,text_reviews_count
0,0,0,0,0,0,1,0,0,0,0,...,66.000000,42320,17834,1810945,1130,69909,31772,-0.41,221541,10300
1,1,0,1,0,0,1,1,1,1,0,...,160.689085,158,75,9799161,43,4888,2240,-0.40,726,135
2,0,1,0,0,0,0,0,0,0,0,...,160.689085,49,18,321557,30,3085,1326,-0.25,266,81
3,1,1,0,0,0,1,0,1,1,0,...,25.000000,130,61,1542999,13,788,399,-0.49,2946,92
4,0,1,0,0,1,1,0,0,0,0,...,160.689085,257,117,2884610,69,5667,2574,-0.15,845,172
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37228,0,0,0,1,0,0,0,0,0,1,...,20.000000,219,94,443719,4,2599,1216,0.11,100,21
37229,0,0,0,1,0,0,0,0,0,1,...,82.000000,125,52,3470182,6,1489,668,-0.20,41,16
37230,0,0,0,1,0,1,0,0,0,1,...,33.000000,362,184,3412599,4,1456,683,0.07,70,19
37231,0,0,0,1,0,0,0,0,0,1,...,103.000000,152,76,2606128,9,968,450,0.45,74,19


In [27]:
X =  df.drop('asin', axis=1).drop('rating_diff', axis=1)
Y = df['rating_diff']

### hidden_layer_sizes

Verdict: Keep it at 14

### activation

Verdict: activation = 'relu'

### solver

In [34]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=.25, random_state=0)
# normalize features
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
# fit model no training data
model = MLPRegressor(hidden_layer_sizes=(14,), solver='lbfgs', max_iter=500)
model.fit(x_train, y_train)
print(model)
# make predictions for train data
y_pred = model.predict(x_train)
print('TRAIN DATA:')
print('Mean Absolute Error:', mean_absolute_error(y_train, y_pred))
print('Mean Squared Error:', mean_squared_error(y_train, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_train, y_pred)))
print('R2 Score:', r2_score(y_train, y_pred))
# make predictions for test data
y_pred = model.predict(x_test)
print('TEST DATA:')
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R2 Score:', r2_score(y_test, y_pred))

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(14,), learning_rate='constant',
             learning_rate_init=0.001, max_iter=500, momentum=0.9,
             n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
             random_state=None, shuffle=True, solver='lbfgs', tol=0.0001,
             validation_fraction=0.1, verbose=False, warm_start=False)
TRAIN DATA:
Mean Absolute Error: 0.23015568819080812
Mean Squared Error: 0.09830625360992003
Root Mean Squared Error: 0.3135382809322014
R2 Score: 0.12466042024671575
TEST DATA:
Mean Absolute Error: 0.23440243835680682
Mean Squared Error: 0.1021820982113065
Root Mean Squared Error: 0.3196593471358322
R2 Score: 0.09377624256580075


Verdict: solver='lbfgs'

### alpha

In [37]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=.25, random_state=0)
# normalize features
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
# fit model no training data
model = MLPRegressor(hidden_layer_sizes=(14,), solver='lbfgs', alpha=0.001, max_iter=500)
model.fit(x_train, y_train)
print(model)
# make predictions for train data
y_pred = model.predict(x_train)
print('TRAIN DATA:')
print('Mean Absolute Error:', mean_absolute_error(y_train, y_pred))
print('Mean Squared Error:', mean_squared_error(y_train, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_train, y_pred)))
print('R2 Score:', r2_score(y_train, y_pred))
# make predictions for test data
y_pred = model.predict(x_test)
print('TEST DATA:')
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R2 Score:', r2_score(y_test, y_pred))

MLPRegressor(activation='relu', alpha=0.001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(14,), learning_rate='constant',
             learning_rate_init=0.001, max_iter=500, momentum=0.9,
             n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
             random_state=None, shuffle=True, solver='lbfgs', tol=0.0001,
             validation_fraction=0.1, verbose=False, warm_start=False)
TRAIN DATA:
Mean Absolute Error: 0.23001336115322873
Mean Squared Error: 0.09826593813590476
Root Mean Squared Error: 0.31347398318824604
R2 Score: 0.12501939771545445
TEST DATA:
Mean Absolute Error: 0.233364356318945
Mean Squared Error: 0.10167990231034822
Root Mean Squared Error: 0.318872862298359
R2 Score: 0.098230074149816


In [40]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=.25, random_state=0)
# normalize features
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
# fit model no training data
model = MLPRegressor(hidden_layer_sizes=(14,), solver='lbfgs', alpha=0.002, max_iter=500)
model.fit(x_train, y_train)
print(model)
# make predictions for train data
y_pred = model.predict(x_train)
print('TRAIN DATA:')
print('Mean Absolute Error:', mean_absolute_error(y_train, y_pred))
print('Mean Squared Error:', mean_squared_error(y_train, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_train, y_pred)))
print('R2 Score:', r2_score(y_train, y_pred))
# make predictions for test data
y_pred = model.predict(x_test)
print('TEST DATA:')
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R2 Score:', r2_score(y_test, y_pred))

MLPRegressor(activation='relu', alpha=0.002, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(14,), learning_rate='constant',
             learning_rate_init=0.001, max_iter=500, momentum=0.9,
             n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
             random_state=None, shuffle=True, solver='lbfgs', tol=0.0001,
             validation_fraction=0.1, verbose=False, warm_start=False)
TRAIN DATA:
Mean Absolute Error: 0.22774252705173897
Mean Squared Error: 0.09618239250606889
Root Mean Squared Error: 0.31013286266706547
R2 Score: 0.14357172667770146
TEST DATA:
Mean Absolute Error: 0.2316190806863135
Mean Squared Error: 0.09987613935056316
Root Mean Squared Error: 0.3160318644544616
R2 Score: 0.11422713112506788


Verdict: alpha=0.002

### learning_rate

Verdict: learning_rate = 'constant'

### max_iter

Verdict: keep it at 500

# Best Model

In [None]:
"""MLPRegressor(activation='relu', alpha=0.002, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(14,), learning_rate='constant',
             learning_rate_init=0.001, max_iter=500, momentum=0.9,
             n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
             random_state=None, shuffle=True, solver='lbfgs', tol=0.0001,
             validation_fraction=0.1, verbose=False, warm_start=False)
TRAIN DATA:
Mean Absolute Error: 0.22774252705173897
Mean Squared Error: 0.09618239250606889
Root Mean Squared Error: 0.31013286266706547
R2 Score: 0.14357172667770146
TEST DATA:
Mean Absolute Error: 0.2316190806863135
Mean Squared Error: 0.09987613935056316
Root Mean Squared Error: 0.3160318644544616
R2 Score: 0.11422713112506788"""