In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Load Data

In [2]:
gr = pd.read_csv('official_goodreads_metadata.csv')
am = pd.read_csv('official_amazon_metadata.csv')
am_gr = pd.merge(gr[['asin', 'average_rating', 'total_ratings_count', 'total_reviews_count', 'total_text_reviews_count',
                    'publication_year', 'publication_month', 'publication_day', 'num_pages', 'format', 'gr_countDes_before',
                    'gr_countDes_after', 'cleaned_genres', 'gr_countText_before', 'gr_countText_after']],
                 am[['asin', 'average', 'rating_count', 'text_reviews_count', 'rank', 'verifiedTrue_count', 'Format',
                    'am_countText_before', 'am_countText_after']], how='inner', on='asin')
am_gr = am_gr.rename(columns={'average_rating':'gr_rating', 'total_ratings_count':'gr_ratings_count', 
                              'total_reviews_count':'gr_reviews_count', 'total_text_reviews_count':'gr_text_reviews_count',
                              'publication_year':'gr_pub_yr', 'publication_month':'gr_pub_mo', 'publication_day':'gr_pub_day',
                              'num_pages':'gr_num_pages', 'format':'gr_format', 'cleaned_genres':'gr_genres', 
                              'average':'am_rating', 'rating_count':'am_ratings_count', 
                              'text_reviews_count':'am_text_reviews_count', 'rank':'am_rank',
                              'verifiedTrue_count':'am_verifiedTrue_count', 'Format':'am_format'})
am_gr['rating_diff'] = am_gr['gr_rating'] + am_gr['am_rating']
am_gr['ratings_count'] = am_gr['gr_ratings_count'] + am_gr['am_ratings_count']
am_gr['text_reviews_count'] = am_gr['gr_text_reviews_count'] + am_gr['am_text_reviews_count']
am_gr = am_gr.drop('gr_ratings_count', axis=1)
am_gr = am_gr.drop('gr_reviews_count', axis=1)
am_gr = am_gr.drop('gr_text_reviews_count', axis=1)
am_gr = am_gr.drop('am_ratings_count', axis=1)
am_gr = am_gr.drop('am_text_reviews_count', axis=1)
am_gr = am_gr.drop('gr_rating', axis=1)
am_gr = am_gr.drop('am_rating', axis=1)
am_gr

Unnamed: 0,asin,gr_pub_yr,gr_pub_mo,gr_pub_day,gr_num_pages,gr_format,gr_countDes_before,gr_countDes_after,gr_genres,gr_countText_before,gr_countText_after,am_rank,am_verifiedTrue_count,am_format,am_countText_before,am_countText_after,rating_diff,ratings_count,text_reviews_count
0,000100039X,2010.0,1.0,1.0,127.0,Paperback,106.0,66.0,"poetry, fiction, non-fiction",42320,17834,1810945,1130,,69909,31772,8.87,221541,10300
1,0001053655,1997.0,,,268.0,Hardcover,,,"history, historical fiction, biography, non-fi...",158,75,9799161,43,"Kindle Edition, Paperback, Hardcover, Audi...",4888,2240,8.56,726,135
2,0001061240,1959.0,12.0,1.0,324.0,Hardcover,,,"poetry, children",49,18,321557,30,Hardcover,3085,1326,9.49,266,81
3,000161102X,,,,190.0,,47.0,25.0,"children, fiction, young-adult, history, histo...",130,61,1542999,13,,788,399,8.21,2946,92
4,0001711296,,,,63.0,,,,"children, fiction, poetry, fantasy, paranormal",257,117,2884610,69,"Library Binding, VHS Tape, Paperback, Hard...",5667,2574,8.73,845,172
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37228,0312953240,1995.0,7.0,15.0,570.0,Paperback,41.0,20.0,"mystery, thriller, crime, non-fiction",219,94,443719,4,"Mass Market Paperback, Hardcover",2599,1216,7.49,100,21
37229,0312955138,1995.0,9.0,15.0,320.0,Paperback,156.0,82.0,"mystery, thriller, crime, non-fiction",125,52,3470182,6,"Kindle Edition, Hardcover",1489,668,6.96,41,16
37230,0312955154,1995.0,10.0,15.0,,Paperback,48.0,33.0,"mystery, thriller, crime, fiction",362,184,3412599,4,"Kindle Edition, Paperback",1456,683,6.65,70,19
37231,0312956878,1995.0,10.0,15.0,608.0,Paperback,187.0,103.0,"mystery, thriller, crime, non-fiction",152,76,2606128,9,"Mass Market Paperback, Hardcover",968,450,7.11,74,19


# Numeric Features

In [3]:
am_gr.isnull().sum()

asin                        0
gr_pub_yr                1445
gr_pub_mo                2012
gr_pub_day               2212
gr_num_pages             1630
gr_format                1535
gr_countDes_before       1027
gr_countDes_after        1748
gr_genres                 199
gr_countText_before         0
gr_countText_after          0
am_rank                     0
am_verifiedTrue_count       0
am_format                  49
am_countText_before         0
am_countText_after          0
rating_diff                 0
ratings_count               0
text_reviews_count          0
dtype: int64

In [4]:
X =  am_gr[['gr_countText_before', 'gr_countText_after', 'am_rank', 'am_verifiedTrue_count', 'am_countText_before',
            'am_countText_after', 'ratings_count', 'text_reviews_count']]
Y = am_gr['rating_diff']
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=.25, random_state=0)
# normalize features
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
# fit model no training data
model = MLPRegressor(hidden_layer_sizes=(14,14,14),max_iter=500)
model.fit(x_train, y_train)
print(model)
# make predictions for train data
y_pred = model.predict(x_train)
print('TRAIN DATA:')
print('Mean Absolute Error:', mean_absolute_error(y_train, y_pred))
print('Mean Squared Error:', mean_squared_error(y_train, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_train, y_pred)))
print('R2 Score:', r2_score(y_train, y_pred))
# make predictions for test data
y_pred = model.predict(x_test)
print('TEST DATA:')
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R2 Score:', r2_score(y_test, y_pred))

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(14, 14, 14), learning_rate='constant',
             learning_rate_init=0.001, max_iter=500, momentum=0.9,
             n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
             random_state=None, shuffle=True, solver='adam', tol=0.0001,
             validation_fraction=0.1, verbose=False, warm_start=False)
TRAIN DATA:
Mean Absolute Error: 0.47236834121742527
Mean Squared Error: 0.36660105055612346
Root Mean Squared Error: 0.605475887675243
R2 Score: 0.1513750809028812
TEST DATA:
Mean Absolute Error: 0.47598666685372454
Mean Squared Error: 0.368709117179065
Root Mean Squared Error: 0.6072142267594404
R2 Score: 0.15073862656603143


### All numeric features w/ null rows removed

In [5]:
am_gr_numeric_all = am_gr[['asin', 'gr_pub_yr', 'gr_pub_mo', 'gr_pub_day', 'gr_countDes_before', 'gr_countDes_after', 
                           'gr_countText_before', 'gr_countText_after', 'am_rank', 'am_verifiedTrue_count', 
                           'am_countText_before', 'am_countText_after', 'rating_diff', 'ratings_count', 'text_reviews_count']]

In [6]:
am_gr_numeric_no_null = am_gr_numeric_all.dropna()

In [7]:
X =  am_gr_numeric_no_null.drop('asin', axis=1).drop('rating_diff', axis=1)
Y = am_gr_numeric_no_null['rating_diff']
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=.25, random_state=0)
# normalize features
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
# fit model no training data
model = MLPRegressor(hidden_layer_sizes=(14,14,14),max_iter=500)
model.fit(x_train, y_train)
print(model)
# make predictions for train data
y_pred = model.predict(x_train)
print('TRAIN DATA:')
print('Mean Absolute Error:', mean_absolute_error(y_train, y_pred))
print('Mean Squared Error:', mean_squared_error(y_train, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_train, y_pred)))
print('R2 Score:', r2_score(y_train, y_pred))
# make predictions for test data
y_pred = model.predict(x_test)
print('TEST DATA:')
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R2 Score:', r2_score(y_test, y_pred))

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(14, 14, 14), learning_rate='constant',
             learning_rate_init=0.001, max_iter=500, momentum=0.9,
             n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
             random_state=None, shuffle=True, solver='adam', tol=0.0001,
             validation_fraction=0.1, verbose=False, warm_start=False)
TRAIN DATA:
Mean Absolute Error: 0.4376911783351524
Mean Squared Error: 0.32656510522739246
Root Mean Squared Error: 0.5714587519912461
R2 Score: 0.2513988771784017
TEST DATA:
Mean Absolute Error: 0.4400113807532631
Mean Squared Error: 0.3297203325089909
Root Mean Squared Error: 0.5742127937524476
R2 Score: 0.22771489992629357


Verdict: Better than the last one

### All numeric features w/ null values replaced by mean

In [8]:
am_gr_numeric_mean = am_gr_numeric_all

In [9]:
am_gr_numeric_mean.isnull().sum()

asin                        0
gr_pub_yr                1445
gr_pub_mo                2012
gr_pub_day               2212
gr_countDes_before       1027
gr_countDes_after        1748
gr_countText_before         0
gr_countText_after          0
am_rank                     0
am_verifiedTrue_count       0
am_countText_before         0
am_countText_after          0
rating_diff                 0
ratings_count               0
text_reviews_count          0
dtype: int64

In [10]:
# Fill null values with mean
am_gr_numeric_mean['gr_pub_yr'].fillna(am_gr_numeric_mean['gr_pub_yr'].mean(), inplace=True)
am_gr_numeric_mean['gr_pub_mo'].fillna(am_gr_numeric_mean['gr_pub_mo'].mean(), inplace=True)
am_gr_numeric_mean['gr_pub_day'].fillna(am_gr_numeric_mean['gr_pub_day'].mean(), inplace=True)
am_gr_numeric_mean['gr_countDes_before'].fillna(am_gr_numeric_mean['gr_countDes_before'].mean(), inplace=True)
# If gr_countDes_after value is null, copy the value from gr_countDes_before
am_gr_numeric_mean['gr_countDes_after'] = np.where(am_gr_numeric_mean['gr_countDes_after'].isnull(), am_gr_numeric_mean['gr_countDes_before'], am_gr_numeric_mean['gr_countDes_after'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [11]:
am_gr_numeric_mean.isnull().sum()

asin                     0
gr_pub_yr                0
gr_pub_mo                0
gr_pub_day               0
gr_countDes_before       0
gr_countDes_after        0
gr_countText_before      0
gr_countText_after       0
am_rank                  0
am_verifiedTrue_count    0
am_countText_before      0
am_countText_after       0
rating_diff              0
ratings_count            0
text_reviews_count       0
dtype: int64

In [12]:
X =  am_gr_numeric_mean.drop('asin', axis=1).drop('rating_diff', axis=1)
Y = am_gr_numeric_mean['rating_diff']
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=.25, random_state=0)
# normalize features
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
# fit model no training data
model = MLPRegressor(hidden_layer_sizes=(14,14,14),max_iter=500)
model.fit(x_train, y_train)
print(model)
# make predictions for train data
y_pred = model.predict(x_train)
print('TRAIN DATA:')
print('Mean Absolute Error:', mean_absolute_error(y_train, y_pred))
print('Mean Squared Error:', mean_squared_error(y_train, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_train, y_pred)))
print('R2 Score:', r2_score(y_train, y_pred))
# make predictions for test data
y_pred = model.predict(x_test)
print('TEST DATA:')
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R2 Score:', r2_score(y_test, y_pred))

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(14, 14, 14), learning_rate='constant',
             learning_rate_init=0.001, max_iter=500, momentum=0.9,
             n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
             random_state=None, shuffle=True, solver='adam', tol=0.0001,
             validation_fraction=0.1, verbose=False, warm_start=False)
TRAIN DATA:
Mean Absolute Error: 0.44031932388510436
Mean Squared Error: 0.32409429204702406
Root Mean Squared Error: 0.5692927999255076
R2 Score: 0.24977167427364377
TEST DATA:
Mean Absolute Error: 0.44584096256833994
Mean Squared Error: 0.3293298263673889
Root Mean Squared Error: 0.5738726569260718
R2 Score: 0.24144240643306913


Even better!!!

### All numeric features w/ null values replaced by median

In [13]:
am_gr_numeric_median = am_gr_numeric_all

In [14]:
# Fill null values with mean
am_gr_numeric_median['gr_pub_yr'].fillna(am_gr_numeric_median['gr_pub_yr'].median(), inplace=True)
am_gr_numeric_median['gr_pub_mo'].fillna(am_gr_numeric_median['gr_pub_mo'].median(), inplace=True)
am_gr_numeric_median['gr_pub_day'].fillna(am_gr_numeric_median['gr_pub_day'].median(), inplace=True)
am_gr_numeric_median['gr_countDes_before'].fillna(am_gr_numeric_median['gr_countDes_before'].median(), inplace=True)
# If gr_countDes_after value is null, copy the value from gr_countDes_before
am_gr_numeric_median['gr_countDes_after'] = np.where(am_gr_numeric_median['gr_countDes_after'].isnull(), am_gr_numeric_median['gr_countDes_before'], am_gr_numeric_median['gr_countDes_after'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [15]:
X =  am_gr_numeric_median.drop('asin', axis=1).drop('rating_diff', axis=1)
Y = am_gr_numeric_median['rating_diff']
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=.25, random_state=0)
# normalize features
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
# fit model no training data
model = MLPRegressor(hidden_layer_sizes=(14,14,14),max_iter=500)
model.fit(x_train, y_train)
print(model)
# make predictions for train data
y_pred = model.predict(x_train)
print('TRAIN DATA:')
print('Mean Absolute Error:', mean_absolute_error(y_train, y_pred))
print('Mean Squared Error:', mean_squared_error(y_train, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_train, y_pred)))
print('R2 Score:', r2_score(y_train, y_pred))
# make predictions for test data
y_pred = model.predict(x_test)
print('TEST DATA:')
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R2 Score:', r2_score(y_test, y_pred))

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(14, 14, 14), learning_rate='constant',
             learning_rate_init=0.001, max_iter=500, momentum=0.9,
             n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
             random_state=None, shuffle=True, solver='adam', tol=0.0001,
             validation_fraction=0.1, verbose=False, warm_start=False)
TRAIN DATA:
Mean Absolute Error: 0.4557854429336585
Mean Squared Error: 0.3495385538735696
Root Mean Squared Error: 0.5912178565246229
R2 Score: 0.19087213047451201
TEST DATA:
Mean Absolute Error: 0.4620511219966786
Mean Squared Error: 0.35456426288935544
Root Mean Squared Error: 0.5954529896552333
R2 Score: 0.18331899363969972


Verdict: Not as good as replacing it with the mean, about the same as removing null rows.

# Adding Categorical Variables

### Adding gr_format

In [16]:
df = pd.get_dummies(am_gr['gr_format'].str.strip().str.lower())
df = pd.concat([df, am_gr['asin']], axis=1)
df = pd.merge(df, am_gr_numeric_mean, how='right', on='asin')
df

Unnamed: 0,audible audio,audio,audio cassette,audio cd,audio cd (unabridged),audiobook,b,big book,board book,boxed set - hardcover,...,gr_countDes_after,gr_countText_before,gr_countText_after,am_rank,am_verifiedTrue_count,am_countText_before,am_countText_after,rating_diff,ratings_count,text_reviews_count
0,0,0,0,0,0,0,0,0,0,0,...,66.000000,42320,17834,1810945,1130,69909,31772,8.87,221541,10300
1,0,0,0,0,0,0,0,0,0,0,...,160.689085,158,75,9799161,43,4888,2240,8.56,726,135
2,0,0,0,0,0,0,0,0,0,0,...,160.689085,49,18,321557,30,3085,1326,9.49,266,81
3,0,0,0,0,0,0,0,0,0,0,...,25.000000,130,61,1542999,13,788,399,8.21,2946,92
4,0,0,0,0,0,0,0,0,0,0,...,160.689085,257,117,2884610,69,5667,2574,8.73,845,172
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37228,0,0,0,0,0,0,0,0,0,0,...,20.000000,219,94,443719,4,2599,1216,7.49,100,21
37229,0,0,0,0,0,0,0,0,0,0,...,82.000000,125,52,3470182,6,1489,668,6.96,41,16
37230,0,0,0,0,0,0,0,0,0,0,...,33.000000,362,184,3412599,4,1456,683,6.65,70,19
37231,0,0,0,0,0,0,0,0,0,0,...,103.000000,152,76,2606128,9,968,450,7.11,74,19


In [17]:
X =  df.drop('asin', axis=1).drop('rating_diff', axis=1)
Y = df['rating_diff']
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=.25, random_state=0)
# normalize features
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
# fit model no training data
model = MLPRegressor(hidden_layer_sizes=(14,14,14),max_iter=500)
model.fit(x_train, y_train)
print(model)
# make predictions for train data
y_pred = model.predict(x_train)
print('TRAIN DATA:')
print('Mean Absolute Error:', mean_absolute_error(y_train, y_pred))
print('Mean Squared Error:', mean_squared_error(y_train, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_train, y_pred)))
print('R2 Score:', r2_score(y_train, y_pred))
# make predictions for test data
y_pred = model.predict(x_test)
print('TEST DATA:')
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R2 Score:', r2_score(y_test, y_pred))

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(14, 14, 14), learning_rate='constant',
             learning_rate_init=0.001, max_iter=500, momentum=0.9,
             n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
             random_state=None, shuffle=True, solver='adam', tol=0.0001,
             validation_fraction=0.1, verbose=False, warm_start=False)
TRAIN DATA:
Mean Absolute Error: 0.4457066137322126
Mean Squared Error: 0.32712848023647234
Root Mean Squared Error: 0.5719514666791863
R2 Score: 0.2427479963466712
TEST DATA:
Mean Absolute Error: 0.45160004868559456
Mean Squared Error: 0.33613211911728885
Root Mean Squared Error: 0.5797690222125436
R2 Score: 0.2257744334589309


Slightly better than model w/ just numeric nulls replaced by mean, but not worth

### Adding gr_genres

In [18]:
am_gr['gr_genres'] = am_gr['gr_genres'].str.replace(' ','')

In [19]:
df = pd.get_dummies(am_gr['gr_genres'].str.get_dummies(sep=','))
df = pd.concat([df, am_gr['asin']], axis=1)
df = pd.merge(df, am_gr_numeric_mean, how='right', on='asin')
df

Unnamed: 0,biography,children,comics,crime,fantasy,fiction,graphic,historicalfiction,history,mystery,...,gr_countDes_after,gr_countText_before,gr_countText_after,am_rank,am_verifiedTrue_count,am_countText_before,am_countText_after,rating_diff,ratings_count,text_reviews_count
0,0,0,0,0,0,1,0,0,0,0,...,66.000000,42320,17834,1810945,1130,69909,31772,8.87,221541,10300
1,1,0,1,0,0,1,1,1,1,0,...,160.689085,158,75,9799161,43,4888,2240,8.56,726,135
2,0,1,0,0,0,0,0,0,0,0,...,160.689085,49,18,321557,30,3085,1326,9.49,266,81
3,1,1,0,0,0,1,0,1,1,0,...,25.000000,130,61,1542999,13,788,399,8.21,2946,92
4,0,1,0,0,1,1,0,0,0,0,...,160.689085,257,117,2884610,69,5667,2574,8.73,845,172
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37228,0,0,0,1,0,0,0,0,0,1,...,20.000000,219,94,443719,4,2599,1216,7.49,100,21
37229,0,0,0,1,0,0,0,0,0,1,...,82.000000,125,52,3470182,6,1489,668,6.96,41,16
37230,0,0,0,1,0,1,0,0,0,1,...,33.000000,362,184,3412599,4,1456,683,6.65,70,19
37231,0,0,0,1,0,0,0,0,0,1,...,103.000000,152,76,2606128,9,968,450,7.11,74,19


In [20]:
df.columns

Index(['biography', 'children', 'comics', 'crime', 'fantasy', 'fiction',
       'graphic', 'historicalfiction', 'history', 'mystery', 'non-fiction',
       'paranormal', 'poetry', 'romance', 'thriller', 'young-adult', 'asin',
       'gr_pub_yr', 'gr_pub_mo', 'gr_pub_day', 'gr_countDes_before',
       'gr_countDes_after', 'gr_countText_before', 'gr_countText_after',
       'am_rank', 'am_verifiedTrue_count', 'am_countText_before',
       'am_countText_after', 'rating_diff', 'ratings_count',
       'text_reviews_count'],
      dtype='object')

In [21]:
X =  df.drop('asin', axis=1).drop('rating_diff', axis=1)
Y = df['rating_diff']
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=.25, random_state=0)
# normalize features
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
# fit model no training data
model = MLPRegressor(hidden_layer_sizes=(14,14,14),max_iter=500)
model.fit(x_train, y_train)
print(model)
# make predictions for train data
y_pred = model.predict(x_train)
print('TRAIN DATA:')
print('Mean Absolute Error:', mean_absolute_error(y_train, y_pred))
print('Mean Squared Error:', mean_squared_error(y_train, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_train, y_pred)))
print('R2 Score:', r2_score(y_train, y_pred))
# make predictions for test data
y_pred = model.predict(x_test)
print('TEST DATA:')
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R2 Score:', r2_score(y_test, y_pred))

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(14, 14, 14), learning_rate='constant',
             learning_rate_init=0.001, max_iter=500, momentum=0.9,
             n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
             random_state=None, shuffle=True, solver='adam', tol=0.0001,
             validation_fraction=0.1, verbose=False, warm_start=False)
TRAIN DATA:
Mean Absolute Error: 0.43413935600212455
Mean Squared Error: 0.3159371273348903
Root Mean Squared Error: 0.5620828473943057
R2 Score: 0.26865425312440083
TEST DATA:
Mean Absolute Error: 0.4426198983839659
Mean Squared Error: 0.3234295751404464
Root Mean Squared Error: 0.5687086909309954
R2 Score: 0.2550326737390036


Slightly better still but worth?

### Adding am_format

In [22]:
am_gr['am_format'] = am_gr['am_format'].str.replace(' ','')

In [23]:
df = pd.get_dummies(am_gr['am_format'].str.get_dummies(sep=','))
df = pd.concat([df, am_gr['asin']], axis=1)
df = pd.merge(df, am_gr_numeric_mean, how='right', on='asin')
df

Unnamed: 0,Accessory,AmazonVideo,AudibleAudiobook,AudioCD,AudioCDLibraryBinding,AudioCassette,BargainBook,BathBook,Blu-ray,Boardbook,...,gr_countDes_after,gr_countText_before,gr_countText_after,am_rank,am_verifiedTrue_count,am_countText_before,am_countText_after,rating_diff,ratings_count,text_reviews_count
0,0,0,0,0,0,0,0,0,0,0,...,66.000000,42320,17834,1810945,1130,69909,31772,8.87,221541,10300
1,0,0,0,0,0,1,0,0,0,0,...,160.689085,158,75,9799161,43,4888,2240,8.56,726,135
2,0,0,0,0,0,0,0,0,0,0,...,160.689085,49,18,321557,30,3085,1326,9.49,266,81
3,0,0,0,0,0,0,0,0,0,0,...,25.000000,130,61,1542999,13,788,399,8.21,2946,92
4,0,0,0,0,0,0,0,0,0,0,...,160.689085,257,117,2884610,69,5667,2574,8.73,845,172
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37228,0,0,0,0,0,0,0,0,0,0,...,20.000000,219,94,443719,4,2599,1216,7.49,100,21
37229,0,0,0,0,0,0,0,0,0,0,...,82.000000,125,52,3470182,6,1489,668,6.96,41,16
37230,0,0,0,0,0,0,0,0,0,0,...,33.000000,362,184,3412599,4,1456,683,6.65,70,19
37231,0,0,0,0,0,0,0,0,0,0,...,103.000000,152,76,2606128,9,968,450,7.11,74,19


In [25]:
X =  df.drop('asin', axis=1).drop('rating_diff', axis=1)
Y = df['rating_diff']
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=.25, random_state=0)
# normalize features
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
# fit model no training data
model = MLPRegressor(hidden_layer_sizes=(14,14,14),max_iter=500)
model.fit(x_train, y_train)
print(model)
# make predictions for train data
y_pred = model.predict(x_train)
print('TRAIN DATA:')
print('Mean Absolute Error:', mean_absolute_error(y_train, y_pred))
print('Mean Squared Error:', mean_squared_error(y_train, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_train, y_pred)))
print('R2 Score:', r2_score(y_train, y_pred))
# make predictions for test data
y_pred = model.predict(x_test)
print('TEST DATA:')
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R2 Score:', r2_score(y_test, y_pred))

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(14, 14, 14), learning_rate='constant',
             learning_rate_init=0.001, max_iter=500, momentum=0.9,
             n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
             random_state=None, shuffle=True, solver='adam', tol=0.0001,
             validation_fraction=0.1, verbose=False, warm_start=False)
TRAIN DATA:
Mean Absolute Error: 0.4346042554378653
Mean Squared Error: 0.3211984920931243
Root Mean Squared Error: 0.5667437622886768
R2 Score: 0.256475004768392
TEST DATA:
Mean Absolute Error: 0.4508439863643207
Mean Squared Error: 0.35570227508048485
Root Mean Squared Error: 0.5964078093724837
R2 Score: 0.1806977679867583


Not worth

# Hyperparameter Tuning

In [26]:
am_gr['gr_genres'] = am_gr['gr_genres'].str.replace(' ','')

In [33]:
df = pd.get_dummies(am_gr['gr_genres'].str.get_dummies(sep=','))
df = pd.concat([df, am_gr['asin']], axis=1)
df = pd.merge(df, am_gr_numeric_mean, how='right', on='asin')
df

Unnamed: 0,biography,children,comics,crime,fantasy,fiction,graphic,historicalfiction,history,mystery,...,gr_countDes_after,gr_countText_before,gr_countText_after,am_rank,am_verifiedTrue_count,am_countText_before,am_countText_after,rating_diff,ratings_count,text_reviews_count
0,0,0,0,0,0,1,0,0,0,0,...,66.000000,42320,17834,1810945,1130,69909,31772,8.87,221541,10300
1,1,0,1,0,0,1,1,1,1,0,...,160.689085,158,75,9799161,43,4888,2240,8.56,726,135
2,0,1,0,0,0,0,0,0,0,0,...,160.689085,49,18,321557,30,3085,1326,9.49,266,81
3,1,1,0,0,0,1,0,1,1,0,...,25.000000,130,61,1542999,13,788,399,8.21,2946,92
4,0,1,0,0,1,1,0,0,0,0,...,160.689085,257,117,2884610,69,5667,2574,8.73,845,172
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37228,0,0,0,1,0,0,0,0,0,1,...,20.000000,219,94,443719,4,2599,1216,7.49,100,21
37229,0,0,0,1,0,0,0,0,0,1,...,82.000000,125,52,3470182,6,1489,668,6.96,41,16
37230,0,0,0,1,0,1,0,0,0,1,...,33.000000,362,184,3412599,4,1456,683,6.65,70,19
37231,0,0,0,1,0,0,0,0,0,1,...,103.000000,152,76,2606128,9,968,450,7.11,74,19


In [34]:
X =  df.drop('asin', axis=1).drop('rating_diff', axis=1)
Y = df['rating_diff']

### hidden_layer_sizes

Verdict: Keep it at 14

### activation

In [44]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=.25, random_state=0)
# normalize features
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
# fit model no training data
model = MLPRegressor(hidden_layer_sizes=(14,), activation='logistic', max_iter=500)
model.fit(x_train, y_train)
print(model)
# make predictions for train data
y_pred = model.predict(x_train)
print('TRAIN DATA:')
print('Mean Absolute Error:', mean_absolute_error(y_train, y_pred))
print('Mean Squared Error:', mean_squared_error(y_train, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_train, y_pred)))
print('R2 Score:', r2_score(y_train, y_pred))
# make predictions for test data
y_pred = model.predict(x_test)
print('TEST DATA:')
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R2 Score:', r2_score(y_test, y_pred))

MLPRegressor(activation='logistic', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(14,), learning_rate='constant',
             learning_rate_init=0.001, max_iter=500, momentum=0.9,
             n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
             random_state=None, shuffle=True, solver='adam', tol=0.0001,
             validation_fraction=0.1, verbose=False, warm_start=False)
TRAIN DATA:
Mean Absolute Error: 0.4382998434121016
Mean Squared Error: 0.32540503113216607
Root Mean Squared Error: 0.5704428377428943
R2 Score: 0.2467375153469339
TEST DATA:
Mean Absolute Error: 0.4452261839306617
Mean Squared Error: 0.32878905422459986
Root Mean Squared Error: 0.5734013029498624
R2 Score: 0.24268798694980154


Verdict: inconclusive activation = 'relu' or activation = 'logistic'

### solver

In [48]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=.25, random_state=0)
# normalize features
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
# fit model no training data
model = MLPRegressor(hidden_layer_sizes=(14,), solver='lbfgs', max_iter=500)
model.fit(x_train, y_train)
print(model)
# make predictions for train data
y_pred = model.predict(x_train)
print('TRAIN DATA:')
print('Mean Absolute Error:', mean_absolute_error(y_train, y_pred))
print('Mean Squared Error:', mean_squared_error(y_train, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_train, y_pred)))
print('R2 Score:', r2_score(y_train, y_pred))
# make predictions for test data
y_pred = model.predict(x_test)
print('TEST DATA:')
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R2 Score:', r2_score(y_test, y_pred))

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(14,), learning_rate='constant',
             learning_rate_init=0.001, max_iter=500, momentum=0.9,
             n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
             random_state=None, shuffle=True, solver='lbfgs', tol=0.0001,
             validation_fraction=0.1, verbose=False, warm_start=False)
TRAIN DATA:
Mean Absolute Error: 0.43244197931432055
Mean Squared Error: 0.314605932590476
Root Mean Squared Error: 0.5608974350007994
R2 Score: 0.27173576374900876
TEST DATA:
Mean Absolute Error: 0.43968344105456364
Mean Squared Error: 0.32021685920147785
Root Mean Squared Error: 0.5658770707507752
R2 Score: 0.26243264141991285


Verdict: solver='lbfgs'

### alpha

In [56]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=.25, random_state=0)
# normalize features
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
# fit model no training data
model = MLPRegressor(hidden_layer_sizes=(14,), solver='lbfgs', alpha=0.0005, max_iter=500)
model.fit(x_train, y_train)
print(model)
# make predictions for train data
y_pred = model.predict(x_train)
print('TRAIN DATA:')
print('Mean Absolute Error:', mean_absolute_error(y_train, y_pred))
print('Mean Squared Error:', mean_squared_error(y_train, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_train, y_pred)))
print('R2 Score:', r2_score(y_train, y_pred))
# make predictions for test data
y_pred = model.predict(x_test)
print('TEST DATA:')
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R2 Score:', r2_score(y_test, y_pred))

MLPRegressor(activation='relu', alpha=0.0005, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(14,), learning_rate='constant',
             learning_rate_init=0.001, max_iter=500, momentum=0.9,
             n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
             random_state=None, shuffle=True, solver='lbfgs', tol=0.0001,
             validation_fraction=0.1, verbose=False, warm_start=False)
TRAIN DATA:
Mean Absolute Error: 0.4342957421919063
Mean Squared Error: 0.3179861477100168
Root Mean Squared Error: 0.5639026048086822
R2 Score: 0.26391108682023356
TEST DATA:
Mean Absolute Error: 0.43957617059955295
Mean Squared Error: 0.32101651849408974
Root Mean Squared Error: 0.5665831964452261
R2 Score: 0.26059075653700403


Verdict: alpha=0.0005

### learning_rate

Verdict: learning_rate = 'constant'

### max_iter

In [64]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=.25, random_state=0)
# normalize features
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
# fit model no training data
model = MLPRegressor(hidden_layer_sizes=(14,), solver='lbfgs', alpha=0.0005, learning_rate='constant', max_iter=1000)
model.fit(x_train, y_train)
print(model)
# make predictions for train data
y_pred = model.predict(x_train)
print('TRAIN DATA:')
print('Mean Absolute Error:', mean_absolute_error(y_train, y_pred))
print('Mean Squared Error:', mean_squared_error(y_train, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_train, y_pred)))
print('R2 Score:', r2_score(y_train, y_pred))
# make predictions for test data
y_pred = model.predict(x_test)
print('TEST DATA:')
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R2 Score:', r2_score(y_test, y_pred))

MLPRegressor(activation='relu', alpha=0.0005, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(14,), learning_rate='constant',
             learning_rate_init=0.001, max_iter=1000, momentum=0.9,
             n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
             random_state=None, shuffle=True, solver='lbfgs', tol=0.0001,
             validation_fraction=0.1, verbose=False, warm_start=False)
TRAIN DATA:
Mean Absolute Error: 0.42810519199127584
Mean Squared Error: 0.30856896105315584
Root Mean Squared Error: 0.5554898388387999
R2 Score: 0.2857104222358795
TEST DATA:
Mean Absolute Error: 0.43637300628821474
Mean Squared Error: 0.3160880399199749
Root Mean Squared Error: 0.5622170754432623
R2 Score: 0.2719427038791683


In [37]:
MLPRegressor(activation='relu', alpha=0.0005, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(14,), learning_rate='constant',
             learning_rate_init=0.001, max_iter=1000, momentum=0.9,
             n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
             random_state=None, shuffle=True, solver='lbfgs', tol=0.0001,
             validation_fraction=0.1, verbose=False, warm_start=False)
TRAIN DATA:
Mean Absolute Error: 0.42624041835500365
Mean Squared Error: 0.30614686976823974
Root Mean Squared Error: 0.5533054037041747
R2 Score: 0.29131719018591595
TEST DATA:
Mean Absolute Error: 0.43389851684221853
Mean Squared Error: 0.3117942452620169
Root Mean Squared Error: 0.5583853913400824
R2 Score: 0.2818327602367655

"MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,\n             beta_2=0.999, early_stopping=False, epsilon=1e-08,\n             hidden_layer_sizes=(14, 14, 14), learning_rate='constant',\n             learning_rate_init=0.001, max_iter=500, momentum=0.9,\n             n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,\n             random_state=None, shuffle=True, solver='adam', tol=0.0001,\n             validation_fraction=0.1, verbose=False, warm_start=False)\nTRAIN DATA:\nMean Absolute Error: 0.4327131302459282\nMean Squared Error: 0.32234701509865354\nRoot Mean Squared Error: 0.5677561229072334\nR2 Score: 0.25381635105976275\nTEST DATA:\nMean Absolute Error: 0.44360506168099145\nMean Squared Error: 0.3317168209759363\nRoot Mean Squared Error: 0.5759486270284324\nR2 Score: 0.235944353292577"