## Data Exploration

In [27]:
#importing necessary libraries

In [28]:
import pandas as pd
import pickle
import matplotlib.pyplot as plt

from surprise import Dataset, Reader, accuracy, NormalPredictor, KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline, SVD, BaselineOnly, SVDpp, NMF, SlopeOne, CoClustering
from surprise.accuracy import rmse
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV
from surprise.prediction_algorithms import SVD, SVDpp, NMF, BaselineOnly, NormalPredictor
from IPython.core.display import HTML

%matplotlib inline

## Exploring Review Data

In [29]:
#reading in our data as a dataframe

In [30]:
df = pd.read_json("Data/reviews_Beauty_5.json.gz",lines=True)

In [31]:
df

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A1YJEY40YUW4SE,7806397051,Andrea,"[3, 4]",Very oily and creamy. Not at all what I expect...,1,Don't waste your money,1391040000,"01 30, 2014"
1,A60XNB876KYML,7806397051,Jessica H.,"[1, 1]",This palette was a decent price and I was look...,3,OK Palette!,1397779200,"04 18, 2014"
2,A3G6XNM240RMWA,7806397051,Karen,"[0, 1]",The texture of this concealer pallet is fantas...,4,great quality,1378425600,"09 6, 2013"
3,A1PQFP6SAJ6D80,7806397051,Norah,"[2, 2]",I really can't tell what exactly this thing is...,2,Do not work on my face,1386460800,"12 8, 2013"
4,A38FVHZTNQ271F,7806397051,Nova Amor,"[0, 0]","It was a little smaller than I expected, but t...",3,It's okay.,1382140800,"10 19, 2013"
...,...,...,...,...,...,...,...,...,...
198497,A2BLFCOPSMBOZ9,B00LLPT4HI,Dave Edmiston,"[0, 0]",Just a little dab of this shea butter should b...,5,A little dab...,1405468800,"07 16, 2014"
198498,A1UQBFCERIP7VJ,B00LLPT4HI,Margaret Picky,"[0, 0]",This shea butter is completely raw and unrefin...,5,Pure organic raw shea butter,1405296000,"07 14, 2014"
198499,A35Q0RBM3YNQNF,B00LLPT4HI,M. Hill,"[0, 0]",The skin is the body's largest organ and it ab...,5,One Pound Organic Grade A Unrefined Shea Butter,1405468800,"07 16, 2014"
198500,A3LGT6UZL99IW1,B00LLPT4HI,"Richard C. Drew ""Anaal Nathra/Uthe vas Bethod...","[0, 0]",I have very dry elbows and knees. I have a to...,5,This stuff is amazing!,1405382400,"07 15, 2014"


In [32]:
#checking for nulls

In [33]:
df.isna().sum()

reviewerID           0
asin                 0
reviewerName      1386
helpful              0
reviewText           0
overall              0
summary              0
unixReviewTime       0
reviewTime           0
dtype: int64

In [34]:
#1386 reviewerNames left blank; we will not need reviewerName since utilizing reviewerID
#checking that all needed information is provided when reviewerName is NaN

In [35]:
df[df['reviewerName'].isnull()]

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
8,A3LMILRM9OC3SA,9759091062,,"[0, 0]",Did nothing for me. Stings when I put it on. I...,2,"no Lightening, no Brightening,......NOTHING",1405209600,"07 13, 2014"
1790,AK1H26O8DLMNN,B0000535UM,,"[0, 0]",The first thickening shampoo that works on my ...,5,Actually works,1405123200,"07 12, 2014"
2242,APTLHR9PHGPXN,B00005NAOD,,"[0, 0]","Kind of drying, not moisturizing. Kind of disa...",2,Inexpensive and feels that wY,1405209600,"07 13, 2014"
2304,AQWX644AFUFFK,B00005NFBD,,"[0, 0]","This is just ok. For one, I found this in a st...",3,"Ok, nothing amazing",1405468800,"07 16, 2014"
3651,A43K5ZRQ87TO6,B00008PC1O,,"[0, 0]",Works well and easy to use!,5,Five Stars,1405296000,"07 14, 2014"
...,...,...,...,...,...,...,...,...,...
197192,A1Z3AV93ONK5VF,B00KAL5JAU,,"[0, 0]",We already had the Dead Sea Shampoo by Adovia ...,5,"Non greasy, silky shiny hair",1401235200,"05 28, 2014"
197193,A184I8GT3BHZQV,B00KAL5JAU,,"[0, 1]",&#60;a href=&#34;http://www.tomoson.com/?code=...,5,Dead Sea Salt hair conditioner,1403568000,"06 24, 2014"
197194,A8C9EJORQD23,B00KAL5JAU,,"[0, 1]",I use this with the Adovia shampoo I mention a...,5,Adovia does natural right!,1402272000,"06 9, 2014"
198446,A2PIGZCDGM4NJ7,B00L5JHZJO,,"[10, 11]","This is a horrible product, most of the review...",1,Bad Product,1404864000,"07 9, 2014"


In [36]:
#checking that all reviewers have completed at least 5 reviews

In [37]:
df.reviewerID.value_counts()

A2V5R832QCSOMX    204
ALNFHVS3SC4FV     192
AKMEY1BSHSDG7     182
A3KEZLJ59C1JVH    154
ALQGOMOY1F5X9     150
                 ... 
A2VV3W7ZVE12LL      5
A31MGUAK2T9IGT      5
A3SDAUMG9QT2X7      5
A13LYG7GITY3D3      5
A2TOJ4R9JAMW3A      5
Name: reviewerID, Length: 22363, dtype: int64

In [38]:
#YAY! all reviewerIDs have value of at least 5, total of 22,363 reviewers

In [39]:
#checking that all products have been reviewed at least 5 times

In [40]:
df.asin.value_counts()

B004OHQR1Q    431
B0043OYFKU    403
B0069FDR96    391
B000ZMBSPE    389
B00150LT40    329
             ... 
B000GCXVS0      5
B001EJOFA2      5
B004KOFORG      5
B0035COC5Y      5
B008UXYQ8G      5
Name: asin, Length: 12101, dtype: int64

In [41]:
#YAY! all products have at least 5 reviews, total of 12,101 different products

In [42]:
#looking at ratings distribution

In [43]:
import os

# Folder that receives the exported figures. exist_ok=True makes this a
# no-op when the folder is already present, so the cell can be re-run safely
# and there is no gap between checking for the folder and creating it.
os.makedirs("images", exist_ok=True)

In [44]:
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go
from plotly.io import to_image
init_notebook_mode(connected=True)

# Number of reviews for each star rating, ordered from highest rating to lowest
data = df['overall'].value_counts().sort_index(ascending=False)
# Each bar is labelled with its share of all reviews, in percent
trace = go.Bar(x = data.index,
               text = ['{:.1f} %'.format(val) for val in (data.values / df.shape[0] * 100)],
               textposition = 'auto',
               textfont = dict(color = '#000000'),
               y = data.values,
               )
# Create layout
layout = dict(title = 'Distribution Of {} Reviews'.format(df.shape[0]),
              xaxis = dict(title = 'Rating'),
              yaxis = dict(title = 'Count'))
# Create plot
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

# Export the figure. write_image performs the image conversion itself, so a
# separate to_image() call whose result is thrown away would only render the
# figure a second time.
fig.write_image("images/reviews_distribution.png", scale=5)


In [45]:
#we see a majority of our ratings are 5s, which could impact our system

In [46]:
#exploring number of ratings per product

In [47]:
data = df.groupby('asin')['overall'].count().clip(upper=50)

In [48]:
data

asin
7806397051     8
9759091062    11
9788072216     5
9790790961     6
9790794231     5
              ..
B00L5KTZ0K    15
B00L6Q3BH6     5
B00LCEROA2     9
B00LG63DOM    10
B00LLPT4HI     7
Name: overall, Length: 12101, dtype: int64

In [49]:
data = df.groupby('asin')['overall'].count()

In [50]:
data

asin
7806397051     8
9759091062    11
9788072216     5
9790790961     6
9790794231     5
              ..
B00L5KTZ0K    15
B00L6Q3BH6     5
B00LCEROA2     9
B00LG63DOM    10
B00LLPT4HI     7
Name: overall, Length: 12101, dtype: int64

In [51]:
# Number of reviews per product
data = df.groupby('asin')['overall'].count()

# Create trace
# NOTE(review): the bins stop at 50 while some products have several hundred
# reviews; those products presumably fall outside the plotted bins rather
# than being lumped into the last one - confirm whether .clip(upper=50)
# was intended here.
trace = go.Histogram(x = data.values,
                     name = 'Ratings',
                     xbins = dict(start = 0,
                                  end = 50,
                                  size = 2))
# Create layout
layout = go.Layout(title = 'Distribution Of Number of Reviews Per Product',
                   xaxis = dict(title = 'Number of Reviews Per Product'),
                   yaxis = dict(title = 'Count'),
                   bargap = 0.2)

# Create plot
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

# Export the figure. write_image performs the image conversion itself, so a
# separate to_image() call whose result is thrown away would only render the
# figure a second time.
fig.write_image("images/reviews_per_product.png", scale=5)


In [52]:
#We see that a majority of our products have 10 or fewer ratings

In [53]:
#exploring ratings distribution by user

In [55]:
# Number of reviews per user
data = df.groupby('reviewerID')['overall'].count()

# NOTE(review): the bins stop at 50 while some reviewers have well over 100
# reviews; those reviewers presumably fall outside the plotted bins rather
# than being lumped into the last one - confirm whether .clip(upper=50)
# was intended here.
trace = go.Histogram(x = data.values,
                     name = 'Ratings',
                     xbins = dict(start = 0,
                                  end = 50,
                                  size = 2))
# Create layout
layout = go.Layout(title = 'Distribution Of Number of Reviews Per User',
                   xaxis = dict(title = 'Reviews Per User'),
                   yaxis = dict(title = 'Count'),
                   bargap = 0.2)

# Create plot
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

# Export the figure. write_image performs the image conversion itself, so a
# separate to_image() call whose result is thrown away would only render the
# figure a second time.
fig.write_image("images/reviews_per_user.png", scale=5)

In [25]:
#we see most users rated under 10 products

In [26]:
lower_rating = df.overall.min()

In [27]:
upper_rating = df.overall.max()

In [28]:
#Confirming our review range is 1 to 5

In [29]:
print('Review range: {0} to {1}'.format(lower_rating, upper_rating))

Review range: 1 to 5


In [30]:
#Creating dataframe with appropriate columns to run through surprise

In [31]:
surprise_df = df[['reviewerID', 'asin', 'overall']]

In [32]:
surprise_df

Unnamed: 0,reviewerID,asin,overall
0,A1YJEY40YUW4SE,7806397051,1
1,A60XNB876KYML,7806397051,3
2,A3G6XNM240RMWA,7806397051,4
3,A1PQFP6SAJ6D80,7806397051,2
4,A38FVHZTNQ271F,7806397051,3
...,...,...,...
198497,A2BLFCOPSMBOZ9,B00LLPT4HI,5
198498,A1UQBFCERIP7VJ,B00LLPT4HI,5
198499,A35Q0RBM3YNQNF,B00LLPT4HI,5
198500,A3LGT6UZL99IW1,B00LLPT4HI,5


In [33]:
#Checking the average rating per user to see if there are users who rate everything 1 or 5

In [34]:
# Mean rating per reviewer, sorted from lowest to highest.
# The column is selected with a list ([["overall"]]) so the result stays a
# one-column DataFrame; selecting with a bare tuple of keys
# (["overall", "reviewerID"]) is deprecated and rejected by newer pandas, and
# the grouping key does not need to be selected because it becomes the index.
avg_rating_user = df.groupby("reviewerID")[["overall"]].mean().sort_values("overall")
avg_rating_user


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



Unnamed: 0_level_0,overall
reviewerID,Unnamed: 1_level_1
A1W522Z24EPBJB,1.0
A2DPSPXFJ507C0,1.0
A1GQLVT0SWAWU,1.0
A1KLA02LZXAT46,1.0
A2MHHSACEJANSX,1.0
...,...
A15QGN6UXJVW9G,5.0
ANOJX4RAUJ9HL,5.0
A2RJT3IE2T6KXJ,5.0
A1ORLBQV893JF0,5.0


In [35]:
low_rating_user = avg_rating_user[avg_rating_user["overall"]==1.0]
low_rating_user

Unnamed: 0_level_0,overall
reviewerID,Unnamed: 1_level_1
A1W522Z24EPBJB,1.0
A2DPSPXFJ507C0,1.0
A1GQLVT0SWAWU,1.0
A1KLA02LZXAT46,1.0
A2MHHSACEJANSX,1.0
A2RJTIE73NPN3C,1.0
ASWIC85F71H4J,1.0
A2TBE0N8JN6H4K,1.0
A1GPPMHYM6SMEW,1.0


In [36]:
#only 9 users have rated every product a 1

In [37]:
high_rating_user = avg_rating_user[avg_rating_user["overall"]==5.0]
high_rating_user

Unnamed: 0_level_0,overall
reviewerID,Unnamed: 1_level_1
A2FINIRQNXOTI,5.0
ATWS89FH6Y6S4,5.0
A16Q479PYT0G6N,5.0
A3OKW5VRXZG3OQ,5.0
A3O9Q3154FPZLL,5.0
...,...
A15QGN6UXJVW9G,5.0
ANOJX4RAUJ9HL,5.0
A2RJT3IE2T6KXJ,5.0
A1ORLBQV893JF0,5.0


In [38]:
#2822 users have rated every product a 5

In [39]:
#we decide to keep these users in our final dataset but will not use them to test our model for recommendations

In [40]:
#elaborate on justification for keeping these reviews

## Exploring Meta Data

In [None]:
#Import our meta data 
import gzip

def parse(path):
  """Yield one parsed record per line of a gzip-compressed file.

  Every line is parsed as a Python literal (for example a dict literal),
  which need not be strict JSON.

  Args:
    path: Location of the gzip file to read.

  Yields:
    The Python object described by each line, in file order.

  Raises:
    ValueError, SyntaxError: If a line is not a valid Python literal.
  """
  # Imported here so the function does not depend on another cell's imports.
  import ast

  # The context manager closes the file even if the consumer stops iterating
  # early or a line fails to parse.
  with gzip.open(path, 'rb') as g:
    for l in g:
      # literal_eval accepts literals only, so a crafted line in the data
      # file cannot execute arbitrary code the way eval() would.
      yield ast.literal_eval(l.decode('utf-8'))

def getDF(path):
  """Collect every record yielded by ``parse(path)`` into a DataFrame.

  Records are keyed by their 0-based position in the file, so the frame has
  one row per record, indexed by that position.
  """
  records = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records, orient='index')

In [None]:
meta_data_df = getDF("Data/meta_Beauty.json.gz")
meta_data_df

In [None]:
meta_data_df.head(100)

In [None]:
meta_data_df.isna().sum()

In [None]:
#exploring NaN and deciding which data is helpful to return to our users for recommendations

In [None]:
meta_data_df.shape

In [None]:
from IPython import display
display.Image(meta_data_df.loc[192]["imUrl"])
#display.Image(meta_data_df_cleaned.loc[259179]["imUrl"])

In [None]:
meta_data_df.price.describe()

In [None]:
meta_data_df[meta_data_df['price'] == 999.99]

In [None]:
display.Image(meta_data_df.loc[197364]["imUrl"])

In [None]:
meta_data_df

In [None]:
#renaming columns we plan to return to users for improved aesthetics

In [None]:
meta_data_df.rename(columns={'description':'Description', 'title': 'Product Name', 'asin':'ASIN', 'imUrl':'Image'}, inplace=True)

In [None]:
meta_data_df.head()

In [None]:
meta_data_df.isna().sum()

In [None]:
#dropping brand due to large # of nulls

In [None]:
# Reassign instead of dropping in place so this cell can be re-run:
# errors='ignore' skips the drop when 'brand' has already been removed.
meta_data_df = meta_data_df.drop(columns=['brand'], errors='ignore')

## Setting Up Surprise

In [None]:
reader = Reader(rating_scale=(1, 5))
surprise_data = Dataset.load_from_df(surprise_df, reader)

trainset, testset = train_test_split(surprise_data, test_size=0.2, random_state=23)

In [None]:
surprise_data

In [None]:
# How many users and items are in the trainset
print('Number of users: ', trainset.n_users, '\n')
print('Number of items: ', trainset.n_items, '\n')

In [None]:
print('Type trainset :',type(trainset),'\n')
print('Type testset :',type(testset))

## Dummy Model

In [None]:
baseline = NormalPredictor()
baseline.fit(trainset)

In [None]:
predictions = baseline.test(testset)

In [None]:
# Keep the score under its own name so `baseline` still refers to the fitted
# model and the cells above can be re-run.
baseline_rmse = accuracy.rmse(predictions)

In [None]:
#baseline RMSE of 1.5002

## Baseline Models

In [None]:
baseline2 = BaselineOnly()
baseline2.fit(trainset)

In [None]:
predictions2 = baseline2.test(testset)

In [None]:
# Keep the score under its own name so `baseline2` still refers to the fitted
# model and the cells above can be re-run.
baseline2_rmse = accuracy.rmse(predictions2)

In [None]:
#baseline RMSE of 1.089 utilizing BaselineOnly

In [None]:
als_options = {'method': 'als',
               }
als_baseline = BaselineOnly(bsl_options=als_options)

In [None]:
als_baseline.fit(trainset)

In [None]:
predictions = als_baseline.test(testset)

In [None]:
# Keep the score under its own name so `als_baseline` still refers to the
# fitted model and the cells above can be re-run.
als_baseline_rmse = accuracy.rmse(predictions)

In [None]:
sgd_options = {'method': 'sgd',
               }
sgd_baseline = BaselineOnly(bsl_options=sgd_options)

In [None]:
sgd_baseline.fit(trainset)

In [None]:
predictions = sgd_baseline.test(testset)

In [None]:
# Keep the score under its own name so `sgd_baseline` still refers to the
# fitted model and the cells above can be re-run.
sgd_baseline_rmse = accuracy.rmse(predictions)

In [None]:
#our baseline model with sgd improved our RMSE to 1.0818

## Iterating Over All Algorithms to Assess Which Models to Further Explore

In [None]:
benchmark = []
# Iterate over all algorithms
for algorithm in [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]:
    # Perform 3-fold cross validation
    results = cross_validate(algorithm, surprise_data, measures=['RMSE'], cv=3, verbose=False)

    # Average each metric over the folds and label the row with the
    # algorithm's class name. A plain dict is used because Series.append no
    # longer exists in pandas 2.x.
    row = pd.DataFrame.from_dict(results).mean(axis=0).to_dict()
    row['Algorithm'] = type(algorithm).__name__
    benchmark.append(row)

# One row per algorithm, best (lowest) test RMSE first
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

In [None]:
#given our results, we will further explore SVDpp and SVD

## SVD Model Exploration

In [None]:
#Running an SVD model with defaults on trainset

In [None]:
svd = SVD(random_state=42)
svd.fit(trainset)
predictions = svd.test(testset)
print(accuracy.rmse(predictions))

In [None]:
#Checking to see estimated rating for 2 user/product combinations

In [None]:
svd.predict('A1YJEY40YUW4SE', 'B00LLPT4HI')

In [None]:
svd.predict('A2BLFCOPSMBOZ9', '7806397051')

In [None]:
#Cross validate the model

In [None]:
cv_svd_baseline = cross_validate(svd, surprise_data)

In [None]:
cv_svd_baseline

### Attempt on new split

In [None]:
#Hold out 10% of data for validation
#Create a new surprise data class
import random

svd_data = Dataset.load_from_df(surprise_df, reader)
raw_ratings_svd = svd_data.raw_ratings
# Shuffle before slicing. The ratings presumably keep the row order of
# surprise_df, which appears grouped by product (asin); slicing without a
# shuffle would put the last products entirely in set B, so set B would be
# scored on items the model never saw in set A. The fixed seed keeps the
# split reproducible.
random.Random(23).shuffle(raw_ratings_svd)
# A = 90% of the data, B = 10% of the data
threshold = int(.9 * len(raw_ratings_svd))
A_raw_ratings_svd = raw_ratings_svd[:threshold]
B_raw_ratings_svd = raw_ratings_svd[threshold:]

In [None]:
# svd_data is now the set A
svd_data.raw_ratings = A_raw_ratings_svd

In [None]:
#Create a param grid for grid search
SVD_parm_grid = {'n_factors':[20,50,100,150],'n_epochs':[10,20,30],'biased':[True,False]}

In [None]:
#Instantiate our grid search & fit to set A
svd_grid_search = GridSearchCV(algo_class=SVD,param_grid=SVD_parm_grid,measures=['rmse'],cv=5)
svd_grid_search.fit(svd_data)

In [None]:
best_svd_algo = svd_grid_search.best_estimator['rmse']

In [None]:
svd_grid_search.best_params

In [None]:
#{'rmse': {'n_factors': 20, 'n_epochs': 20, 'biased': True}}

In [None]:
# retrain on the whole set A
trainset_svd = svd_data.build_full_trainset()
best_svd_algo.fit(trainset_svd)

In [None]:
predictions = best_svd_algo.test(trainset_svd.build_testset())
print('Biased accuracy on A,', end='   ')
accuracy.rmse(predictions)

In [None]:
# Compute unbiased accuracy on B
testset_svd = svd_data.construct_testset(B_raw_ratings_svd)  # testset is now the set B
predictions = best_svd_algo.test(testset_svd)
print('Unbiased accuracy on B,', end=' ')
accuracy.rmse(predictions)

In [None]:
svd2 = SVD(n_factors=20, n_epochs=20, biased=True, random_state=42)
svd2.fit(trainset)
predictions = svd2.test(testset)
print(accuracy.rmse(predictions))

### Attempt new grid search params with lower n_factors

In [None]:
SVD_parm_grid = {'n_factors':[2,5,10,20],'n_epochs':[10,20,30],'biased':[True,False]}

In [None]:
#Instantiate our grid search & fit to set A
svd_grid_search = GridSearchCV(algo_class=SVD,param_grid=SVD_parm_grid,measures=['rmse'],cv=5)
svd_grid_search.fit(svd_data)

In [None]:
best_svd_algo = svd_grid_search.best_estimator['rmse']

In [None]:
svd_grid_search.best_params

In [None]:
#{'rmse': {'n_factors': 2, 'n_epochs': 20, 'biased': True}}

In [None]:
# retrain on the whole set A
trainset_svd = svd_data.build_full_trainset()
best_svd_algo.fit(trainset_svd)

In [None]:
predictions = best_svd_algo.test(trainset_svd.build_testset())
print('Biased accuracy on A,', end='   ')
accuracy.rmse(predictions)

In [None]:
# Compute unbiased accuracy on B
testset_svd = svd_data.construct_testset(B_raw_ratings_svd)  # testset is now the set B
predictions = best_svd_algo.test(testset_svd)
print('Unbiased accuracy on B,', end=' ')
accuracy.rmse(predictions)

In [None]:
svd3 = SVD(n_factors=2, n_epochs=20, biased=True, random_state=42)
svd3.fit(trainset)
predictions = svd3.test(testset)
print(accuracy.rmse(predictions))

In [None]:
#Same RMSE as sgd_baseline

In [None]:
cv_svd3 = cross_validate(svd3, surprise_data)

In [None]:
cv_svd3

## SVDpp Model Exploration

In [None]:
#Running an SVDpp model with defaults on train 

In [None]:
svdpp = SVDpp(random_state=42)
svdpp.fit(trainset)
predictions = svdpp.test(testset)
print(accuracy.rmse(predictions))

In [None]:
cv_svdpp_baseline = cross_validate(svdpp, surprise_data)

In [None]:
cv_svdpp_baseline

In [None]:
# grid search for SVD++
svdpp_param_grid = {'n_factors':[10, 20],
                    'n_epochs':[20, 30],
                    'reg_all':[0.02, 0.05], 
                    "lr_all": [0.007, 0.005]}
#svdpp_gs_model = GridSearchCV(SVDpp, param_grid=svdpp_param_grid, cv=3, joblib_verbose=10, n_jobs=-1, return_train_measures=True)

# Fit and return the best_params based on cross validation this will take a VERY long time to run
#svdpp_gs_model.fit(surprise_data)
#svdpp_gs_model.best_params['rmse']

In [None]:
#{'n_factors': 10, 'n_epochs': 20, 'reg_all': 0.05, 'lr_all': 0.005}

In [None]:
# Instantiate - fit on trainset - score the model on testset
SVDpp_model = SVDpp(n_factors=10, n_epochs=20, random_state=42, reg_all=0.05, lr_all=.005)
SVDpp_model.fit(trainset)
predictions = SVDpp_model.test(testset)
SVDpp_gs = accuracy.rmse(predictions)

In [None]:
#RMSE: 1.0823

In [None]:
# New dictionary for SVD++
svdpp_param_grid = {'n_factors':[15, 20, 25],
                    'n_epochs':[10, 20 ],
                    'reg_all':[0.02, 0.05, .07], 
                    "lr_all": [0.007, 0.005, .002]}
#svdpp_gs_model = GridSearchCV(SVDpp, param_grid=svdpp_param_grid, cv=3, joblib_verbose=10, n_jobs=-1, return_train_measures=True)

# Fit and return the best_params based on cross validation this will take a VERY long time to run
#svdpp_gs_model.fit(surprise_data)
#svdpp_gs_model.best_params['rmse']

In [None]:
#{'n_factors': 15, 'n_epochs': 20, 'reg_all': 0.07, 'lr_all': 0.005}

In [None]:
# Instantiate - fit on trainset - score the model on testset
SVDpp_model = SVDpp(n_factors=15, n_epochs=20, random_state=42, reg_all=0.07, lr_all=.005)
SVDpp_model.fit(trainset)
predictions = SVDpp_model.test(testset)
SVDpp_gs = accuracy.rmse(predictions)

In [None]:
#RMSE: 1.0824
#still not as good as sgd baseline

In [None]:
df

## Utilizing Best Model to create Recommendations

In [None]:
# Building our trainset_full to fit our final model on full trainset

In [None]:
trainset_full = surprise_data.build_full_trainset()

In [None]:
trainset_full

In [None]:
best_model = SVD(n_factors=2, n_epochs=20, biased=True, random_state=23)
best_model.fit(trainset_full)

In [None]:
## Subset data frame to show reviewers the products they have rated
# Index the reviews by reviewerID and keep only the product id column.
df_prior_ratings = df.set_index("reviewerID")[["asin"]]
df_prior_ratings.info()

In [None]:
pd.set_option('display.max_colwidth', None)

In [None]:
def buyer_recommended_products(buyer=None, n_recs=None):
    """Return an HTML table of the best-predicted products for one reviewer.

    Products the reviewer has already reviewed are excluded; the rest of
    ``meta_data_df`` is scored with ``best_model`` and the highest predicted
    ratings are returned.

    Args:
        buyer: reviewerID to recommend for. Prompted for with ``input()``
            when left as None.
        n_recs: Number of products to return. Prompted for with ``input()``
            when left as None.

    Returns:
        An ``HTML`` object rendering the ASIN, Product Name, Description and
        Image columns of the top ``n_recs`` products.

    Raises:
        KeyError: If ``buyer`` is not present in ``df_prior_ratings``.
        ValueError: If ``n_recs`` cannot be converted to an int.
    """
    pd.set_option('display.max_colwidth', None)
    if buyer is None:
        buyer = input("reviewerID: ")
    if n_recs is None:
        n_recs = input("How many recommendations? ")
    n_recs = int(n_recs)

    # Selecting with a list ([buyer]) always yields a Series, even when the
    # reviewer has a single row; a scalar string would otherwise be split
    # into characters.
    already_reviewed = df_prior_ratings.loc[[buyer], "asin"].tolist()

    # Candidate products: everything in the metadata not yet reviewed by buyer
    not_reviewed = meta_data_df[~meta_data_df.ASIN.isin(already_reviewed)].reset_index()
    not_reviewed["predicted_rating"] = not_reviewed["ASIN"].apply(lambda x: best_model.predict(buyer, x).est)
    not_reviewed = not_reviewed.sort_values(by="predicted_rating", ascending=False)
    not_reviewed = not_reviewed[['ASIN','Product Name', 'Description', 'Image']].head(n_recs)

    #Converting links to html tags
    def path_to_image_html(path):
        # A missing image link is not a string; show an empty cell instead
        # of failing on the concatenation below.
        if not isinstance(path, str):
            return ''
        return '<img src="'+ path + '" width="60" >'

    # NOTE(review): escape=False is required for the <img> tags to render, but
    # it also leaves Product Name / Description unescaped - confirm the
    # metadata text is trusted before exposing this outside the notebook.
    return HTML(not_reviewed.to_html(escape=False, formatters=dict(Image=path_to_image_html)))

    
    

In [None]:
buyer_recommended_products()

## Looking into Creating a Recommendation System with an option to add Category of Product 

In [None]:
meta_data_df.categories #which level do we want to go to? 

In [None]:
# Second entry of the first category path for the row at position 643
meta_data_df.categories.iloc[643][0][1]

In [None]:
#return unique subcategories from the metadata to give the user input options for category

In [None]:
# Unique second-level entries of each product's first category path, in
# order of first appearance. dict.fromkeys de-duplicates while preserving
# order without rescanning a list for every row.
subcategories = list(dict.fromkeys(row[0][1] for row in meta_data_df["categories"]))
subcategories

In [None]:
#create a function to extract subcategory level 1 from categories
def get_subcategory(cat):
    """Return the second entry of the first category path in ``cat``.

    ``cat`` is indexed as ``cat[0][1]``, so an IndexError is raised when the
    first path has fewer than two entries.
    """
    first_path = cat[0]
    return first_path[1]

In [None]:
#Create a new column in our meta data df called "sub_cat" containing sub category info
meta_data_df["sub_cat"] = meta_data_df["categories"].apply(get_subcategory)

In [None]:
meta_data_df.head()

In [None]:
def buyer_recommended_category_products(buyer=None, n_recs=None, request_category=None):
    """Return an HTML table of the best-predicted products in one category.

    Products the reviewer has already reviewed are excluded; the remaining
    products of ``meta_data_df`` whose ``sub_cat`` equals the requested
    category are scored with ``best_model`` and the highest predicted ratings
    are returned.

    Args:
        buyer: reviewerID to recommend for. Prompted for with ``input()``
            when left as None.
        n_recs: Number of products to return. Prompted for with ``input()``
            when left as None.
        request_category: Value of the ``sub_cat`` column to restrict the
            recommendations to. Prompted for with ``input()`` when left as
            None.

    Returns:
        An ``HTML`` object rendering the ASIN, Product Name, Description and
        Image columns of the top ``n_recs`` products.

    Raises:
        KeyError: If ``buyer`` is not present in ``df_prior_ratings``.
        ValueError: If ``n_recs`` cannot be converted to an int.
    """
    pd.set_option('display.max_colwidth', None)
    if buyer is None:
        buyer = input("reviewerID: ")
    if n_recs is None:
        n_recs = input("How many recommendations? ")
    n_recs = int(n_recs)
    #request_category from subcategories
    if request_category is None:
        request_category = input("Which category of beauty to recommend buyer? ")

    # Selecting with a list ([buyer]) always yields a Series, even when the
    # reviewer has a single row; a scalar string would otherwise be split
    # into characters.
    already_reviewed = df_prior_ratings.loc[[buyer], "asin"].tolist()

    # Narrow down to unreviewed products of the requested category before
    # predicting, so the model is only queried for rows that can be returned.
    is_candidate = (~meta_data_df.ASIN.isin(already_reviewed)) & (meta_data_df["sub_cat"] == request_category)
    not_reviewed = meta_data_df[is_candidate].reset_index()
    not_reviewed["predicted_rating"] = not_reviewed["ASIN"].apply(lambda x: best_model.predict(buyer, x).est)
    not_reviewed = not_reviewed.sort_values(by="predicted_rating", ascending=False)
    not_reviewed = not_reviewed[['ASIN','Product Name', 'Description', 'Image']].head(n_recs)

    #Converting links to html tags
    def path_to_image_html(path):
        # A missing image link is not a string; show an empty cell instead
        # of failing on the concatenation below.
        if not isinstance(path, str):
            return ''
        return '<img src="'+ path + '" width="60" >'

    # NOTE(review): escape=False is required for the <img> tags to render, but
    # it also leaves Product Name / Description unescaped - confirm the
    # metadata text is trusted before exposing this outside the notebook.
    return HTML(not_reviewed.to_html(escape=False, formatters=dict(Image=path_to_image_html)))

In [None]:
buyer_recommended_category_products()

In [None]:
#visualizations: demonstrate function, screen record of running the function
#select user, this is what they have previously rated, here's our returned 
#recommended items 

In [None]:
#report RMSE, discuss in terms of rating scale of 1-5

In [None]:
#in presentation report system is based off of x # users, x # reviews, x # of products in reviews

In [None]:
#5 core limitation; system needs user interaction to work, need users to review at least 5 products
#and products to have at least 5 reviews