# Product Recommendation

In [2]:
# Mount Google Drive at /content/drive (requires the Google Colab runtime)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [22]:
import gzip
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD

In [4]:
def parse(path):
  """Yield one evaluated record per line of a gzip-compressed file.

  Every line of the file is passed to ``eval`` and the resulting object
  is yielded, in file order.

  Args:
    path: Location of the gzip file; handed to ``gzip.open`` in binary mode.

  Yields:
    The object obtained by evaluating each line.
  """
  # The context manager closes the handle once the generator is exhausted
  # or closed, instead of leaving the file open.
  with gzip.open(path, 'rb') as g:
    for l in g:
      # SECURITY: eval() runs whatever code the line contains. Only feed
      # this function trusted files; prefer json.loads or
      # ast.literal_eval if the data format permits it.
      yield eval(l)

def getDF(path):
  """Collect every record yielded by ``parse(path)`` into a DataFrame.

  Records are keyed by their 0-based position in the stream, and that
  position becomes the row label of the returned frame.
  """
  rows_by_position = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(rows_by_position, orient='index')

In [5]:
# Load the gzip-compressed review file from the mounted Drive into a DataFrame
reviews_path = '/content/drive/MyDrive/capstone_project/data_files/reviews_Health_and_Personal_Care.json.gz'
data = getDF(reviews_path)

In [6]:
# Preview the first two rows of the review data
data.head(2)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,ARMDSTEI0Z7YW,77614992,dodo,"[0, 0]",This book was a requirement for a college clas...,5.0,great,1360886400,"02 15, 2013"
1,A3FYN0SZYWN74,615208479,Marilyn Mitzel,"[0, 0]",This is a great gift for anyone who wants to h...,5.0,AMAZING HOW QUICKLY IT WORKS!,1228089600,"12 1, 2008"


In [7]:
# Distinct values found in the `overall` (rating) column, and how many there are
rating = data['overall'].unique()
print('Ratings: ', rating)
print('Len of Ratings: ', len(rating))

Ratings:  [5. 2. 4. 1. 3.]
Len of Ratings:  5


In [8]:
# Distinct reviewer IDs and how many there are.
# Stored as `reviewer_ids` rather than `id` so the built-in id() function
# is not shadowed for the rest of the notebook session.
reviewer_ids = data.reviewerID.unique()
print('Reviewer ID: ',reviewer_ids)
print('Len of Reviewer ID: ',len(reviewer_ids))

Reviewer ID:  ['ARMDSTEI0Z7YW' 'A3FYN0SZYWN74' 'A2J0WRZSAAHUAP' ... 'A1252ETWUJRKVC'
 'A215W0EOUNL81C' 'A2ZFFXGLJUHD76']
Len of Reviewer ID:  1851132


In [9]:
# Distinct product IDs (the `asin` column) and how many there are
pro = data['asin'].unique()
print('Products: ', pro)
print('Len of Products: ', len(pro))

Products:  ['0077614992' '0615208479' '0615269990' ... 'B00LV4480W' 'B00LWTTO0A'
 'B00LYPUPZK']
Len of Products:  252331


In [10]:
# Number of ratings recorded per product, most-rated products first
# (the rating count is used here as a stand-in for how much a product sold)
ratings_by_product = data.groupby('asin')['overall']
product_sales = ratings_by_product.count().sort_values(ascending=False)
product_sales.head()

asin
B001KXZ808    11365
B0032TNPOE     9338
B0095PZHPE     6788
B00B5H5BGA     5739
B0000U1OCI     5550
Name: overall, dtype: int64

In [11]:
# Number of ratings recorded per reviewer, most active reviewers first
ratings_by_reviewer = data.groupby('reviewerID')['overall']
rev_sales = ratings_by_reviewer.count().sort_values(ascending=False)
rev_sales.head()

reviewerID
A3OXHLG6DIBRW8    348
A1P27BGF8NAI29    311
A3NHUQ33CFH3VM    307
A1UQBFCERIP7VJ    285
A34BZM6S9L7QI4    278
Name: overall, dtype: int64

In [12]:
# Keep only the rows of products that have at least 50 ratings
new_df = data.groupby("asin").filter(
    lambda product_reviews: product_reviews['overall'].count() >= 50
)

In [13]:
# Preview the first five rows of the filtered data
new_df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
350,A1LFEMC0GGOJ3X,159985130X,7 Zion,"[1, 1]",We recommend the Magnifier. We are able to rea...,4.0,The Lightwedge Lighted Pocket Magnifier meets ...,1353715200,"11 24, 2012"
351,A17NW9ZCVHYBS1,159985130X,Amazon Customer,"[2, 2]",So convenient and small. I love the LED light...,5.0,Convenient!,1279929600,"07 24, 2010"
352,AA5OQ4JR4CCV3,159985130X,"Amazon Customer ""Timmyg""","[1, 1]",Thank god for Amazon. I can never find things ...,5.0,A real lifesaver,1268784000,"03 17, 2010"
353,A1HP21ZBOYRSU,159985130X,Amazonmama,"[2, 2]",I just love this little gadget. Sometimes my r...,5.0,Just what I needed!,1301270400,"03 28, 2011"
354,ALC5GH8CAMAI7,159985130X,AnnN,"[1, 1]",This is a great little gadget to have around. ...,5.0,Handy little gadget,1294185600,"01 5, 2011"


In [14]:
# (rows, columns) of the filtered data
new_df.shape

(1731327, 9)

In [15]:
# Work with the first 100,000 rows of the filtered data only
# (presumably to keep the pivot table built from it manageable)
new_df_1 = new_df.iloc[:100000]

In [16]:
# Product x reviewer rating matrix: one row per product (`asin`), one column
# per reviewer; product/reviewer pairs without a rating are filled with 0.
# Multiple ratings for the same pair are averaged (pivot_table's default).
rating_matrix = new_df_1.pivot_table(
    values='overall',
    index='asin',
    columns='reviewerID',
    fill_value=0,
)

In [17]:
# Preview the first five rows of the rating matrix
rating_matrix.head()

reviewerID,A0009478CBXKUCALUC7U,A005011233SVRED9Q0VY0,A00878297VYRVLFM06AG,A00951093TVGQWA8EA8DB,A015565634RZNSDLJBE5M,A01811021PZ9TH392P21J,A01818243PXJHK1ZH4HUM,A01836621IAABVFLY7Z80,A01866161OQ1BJPZE8D00,A0193442L7IJXD7HANDM,...,AZZCT56PUXILB,AZZDHYOZS7M3B,AZZGJ2KMWB7R,AZZMINCJAD6JM,AZZMO52V8WZ68,AZZNK89PXD006,AZZTH6DJ0KSIP,AZZV9PDNMCOZW,AZZY3B308E3UB,AZZYO4XQYE89O
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
159985130X,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1933622865,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3812028492,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6182055936,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7884890364,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# (number of products, number of reviewers) in the rating matrix
rating_matrix.shape

(575, 94786)

In [19]:
# Dimensionality Reduction
# Reduce each product's row of reviewer ratings to 20 components.
# random_state is pinned because TruncatedSVD's default 'randomized' solver
# is stochastic: without a seed, the reduced vectors (and the correlations
# and recommendations derived from them) can differ between runs.
svd = TruncatedSVD(n_components=20, random_state=42)
dim_data = svd.fit_transform(rating_matrix)

In [20]:
# (number of products, number of SVD components) after the reduction
dim_data.shape

(575, 20)

In [23]:
# Correlation Matrix
# np.corrcoef treats each row of dim_data as one variable, so entry [a, b]
# is the correlation between the reduced vectors of products a and b
corr_matrix = np.corrcoef(dim_data)

In [24]:
# Shape of the correlation matrix (one row and one column per product)
corr_matrix.shape

(575, 575)

In [25]:
# Pick the example product: the ID at position 350 of the rating matrix index
# (a fixed position, not a random draw)
i = rating_matrix.index[350]

In [26]:
# Display the selected product ID
i

'B0000Y3F6W'

In [27]:
# Product IDs in rating-matrix row order, as a plain Python list
lst = rating_matrix.index.tolist()

In [28]:
# Position of the selected product ID within the list of product IDs
j = lst.index(i)

In [29]:
# Row j of the correlation matrix: the selected product's correlation
# with every product
correlation_product_ID = corr_matrix[j]

In [30]:
# Fetching Correlated Product ID's
# Keep every product whose correlation with the selected product exceeds 0.8,
# leaving out the selected product itself. Filtering (rather than calling
# list.remove) avoids a ValueError when the selected product is not among the
# candidates, e.g. if its correlation row is NaN.
Recommend = [
    product_id
    for product_id in rating_matrix.index[correlation_product_ID > 0.8]
    if product_id != i
]

In [31]:
# Show the recommended product IDs
print(Recommend)

['B0000532YB', 'B00008MOQA', 'B0000ASAYY', 'B0000DAPGS']
