# Problem Statement: Personalized Recommendation System

In [1]:
#importing necessary packages
import pandas as pd
import numpy as np
from scipy.spatial.distance import pdist, squareform



NOTE: 
to run this dataset in the local machine, please add the amazon us review dataset (ref: https://www.kaggle.com/datasets/cynthiarempel/amazon-us-customer-reviews-dataset)
as we are not attaching the dataset to the submission file due to its large size.


In [2]:
%%capture
#importing data
data=pd.read_csv("/kaggle/input/amazon-us-customer-reviews-dataset/amazon_reviews_us_Apparel_v1_00.tsv",sep='\t', error_bad_lines=False,engine='python');

In [3]:
%%capture
data1=pd.read_csv("/kaggle/input/amazon-us-customer-reviews-dataset/amazon_reviews_us_Books_v1_02.tsv",sep='\t', error_bad_lines=False,engine='python');
data.append(data1)

In [4]:
%%capture
data1=pd.read_csv("/kaggle/input/amazon-us-customer-reviews-dataset/amazon_reviews_us_Electronics_v1_00.tsv",sep='\t', error_bad_lines=False,engine='python');
data.append(data1)

In [5]:
%%capture
data1=pd.read_csv("/kaggle/input/amazon-us-customer-reviews-dataset/amazon_reviews_us_Music_v1_00.tsv",sep='\t', error_bad_lines=False,engine='python');
data.append(data1)

In [6]:
%%capture
data1=pd.read_csv("/kaggle/input/amazon-us-customer-reviews-dataset/amazon_reviews_us_Office_Products_v1_00.tsv",sep='\t', error_bad_lines=False,engine='python');
data.append(data1)

# Exploratory Data Analysis

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5877663 entries, 0 to 5877662
Data columns (total 15 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   marketplace        object
 1   customer_id        int64 
 2   review_id          object
 3   product_id         object
 4   product_parent     int64 
 5   product_title      object
 6   product_category   object
 7   star_rating        int64 
 8   helpful_votes      int64 
 9   total_votes        int64 
 10  vine               object
 11  verified_purchase  object
 12  review_headline    object
 13  review_body        object
 14  review_date        object
dtypes: int64(5), object(10)
memory usage: 672.6+ MB


In [9]:
data=data[['customer_id','product_id','product_parent','product_title','star_rating']]

In [10]:
data.describe()

Unnamed: 0,customer_id,product_parent,star_rating
count,5877663.0,5877663.0,5877663.0
mean,25385640.0,502887400.0,4.105272
std,15762900.0,288945600.0,1.259181
min,10003.0,665.0,1.0
25%,12392820.0,252053100.0,4.0
50%,23364890.0,504838800.0,5.0
75%,40131140.0,755140500.0,5.0
max,53096560.0,999997900.0,5.0


In [11]:
#selecting products with more than 15 ratings accounting for popularity
reviews=data.groupby('product_id').agg({'star_rating':[np.size]})
max_reviews=reviews['star_rating']['size']>15

reviews=reviews[max_reviews]
reviews=reviews.sample(n=15000)

tempdata=data[data['product_id'].isin(reviews.index)]

In [12]:
for name in tempdata.columns:
    print(name,tempdata[name].nunique())

customer_id 492302
product_id 15000
product_parent 10209
product_title 10143
star_rating 5


In [13]:
reviews=tempdata.groupby('customer_id').agg({'product_id':[np.size]})
max_reviews=reviews['product_id']['size']>3

reviews=reviews[max_reviews]

tempdata=tempdata[tempdata['customer_id'].isin(reviews.index)]

In [14]:
tempdata.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14929 entries, 21504 to 5876803
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   customer_id     14929 non-null  int64 
 1   product_id      14929 non-null  object
 2   product_parent  14929 non-null  int64 
 3   product_title   14929 non-null  object
 4   star_rating     14929 non-null  int64 
dtypes: int64(3), object(2)
memory usage: 699.8+ KB


In [15]:
for name in tempdata.columns:
    print(name,tempdata[name].nunique())

customer_id 3157
product_id 7270
product_parent 5595
product_title 5567
star_rating 5


In [16]:
#sample datapoints
print(tempdata.head())
print(tempdata.tail())

       customer_id  product_id  product_parent  \
21504      2371312  B00ZEOAJSK       393453593   
34187     46254850  B00YD4ZHQ6       118852195   
34194      4090438  B00YD4ZHQ6       118852195   
34195     28922536  B00YD4ZHQ6       118852195   
34197     45810163  B00YD4ZHQ6       118852195   

                                           product_title  star_rating  
21504  Josi Minea Women's Beautiful Sexy Lingerie Set...            5  
34187        Kangaroo Flapper Beads - 48" Pearl Necklace            5  
34194        Kangaroo Flapper Beads - 48" Pearl Necklace            4  
34195        Kangaroo Flapper Beads - 48" Pearl Necklace            5  
34197        Kangaroo Flapper Beads - 48" Pearl Necklace            4  
         customer_id  product_id  product_parent  \
5869087     34306412  B0000AIXZ6       935960437   
5871252     52177870  B00008KH9C       612111540   
5871256     40581989  B00008KH9C       612111540   
5876472     12009813  B00004U1NW       508880024   
5876803

# filtering items based on similarity

In [17]:
#creating interaction matrix
rating_matrix=tempdata.pivot_table(index=['product_id'],columns=['customer_id'],values='star_rating')

In [18]:
rating_matrix.head()

customer_id,19251,55039,76286,85748,276434,286596,386828,415252,470633,516980,...,52856364,52911016,52938899,52970886,52993168,53029130,53043525,53062722,53066250,53086969
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B00001QHXY,,,,,,,,,,,...,,,,,,,,,,
B00004U1NW,,,,,,,,,,,...,,,,,,,,,,
B00008KH9C,,,,,,,,,,,...,,,,,,,,,,
B0000AIXZ6,,,,,,,,,,,...,,,,,,,,,,
B0000ANHT7,,,,,,,,,,,...,,,,,,,,,,


In [19]:
#filling explicit data interactions for accounting user interactions
# 0 means the user have not interacted with the product
rating_temp=rating_matrix.dropna(axis=1,how="all")
rating_temp=rating_matrix.apply(lambda x: x - np.nanmean(x), axis=1)
rating_temp.fillna(0,inplace = True)

In [20]:
rating_temp.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7270 entries, B00001QHXY to B00ZEOAJSK
Columns: 3157 entries, 19251 to 53086969
dtypes: float64(3157)
memory usage: 175.2+ MB


In [21]:
#calculating item similarity with every other item
manhattan_distances = pdist(rating_temp.values, metric='cityblock')

In [22]:
distance_matrix = squareform(manhattan_distances)

In [23]:
#creating data points of item similarity
item_similar = pd.DataFrame(distance_matrix, index=rating_matrix.index, columns=rating_matrix.index)

In [25]:
item_similar.head()

product_id,B00001QHXY,B00004U1NW,B00008KH9C,B0000AIXZ6,B0000ANHT7,B0000DZJL4,B0000WL3CW,B0000YEC8C,B0000YRQEE,B0000ZCSVY,...,B00X3MNY6Y,B00X5KW5QE,B00X5O6B60,B00X93XYVM,B00XJFE1LM,B00XOKG4BC,B00Y9GMRXE,B00YCT4QO6,B00YD4ZHQ6,B00ZEOAJSK
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B00001QHXY,0.0,0.0,4.0,1.0,2.0,0.0,0.0,2.666667,0.0,0.0,...,0.0,1.75,0.0,0.0,0.0,0.0,2.0,0.0,2.4,0.0
B00004U1NW,0.0,0.0,4.0,1.0,2.0,0.0,0.0,2.666667,0.0,0.0,...,0.0,1.75,0.0,0.0,0.0,0.0,2.0,0.0,2.4,0.0
B00008KH9C,4.0,4.0,0.0,5.0,6.0,4.0,4.0,6.666667,4.0,4.0,...,4.0,5.75,4.0,4.0,4.0,4.0,6.0,4.0,6.4,4.0
B0000AIXZ6,1.0,1.0,5.0,0.0,3.0,1.0,1.0,3.666667,1.0,1.0,...,1.0,2.75,1.0,1.0,1.0,1.0,3.0,1.0,3.4,1.0
B0000ANHT7,2.0,2.0,6.0,3.0,0.0,2.0,2.0,4.666667,2.0,2.0,...,2.0,3.75,2.0,2.0,2.0,2.0,4.0,2.0,4.4,2.0


In [30]:
#seperating user data
customer_id=int(input("enter your customer id: "))
customer_data=tempdata[tempdata['customer_id']==customer_id]

product=input("enter preferences for required product: ")
product_found=tempdata[tempdata['product_title'].str.contains(product, case=False)]

enter your customer id:  4090438
enter preferences for required product:  shirt


In [31]:
#finding products with user preferences
product_found=product_found.drop_duplicates(subset='product_title')

In [32]:
# retrieving user past data
user_reviewed_items=pd.DataFrame(rating_matrix[customer_id].dropna(axis=0, how='all')\
                          .sort_values(ascending=False))\
                          .reset_index()\
                          .rename(columns={1:'rating'})

In [33]:
user_reviewed_items

Unnamed: 0,product_id,4090438
0,B00BG0B0KM,5.0
1,B00SUMGC5M,4.0
2,B00SUVRGB2,4.0
3,B00SVZWB7Q,4.0
4,B00YD4ZHQ6,4.0


In [34]:
product_average_rating=tempdata.groupby("product_id").agg({"star_rating":[np.mean]})
product_average_rating

Unnamed: 0_level_0,star_rating
Unnamed: 0_level_1,mean
product_id,Unnamed: 1_level_2
B00001QHXY,3.0
B00004U1NW,5.0
B00008KH9C,3.0
B0000AIXZ6,4.5
B0000ANHT7,4.0
...,...
B00XOKG4BC,5.0
B00Y9GMRXE,4.0
B00YCT4QO6,5.0
B00YD4ZHQ6,4.6


In [36]:
#generating recommendations based on similarity score(inverse manhattan) 
#and average product rating


product_found_id=product_found['product_id']
suggestions_list=[]
same_product=[]
for pid in product_found_id:
    final_score=0
    for uid in user_reviewed_items['product_id']:
        similarity_score=item_similar[pid][uid]
        avg=1
        try:
                avg=product_average_rating[product_average_rating['product_id'==pid]]['star_rating','mean']
        except:
                pass
        else:
                avg=1
                
        if similarity_score>0:    
            final_score+=(1/similarity_score)*avg
        else:
            final_score+=1*avg
            
    suggestions_list.append(final_score)
    
#
   
product_found['score']=suggestions_list

product_found=product_found.sort_values(by='score',ascending=False)

final_list=product_found[['product_title','score']]



In [37]:
print("list of recommendations for the preference: ",product)
final_list.reset_index()

list of recommendations for the preference:  shirt


Unnamed: 0,index,product_title,score
0,3063965,Ripple Junction Big Bang Theory I'm Not Insane...,2.625000
1,4347837,Calvin Klein Men's Non Iron Slim Fit Solid Dr...,2.625000
2,2307318,Active Basic Athletic Fitted Plain Long Sleeve...,2.625000
3,1171774,Happy Family Clothing Little Boys' Superhero S...,2.625000
4,1178121,Icibgoods Women Lace Floral Sleeveless Crochet...,2.625000
...,...,...,...
534,2712147,Allegra K Women Stretchy Mesh Splice Long Batw...,0.411792
535,788393,Women Ladies Long Sleeve Embroidered Chiffon C...,0.395464
536,2396090,Allegra K Ladies Pullover Batwing Sleeve Zigza...,0.309593
537,811917,Pretty Show Women's Long-sleeve Print Chiffon ...,0.225665


In [38]:
import random
#generating sample user interactions
random_list=list(data['product_id'].unique())

random_list=random_list[:50]
random_list=random_list+(product_found_id.to_list())

user_interacted_id=random.sample(random_list,20)

user_interacted_id

['B014M5N1WW',
 'B00B2PS9GE',
 'B004GGUAH2',
 'B002NPCIP0',
 'B009Z3YTGY',
 'B00KBZQRMY',
 'B00B01HPBU',
 'B00UT67ZLC',
 'B00RL67OOQ',
 'B00BLW2TPA',
 'B00LA543K2',
 'B004BDOPVM',
 'B0043RJQ1U',
 'B00FP87Z3W',
 'B00NPV6H8U',
 'B00CEH2MI0',
 'B007RGCZWQ',
 'B005IDUOU0',
 'B005DIJKA0',
 'B00HJ1PZP8']

In [40]:
#metric calculation
recommended_id=list(product_found['product_id']) 
hits= sum(1 for pid in user_interacted_id if pid in recommended_id )

hit_ratio=hits/20

print("hit ratio by user: ",hit_ratio)

hit ratio by user:  0.95
