## Importing relevant libraries 

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import random
from scipy.sparse.linalg import svds
import warnings; warnings.simplefilter('ignore')

## Extracting Dataset 

In [2]:
elec_df = pd.read_csv("electronics.csv")
cloth_df = pd.read_csv("modcloth.csv")

In [3]:
elec_df.head(10)

Unnamed: 0,item_id,user_id,rating,timestamp,model_attr,category,brand,year,user_attr,split
0,0,0,5.0,1999-06-13,Female,Portable Audio & Video,,1999,,0
1,0,1,5.0,1999-06-14,Female,Portable Audio & Video,,1999,,0
2,0,2,3.0,1999-06-17,Female,Portable Audio & Video,,1999,,0
3,0,3,1.0,1999-07-01,Female,Portable Audio & Video,,1999,,0
4,0,4,2.0,1999-07-06,Female,Portable Audio & Video,,1999,,0
5,0,5,2.0,1999-07-12,Female,Portable Audio & Video,,1999,,0
6,0,6,5.0,1999-07-13,Female,Portable Audio & Video,,1999,,0
7,0,7,2.0,1999-07-13,Female,Portable Audio & Video,,1999,,0
8,0,8,4.0,1999-07-16,Female,Portable Audio & Video,,1999,,0
9,0,9,5.0,1999-08-20,Female,Portable Audio & Video,,1999,,0


In [4]:
print('shape:', elec_df.shape,'\n')
elec_df.info()

shape: (1292954, 10) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1292954 entries, 0 to 1292953
Data columns (total 10 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   item_id     1292954 non-null  int64  
 1   user_id     1292954 non-null  int64  
 2   rating      1292954 non-null  float64
 3   timestamp   1292954 non-null  object 
 4   model_attr  1292954 non-null  object 
 5   category    1292954 non-null  object 
 6   brand       331120 non-null   object 
 7   year        1292954 non-null  int64  
 8   user_attr   174124 non-null   object 
 9   split       1292954 non-null  int64  
dtypes: float64(1), int64(4), object(5)
memory usage: 98.6+ MB


In [5]:
cloth_df.head(10)

Unnamed: 0,item_id,user_id,rating,timestamp,size,fit,user_attr,model_attr,category,brand,year,split
0,7443,Alex,4,2010-01-21 08:00:00+00:00,,,Small,Small,Dresses,,2012,0
1,7443,carolyn.agan,3,2010-01-27 08:00:00+00:00,,,,Small,Dresses,,2012,0
2,7443,Robyn,4,2010-01-29 08:00:00+00:00,,,Small,Small,Dresses,,2012,0
3,7443,De,4,2010-02-13 08:00:00+00:00,,,,Small,Dresses,,2012,0
4,7443,tasha,4,2010-02-18 08:00:00+00:00,,,Small,Small,Dresses,,2012,0
5,7443,gina.chihos,5,2010-02-25 08:00:00+00:00,,,,Small,Dresses,,2012,0
6,7443,Kim,2,2010-02-26 08:00:00+00:00,,,Small,Small,Dresses,,2012,0
7,7443,jess.betcher,5,2010-03-26 07:00:00+00:00,,,,Small,Dresses,,2012,0
8,7443,Elissa,4,2010-04-06 07:00:00+00:00,,,Small,Small,Dresses,,2012,0
9,7443,Yvette,4,2010-04-08 07:00:00+00:00,,,Large,Small,Dresses,,2012,0


In [6]:
print('shape',cloth_df.shape,'\n')
cloth_df.info()

shape (99893, 12) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99893 entries, 0 to 99892
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   item_id     99893 non-null  int64  
 1   user_id     99892 non-null  object 
 2   rating      99893 non-null  int64  
 3   timestamp   99893 non-null  object 
 4   size        78133 non-null  float64
 5   fit         81387 non-null  object 
 6   user_attr   91526 non-null  object 
 7   model_attr  99893 non-null  object 
 8   category    99893 non-null  object 
 9   brand       25913 non-null  object 
 10  year        99893 non-null  int64  
 11  split       99893 non-null  int64  
dtypes: float64(1), int64(4), object(7)
memory usage: 9.1+ MB


###  Replacing user_id with Numbers

In [7]:
encode={}
li=[]
# last user_id of electronics dataset
last_id=1157632
for i in cloth_df['user_id']:
    if i in encode.keys():
        li.append(encode[i])
        continue
    last_id+=1
    encode[i]=last_id
    li.append(last_id)
cloth_df['user_id']=li
cloth_df.head()

Unnamed: 0,item_id,user_id,rating,timestamp,size,fit,user_attr,model_attr,category,brand,year,split
0,7443,1157633,4,2010-01-21 08:00:00+00:00,,,Small,Small,Dresses,,2012,0
1,7443,1157634,3,2010-01-27 08:00:00+00:00,,,,Small,Dresses,,2012,0
2,7443,1157635,4,2010-01-29 08:00:00+00:00,,,Small,Small,Dresses,,2012,0
3,7443,1157636,4,2010-02-13 08:00:00+00:00,,,,Small,Dresses,,2012,0
4,7443,1157637,4,2010-02-18 08:00:00+00:00,,,Small,Small,Dresses,,2012,0


In [8]:
elec_df_pro = elec_df[['item_id','user_id','rating','category']].copy()
cloth_df_pro = cloth_df[['item_id','user_id','rating','category']].copy()
print('Shape of electronics data set:', elec_df_pro.shape)
print('Shape of modcloth data set:', cloth_df_pro.shape)

Shape of electronics data set: (1292954, 4)
Shape of modcloth data set: (99893, 4)


###  Combining these Datasets

In [9]:
products_df = pd.concat([elec_df_pro,cloth_df_pro],ignore_index=True)

In [10]:
products_df.head(10)

Unnamed: 0,item_id,user_id,rating,category
0,0,0,5.0,Portable Audio & Video
1,0,1,5.0,Portable Audio & Video
2,0,2,3.0,Portable Audio & Video
3,0,3,1.0,Portable Audio & Video
4,0,4,2.0,Portable Audio & Video
5,0,5,2.0,Portable Audio & Video
6,0,6,5.0,Portable Audio & Video
7,0,7,2.0,Portable Audio & Video
8,0,8,4.0,Portable Audio & Video
9,0,9,5.0,Portable Audio & Video


In [11]:
products_df.describe()['rating']

count    1.392847e+06
mean     4.062822e+00
std      1.361083e+00
min      1.000000e+00
25%      4.000000e+00
50%      5.000000e+00
75%      5.000000e+00
max      5.000000e+00
Name: rating, dtype: float64

In [12]:
products_df.head()

Unnamed: 0,item_id,user_id,rating,category
0,0,0,5.0,Portable Audio & Video
1,0,1,5.0,Portable Audio & Video
2,0,2,3.0,Portable Audio & Video
3,0,3,1.0,Portable Audio & Video
4,0,4,2.0,Portable Audio & Video


In [13]:
products_df.isnull().sum()

item_id     0
user_id     0
rating      0
category    0
dtype: int64

In [14]:
products_df['user_id'].duplicated().sum()

190430

### Data Preprocessing

In [15]:
products_df.groupby('category').count()['rating']

category
Accessories & Supplies     158598
Bottoms                     23625
Camera & Photo             192573
Car Electronics & GPS       33070
Computers & Accessories    322938
Dresses                     34160
Headphones                 359334
Home Audio                  24193
Outerwear                    7131
Portable Audio & Video     143370
Security & Surveillance     10806
Television & Video          32057
Tops                        34977
Wearable Technology         16015
Name: rating, dtype: int64

In [16]:
products_df.shape

(1392847, 4)

In [17]:
num_products_df = products_df.groupby('category').count()['rating'].reset_index()

In [18]:
num_products_df

Unnamed: 0,category,rating
0,Accessories & Supplies,158598
1,Bottoms,23625
2,Camera & Photo,192573
3,Car Electronics & GPS,33070
4,Computers & Accessories,322938
5,Dresses,34160
6,Headphones,359334
7,Home Audio,24193
8,Outerwear,7131
9,Portable Audio & Video,143370


In [19]:
num_products_df.rename(columns={'rating':'num_ratings'},inplace=True)
num_products_df

Unnamed: 0,category,num_ratings
0,Accessories & Supplies,158598
1,Bottoms,23625
2,Camera & Photo,192573
3,Car Electronics & GPS,33070
4,Computers & Accessories,322938
5,Dresses,34160
6,Headphones,359334
7,Home Audio,24193
8,Outerwear,7131
9,Portable Audio & Video,143370


In [20]:
avg_products_df = products_df.groupby('category').mean()['rating'].reset_index()
avg_products_df.rename(columns={'rating':'avg_ratings'},inplace=True)
avg_products_df

Unnamed: 0,category,avg_ratings
0,Accessories & Supplies,4.257784
1,Bottoms,4.185228
2,Camera & Photo,4.129795
3,Car Electronics & GPS,4.062897
4,Computers & Accessories,4.101871
5,Dresses,4.230562
6,Headphones,3.895877
7,Home Audio,4.04741
8,Outerwear,4.217361
9,Portable Audio & Video,4.122996


In [21]:
popular_df = num_products_df.merge(avg_products_df,on='category')
popular_df

Unnamed: 0,category,num_ratings,avg_ratings
0,Accessories & Supplies,158598,4.257784
1,Bottoms,23625,4.185228
2,Camera & Photo,192573,4.129795
3,Car Electronics & GPS,33070,4.062897
4,Computers & Accessories,322938,4.101871
5,Dresses,34160,4.230562
6,Headphones,359334,3.895877
7,Home Audio,24193,4.04741
8,Outerwear,7131,4.217361
9,Portable Audio & Video,143370,4.122996


In [22]:
tp_df = popular_df[popular_df['num_ratings']>100000].sort_values('avg_ratings',ascending=False)['category'].tolist()

In [23]:
tp_df

['Accessories & Supplies',
 'Camera & Photo',
 'Portable Audio & Video',
 'Computers & Accessories',
 'Headphones']

In [24]:
products_df.groupby('user_id').count()

Unnamed: 0_level_0,item_id,rating,category
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1,1,1
1,1,1,1
2,1,1,1
3,1,1,1
4,1,1,1
...,...,...,...
1202412,1,1,1
1202413,1,1,1
1202414,1,1,1
1202415,1,1,1


In [25]:
products_df.groupby('user_id').count().sort_values('rating',ascending=False)

Unnamed: 0_level_0,item_id,rating,category
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1160256,250,250,250
1161835,204,204,204
1159573,198,198,198
1163125,197,197,197
1160497,191,191,191
...,...,...,...
430998,1,1,1
430999,1,1,1
431000,1,1,1
431001,1,1,1


In [26]:
counts=products_df.user_id.value_counts()
most_rated_users_df=products_df[products_df.user_id.isin(counts[counts>=10].index)]
most_rated_users_df

Unnamed: 0,item_id,user_id,rating,category
28,0,28,2.0,Portable Audio & Video
158,3,158,2.0,Camera & Photo
183,14,158,4.0,Camera & Photo
271,14,269,5.0,Camera & Photo
279,15,269,5.0,Camera & Photo
...,...,...,...,...
1392840,154329,1159908,3.0,Tops
1392843,77949,1159148,4.0,Bottoms
1392844,67194,1158511,5.0,Dresses
1392845,71607,1160569,3.0,Outerwear


In [27]:
most_rated_users_df['category'].unique()

array(['Portable Audio & Video', 'Camera & Photo',
       'Computers & Accessories', 'Headphones', 'Car Electronics & GPS',
       'Home Audio', 'Accessories & Supplies', 'Television & Video',
       'Security & Surveillance', 'Wearable Technology', 'Dresses',
       'Outerwear', 'Bottoms', 'Tops'], dtype=object)

In [28]:
print('Number of users who have rated 10 or more items =', len(most_rated_users_df))
print('Number of unique users in the final data = ', most_rated_users_df['user_id'].nunique())
print('Number of unique products in the final data = ', most_rated_users_df['item_id'].nunique())

Number of users who have rated 10 or more items = 32944
Number of unique users in the final data =  1148
Number of unique products in the final data =  2504


In [29]:
x = products_df.groupby('user_id').count()['rating'] > 10
rating_users = x[x].index

In [30]:
rating_users

Index([    158,    1007,    1277,    1670,    2799,    3054,    3089,    3234,
          3789,    3939,
       ...
       1197124, 1197154, 1197161, 1197181, 1197408, 1197634, 1197637, 1197851,
       1197921, 1198250],
      dtype='int64', name='user_id', length=996)

In [31]:
filtered_ratings = products_df[products_df['user_id'].isin(rating_users)]

In [32]:
filtered_ratings

Unnamed: 0,item_id,user_id,rating,category
158,3,158,2.0,Camera & Photo
183,14,158,4.0,Camera & Photo
332,24,158,5.0,Camera & Photo
350,16,158,3.0,Camera & Photo
569,46,158,3.0,Camera & Photo
...,...,...,...,...
1392840,154329,1159908,3.0,Tops
1392843,77949,1159148,4.0,Bottoms
1392844,67194,1158511,5.0,Dresses
1392845,71607,1160569,3.0,Outerwear


In [33]:
filtered_ratings.groupby('category').count()['rating']

category
Accessories & Supplies       172
Bottoms                     6828
Camera & Photo               428
Car Electronics & GPS         49
Computers & Accessories      600
Dresses                     9451
Headphones                   759
Home Audio                    43
Outerwear                   2107
Portable Audio & Video       362
Security & Surveillance       32
Television & Video            39
Tops                       10541
Wearable Technology           13
Name: rating, dtype: int64

In [34]:
y = filtered_ratings.groupby('category').count()['rating']>=10
famous_df = y[y].index

In [35]:
famous_df

Index(['Accessories & Supplies', 'Bottoms', 'Camera & Photo',
       'Car Electronics & GPS', 'Computers & Accessories', 'Dresses',
       'Headphones', 'Home Audio', 'Outerwear', 'Portable Audio & Video',
       'Security & Surveillance', 'Television & Video', 'Tops',
       'Wearable Technology'],
      dtype='object', name='category')

In [36]:
filtered_ratings[filtered_ratings['category'].isin(famous_df)]

Unnamed: 0,item_id,user_id,rating,category
158,3,158,2.0,Camera & Photo
183,14,158,4.0,Camera & Photo
332,24,158,5.0,Camera & Photo
350,16,158,3.0,Camera & Photo
569,46,158,3.0,Camera & Photo
...,...,...,...,...
1392840,154329,1159908,3.0,Tops
1392843,77949,1159148,4.0,Bottoms
1392844,67194,1158511,5.0,Dresses
1392845,71607,1160569,3.0,Outerwear


In [37]:
final_ratings = filtered_ratings[filtered_ratings['category'].isin(famous_df)]

In [38]:
final_ratings

Unnamed: 0,item_id,user_id,rating,category
158,3,158,2.0,Camera & Photo
183,14,158,4.0,Camera & Photo
332,24,158,5.0,Camera & Photo
350,16,158,3.0,Camera & Photo
569,46,158,3.0,Camera & Photo
...,...,...,...,...
1392840,154329,1159908,3.0,Tops
1392843,77949,1159148,4.0,Bottoms
1392844,67194,1158511,5.0,Dresses
1392845,71607,1160569,3.0,Outerwear


### Training and Testing the Data (Splitting in 80/20 ratio) 

In [77]:
train_data, test_data = train_test_split(most_rated_users_df, test_size = 0.2, random_state=0)
train_data

Unnamed: 0,item_id,user_id,rating,category
1382834,34935,1161914,5.0,Tops
1381624,117276,1196951,5.0,Bottoms
1348264,82288,1158312,3.0,Dresses
99552,2124,2669,4.0,Computers & Accessories
1364596,144572,1162820,5.0,Bottoms
...,...,...,...,...
1367723,106758,1161035,5.0,Dresses
1391193,36172,1158323,5.0,Tops
1387759,152036,1159306,5.0,Tops
1369241,135345,1159168,1.0,Dresses


In [40]:
print('Shape of training data: ',train_data.shape)
print('Shape of testing data: ',test_data.shape)

Shape of training data:  (26355, 4)
Shape of testing data:  (6589, 4)


In [41]:
train_data_grouped = train_data.groupby('category').agg({'user_id': 'count'}).reset_index()
train_data_grouped.rename(columns={'user_id': 'num_ratings'}, inplace=True)
train_data_grouped

Unnamed: 0,category,num_ratings
0,Accessories & Supplies,168
1,Bottoms,5618
2,Camera & Photo,422
3,Car Electronics & GPS,48
4,Computers & Accessories,589
5,Dresses,7809
6,Headphones,742
7,Home Audio,38
8,Outerwear,1737
9,Portable Audio & Video,329


In [81]:
avg_ratings_by_category = train_data.groupby('category')['rating'].mean().reset_index()

train_data_grouped = train_data_grouped.merge(avg_ratings_by_category, on='category', how='left')
train_data_grouped.rename(columns={'rating': 'avg_ratings'}, inplace=True)
train_data_grouped

Unnamed: 0,category,num_ratings,avg_ratings,avg_ratings.1,avg_ratings.2
0,Accessories & Supplies,168,4.583333,4.583333,4.583333
1,Bottoms,5618,4.200249,4.200249,4.200249
2,Camera & Photo,422,4.386256,4.386256,4.386256
3,Car Electronics & GPS,48,4.645833,4.645833,4.645833
4,Computers & Accessories,589,4.429542,4.429542,4.429542
5,Dresses,7809,4.228198,4.228198,4.228198
6,Headphones,742,4.380054,4.380054,4.380054
7,Home Audio,38,4.315789,4.315789,4.315789
8,Outerwear,1737,4.162349,4.162349,4.162349
9,Portable Audio & Video,329,4.43769,4.43769,4.43769


In [89]:
train_data_grouped = train_data.groupby('category').agg({'user_id': 'count', 'rating': 'mean'}).reset_index()
train_data_grouped.rename(columns={'user_id': 'score', 'rating': 'avg_ratings'}, inplace=True)

sorted_train_data = train_data_grouped.sort_values(by=['avg_ratings', 'score'], ascending=[False, False])
sorted_train_data

Unnamed: 0,category,score,avg_ratings
3,Car Electronics & GPS,48,4.645833
0,Accessories & Supplies,168,4.583333
9,Portable Audio & Video,329,4.43769
4,Computers & Accessories,589,4.429542
2,Camera & Photo,422,4.386256
6,Headphones,742,4.380054
7,Home Audio,38,4.315789
11,Television & Video,43,4.255814
5,Dresses,7809,4.228198
12,Tops,8775,4.220627


In [90]:
popular_recommendations = sorted_train_data.head(5) 
popular_recommendations

Unnamed: 0,category,score,avg_ratings
3,Car Electronics & GPS,48,4.645833
0,Accessories & Supplies,168,4.583333
9,Portable Audio & Video,329,4.43769
4,Computers & Accessories,589,4.429542
2,Camera & Photo,422,4.386256


From the above results we can say that we got almost the same popular_recommendations, so the acuracy is quite perfect

In [45]:
cf_ratings = pd.concat([train_data, test_data])
cf_ratings

Unnamed: 0,item_id,user_id,rating,category
1382834,34935,1161914,5.0,Tops
1381624,117276,1196951,5.0,Bottoms
1348264,82288,1158312,3.0,Dresses
99552,2124,2669,4.0,Computers & Accessories
1364596,144572,1162820,5.0,Bottoms
...,...,...,...,...
287824,3146,37749,5.0,Portable Audio & Video
1377721,153711,1157976,5.0,Tops
1306805,34931,1161031,5.0,Tops
1379084,21296,1164981,4.0,Bottoms


In [46]:
pt = final_ratings.pivot_table(index = 'category',columns='user_id',values='rating')

In [47]:
pt

user_id,158,1007,1277,1670,2799,3054,3089,3234,3789,3939,...,1197124,1197154,1197161,1197181,1197408,1197634,1197637,1197851,1197921,1198250
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Accessories & Supplies,,,,4.0,5.0,5.0,4.0,,5.0,,...,,,,,,,,,,
Bottoms,,,,,,,,,,,...,,4.2,5.0,5.0,4.666667,5.0,2.25,4.5,,4.0
Camera & Photo,3.909091,5.0,4.5,4.8,4.0,5.0,4.5,4.0,5.0,5.0,...,,,,,,,,,,
Car Electronics & GPS,,,,,,,4.0,,,,...,,,,,,,,,,
Computers & Accessories,5.0,4.25,3.857143,4.8,5.0,5.0,4.0,4.333333,4.333333,4.0,...,,,,,,,,,,
Dresses,,,,,,,,,,,...,4.2,3.666667,4.833333,4.0,4.333333,2.666667,3.0,4.428571,4.5,4.3
Headphones,,3.4,,4.0,5.0,4.357143,4.142857,4.5,4.166667,4.545455,...,,,,,,,,,,
Home Audio,,,,5.0,,,4.0,,,5.0,...,,,,,,,,,,
Outerwear,,,,,,,,,,,...,4.333333,,5.0,4.0,4.5,5.0,2.5,4.0,3.5,5.0
Portable Audio & Video,,,,3.0,5.0,3.0,4.0,4.0,,4.5,...,,,,,,,,,,


In [48]:
pt.fillna(0,inplace=True)

In [49]:
pt

user_id,158,1007,1277,1670,2799,3054,3089,3234,3789,3939,...,1197124,1197154,1197161,1197181,1197408,1197634,1197637,1197851,1197921,1198250
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Accessories & Supplies,0.0,0.0,0.0,4.0,5.0,5.0,4.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Bottoms,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.2,5.0,5.0,4.666667,5.0,2.25,4.5,0.0,4.0
Camera & Photo,3.909091,5.0,4.5,4.8,4.0,5.0,4.5,4.0,5.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Car Electronics & GPS,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Computers & Accessories,5.0,4.25,3.857143,4.8,5.0,5.0,4.0,4.333333,4.333333,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Dresses,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.2,3.666667,4.833333,4.0,4.333333,2.666667,3.0,4.428571,4.5,4.3
Headphones,0.0,3.4,0.0,4.0,5.0,4.357143,4.142857,4.5,4.166667,4.545455,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Home Audio,0.0,0.0,0.0,5.0,0.0,0.0,4.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Outerwear,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.333333,0.0,5.0,4.0,4.5,5.0,2.5,4.0,3.5,5.0
Portable Audio & Video,0.0,0.0,0.0,3.0,5.0,3.0,4.0,4.0,0.0,4.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
print('Shape of the pivot table: ', pt.shape)

Shape of the pivot table:  (14, 996)


In [51]:
from sklearn.metrics.pairwise import cosine_similarity

In [52]:
similarity_scores = cosine_similarity(pt)

In [53]:
similarity_scores

array([[1.        , 0.        , 0.71327923, 0.46582992, 0.79011252,
        0.        , 0.79430571, 0.37092903, 0.        , 0.71945899,
        0.23777138, 0.36147274, 0.        , 0.08255641],
       [0.        , 1.        , 0.        , 0.        , 0.        ,
        0.97100336, 0.        , 0.        , 0.85723251, 0.        ,
        0.        , 0.        , 0.97476173, 0.        ],
       [0.71327923, 0.        , 1.        , 0.44526567, 0.87204717,
        0.        , 0.84824326, 0.47106228, 0.        , 0.75236451,
        0.46160344, 0.44010176, 0.        , 0.23589885],
       [0.46582992, 0.        , 0.44526567, 1.        , 0.5121746 ,
        0.        , 0.52111199, 0.20798888, 0.        , 0.52752004,
        0.13621618, 0.2986673 , 0.        , 0.13155158],
       [0.79011252, 0.        , 0.87204717, 0.5121746 , 1.        ,
        0.        , 0.92876541, 0.48454158, 0.        , 0.85014359,
        0.40850011, 0.46012939, 0.        , 0.26388157],
       [0.        , 0.97100336, 0. 

### Product Popularity 

In [54]:
popular_recommendations

Unnamed: 0,category,num_ratings,avg_ratings
3,Car Electronics & GPS,48,4.645833
0,Accessories & Supplies,168,4.583333
9,Portable Audio & Video,329,4.43769
4,Computers & Accessories,589,4.429542
2,Camera & Photo,422,4.386256


## User Similarity 

In [55]:
def recommend_us(category):
    out = []
    index = np.where(pt.index==category)[0][0]
    similar_items = sorted(list(enumerate(similarity_scores[index])),key=lambda x:x[1],reverse=True)[1:6]
    
    for i in similar_items:
        out.append(pt.index[i[0]])
        
    return out

In [56]:
np.where(pt.index=='Wearable Technology')[0][0]

13

In [57]:
sorted(list(enumerate(similarity_scores[0])),key=lambda x:x[1],reverse=True)[1:6]

[(6, 0.7943057081434057),
 (4, 0.790112520697153),
 (9, 0.7194589946483296),
 (2, 0.7132792291426516),
 (3, 0.4658299202788485)]

In [65]:
recommend_us('Bottoms')

['Tops', 'Dresses', 'Outerwear', 'Accessories & Supplies', 'Camera & Photo']

### Past Preferences

In [59]:
def recommend_pp(category, user_id):
    category_index = np.where(pt.index == category)[0][0]
    
    similar_items = sorted(list(enumerate(similarity_scores[category_index])), key=lambda x: x[1], reverse=True)[1:6]
    
    recommended_items = []
    
    for item_index, similarity_score in similar_items:
        rating = pt.iloc[item_index, user_id]
        if rating > 0:
            recommended_items.append((item_index, similarity_score, rating))
    
    recommended_items.sort(key=lambda x: (x[2], x[1]), reverse=True)
    
    recommended_item_names = [pt.index[item_index] for item_index, _, _ in recommended_items]
    
    return recommended_item_names


In [68]:
recommend_pp('Bottoms',201)

['Tops', 'Dresses']

###  Getting the rankings from these 3 factors (Product Popularity, User Similarity and Past Preferences)

In [61]:
def  Recommendranking(pf_list,recomend_list,top_list):
    dic={}
    # out=[None]*len(recomend_list)
    for i in range(len(recomend_list)):
        for j in range(len(pf_list)):
            if pf_list[j]==recomend_list[i]:
                dic[recomend_list[i]]=j
                break
    out=sorted(dic)
    # for i in range(len(recomend_list)):
    #     if dic.get(recomend_list[i])!=None:
    #         out[dic[recomend_list[i]]]=recomend_list[i]
    dic2={}
    # max_value=max(dic.values())
    for i in range(len(recomend_list)):
        if dic.get(recomend_list[i])==None:
            for j in range(len(top_list)):
                if top_list[j]==recomend_list[i]:
                    dic2[recomend_list[i]]= j
                    break
    out1=sorted(dic2)
    return out+out1

In [70]:
Recommendranking(recommend_pp('Bottoms',201),recommend_us('Bottoms'),tp_df)

['Dresses', 'Tops', 'Accessories & Supplies', 'Camera & Photo']

In [71]:
recommend_pp('Headphones',51)

['Portable Audio & Video',
 'Accessories & Supplies',
 'Camera & Photo',
 'Computers & Accessories']

In [72]:
recommend_us('Headphones')

['Computers & Accessories',
 'Portable Audio & Video',
 'Camera & Photo',
 'Accessories & Supplies',
 'Car Electronics & GPS']

In [74]:
Recommendranking(recommend_pp('Headphones',51),recommend_us('Headphones'),tp_df)

['Accessories & Supplies',
 'Camera & Photo',
 'Computers & Accessories',
 'Portable Audio & Video']

In [75]:
import pickle
pickle.dump(popular_df,open('popular.pkl','wb'))

In [76]:
pickle.dump(pt,open('pt.pkl','wb'))
pickle.dump(similarity_scores,open('similarity_scores.pkl','wb'))