# Objective 2: Recommendation system using Collaborative Filtering

### Approaches:

    - Cosine Similarity
    - Matrix Factorization

### Loading of Dataset

In [1]:
import pandas as pd
import numpy as np
df = pd.read_csv('updated_reviews_5000ea.csv')
df

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,predicted_sentiment
0,US,1797882.0,R3I2DHQBR577SS,B001ANOOOE,2102612.0,The Naked Bee Vitmin C Moisturizing Sunscreen ...,Beauty,5.0,0.0,0.0,N,Y,Five Stars,love excel sun block,2015-08-31,5
1,US,18381298.0,R1QNE9NQFJC2Y4,B0016J22EQ,106393691.0,"Alba Botanica Sunless Tanning Lotion, 4 Ounce",Beauty,5.0,0.0,0.0,N,Y,Thank you Alba Bontanica!,great thing cream doesnt smell weird like chem...,2015-08-31,5
2,US,19242472.0,R3LIDG2Q4LJBAO,B00HU6UQAG,375449471.0,"Elysee Infusion Skin Therapy Elixir, 2oz.",Beauty,5.0,0.0,0.0,N,Y,Five Stars,great product im year old claim,2015-08-31,5
3,US,19551372.0,R3KSZHPAEVPEAL,B002HWS7RM,255651889.0,"Diane D722 Color, Perm And Conditioner Process...",Beauty,5.0,0.0,0.0,N,Y,GOOD DEAL!,use shower cap condit cap like theyr bulk save...,2015-08-31,4
4,US,14802407.0,RAI2OIG50KZ43,B00SM99KWU,116158747.0,Biore UV Aqua Rich Watery Essence SPF50+/PA+++...,Beauty,5.0,0.0,0.0,N,Y,this soaks in quick and provides a nice base f...,goto daili sunblock leav white cast clean plea...,2015-08-31,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,US,6985401.0,RLWQNDS7I46ST,B00TDJ5VTO,959170259.0,"Waterproof Bluetooth Speaker, Keedox Wireless ...",Mobile_Electronics,5.0,0.0,0.0,N,Y,cool,work greatbr get wet reason tobr gal water proof,2015-06-14,4
29996,US,127390.0,RIDE7LEW92C1U,B00S015EMK,231869191.0,"Game Day w/ GPLC & Pure PF3, Blue Bomb-Sicle 250g",Health & Personal Care,1.0,0.0,0.0,N,Y,One Star,old batch gameday color tast differ ive bought...,2015-08-31,3
29997,US,13150882.0,R2M3HM87H61ETP,B007A1XG5I,730919193.0,New Chapter Every Woman's One Daily 40 Plus Bo...,Health & Personal Care,5.0,3.0,4.0,N,Y,Amazon Prime is the best and most cost effecti...,take vitamin year make differ mental physicall...,2015-08-31,5
29998,US,4722303.0,RW8FGV1OTGXMK,B005NJLN86,70378667.0,Massachusetts Engineers Arch and Logo Short Sl...,Sports,5.0,0.0,0.0,N,Y,Five Stars,good cotton flawlessli sewb togeth good job,2015-08-31,4


### EDA on the full dataset

In [2]:
legend_pred_segm = df.predicted_sentiment.value_counts().sort_values(ascending=True)
legend_pred_segm = legend_pred_segm.reset_index()
legend_pred_segm.rename(columns = {'index':'Legend of Predicted Sentiment', 'index':'Legend of Predicted Sentiment'}, inplace = True)
legend_pred_segm

Unnamed: 0,Legend of Predicted Sentiment,predicted_sentiment
0,2,2561
1,1,2818
2,3,4582
3,4,6819
4,5,13220


In [3]:
df.helpful_votes.value_counts().reset_index()

Unnamed: 0,index,helpful_votes
0,0.0,22491
1,1.0,4017
2,2.0,1286
3,3.0,617
4,4.0,379
...,...,...
92,164.0,1
93,288.0,1
94,78.0,1
95,97.0,1


In [4]:
df.star_rating.value_counts().reset_index()

Unnamed: 0,index,star_rating
0,5.0,18832
1,4.0,4179
2,1.0,3246
3,3.0,2231
4,2.0,1512


In [200]:
df.dtypes

marketplace             object
customer_id            float64
review_id               object
product_id              object
product_parent         float64
product_title           object
product_category        object
star_rating            float64
helpful_votes          float64
total_votes            float64
vine                    object
verified_purchase       object
review_headline         object
review_body             object
review_date             object
predicted_sentiment      int64
dtype: object

## Addition of average rating column

In [2]:
df['avg_rating'] = df.predicted_sentiment.copy()
df

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,predicted_sentiment,avg_rating
0,US,1797882.0,R3I2DHQBR577SS,B001ANOOOE,2102612.0,The Naked Bee Vitmin C Moisturizing Sunscreen ...,Beauty,5.0,0.0,0.0,N,Y,Five Stars,love excel sun block,2015-08-31,5,5
1,US,18381298.0,R1QNE9NQFJC2Y4,B0016J22EQ,106393691.0,"Alba Botanica Sunless Tanning Lotion, 4 Ounce",Beauty,5.0,0.0,0.0,N,Y,Thank you Alba Bontanica!,great thing cream doesnt smell weird like chem...,2015-08-31,5,5
2,US,19242472.0,R3LIDG2Q4LJBAO,B00HU6UQAG,375449471.0,"Elysee Infusion Skin Therapy Elixir, 2oz.",Beauty,5.0,0.0,0.0,N,Y,Five Stars,great product im year old claim,2015-08-31,5,5
3,US,19551372.0,R3KSZHPAEVPEAL,B002HWS7RM,255651889.0,"Diane D722 Color, Perm And Conditioner Process...",Beauty,5.0,0.0,0.0,N,Y,GOOD DEAL!,use shower cap condit cap like theyr bulk save...,2015-08-31,4,4
4,US,14802407.0,RAI2OIG50KZ43,B00SM99KWU,116158747.0,Biore UV Aqua Rich Watery Essence SPF50+/PA+++...,Beauty,5.0,0.0,0.0,N,Y,this soaks in quick and provides a nice base f...,goto daili sunblock leav white cast clean plea...,2015-08-31,5,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,US,6985401.0,RLWQNDS7I46ST,B00TDJ5VTO,959170259.0,"Waterproof Bluetooth Speaker, Keedox Wireless ...",Mobile_Electronics,5.0,0.0,0.0,N,Y,cool,work greatbr get wet reason tobr gal water proof,2015-06-14,4,4
29996,US,127390.0,RIDE7LEW92C1U,B00S015EMK,231869191.0,"Game Day w/ GPLC & Pure PF3, Blue Bomb-Sicle 250g",Health & Personal Care,1.0,0.0,0.0,N,Y,One Star,old batch gameday color tast differ ive bought...,2015-08-31,3,3
29997,US,13150882.0,R2M3HM87H61ETP,B007A1XG5I,730919193.0,New Chapter Every Woman's One Daily 40 Plus Bo...,Health & Personal Care,5.0,3.0,4.0,N,Y,Amazon Prime is the best and most cost effecti...,take vitamin year make differ mental physicall...,2015-08-31,5,5
29998,US,4722303.0,RW8FGV1OTGXMK,B005NJLN86,70378667.0,Massachusetts Engineers Arch and Logo Short Sl...,Sports,5.0,0.0,0.0,N,Y,Five Stars,good cotton flawlessli sewb togeth good job,2015-08-31,4,4


In [3]:
# check for all helpful votes == 0, 
    # replace the avg_rating col with the avg of predicted_sentiment and star_rating
for i in range(len(df)):
    if (df.iloc[i, 8] == 0.0):
        df.iloc[i, 16] = (df.iloc[i, 15] + df.iloc[i, 7]) / 2 
df

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,predicted_sentiment,avg_rating
0,US,1797882.0,R3I2DHQBR577SS,B001ANOOOE,2102612.0,The Naked Bee Vitmin C Moisturizing Sunscreen ...,Beauty,5.0,0.0,0.0,N,Y,Five Stars,love excel sun block,2015-08-31,5,5.0
1,US,18381298.0,R1QNE9NQFJC2Y4,B0016J22EQ,106393691.0,"Alba Botanica Sunless Tanning Lotion, 4 Ounce",Beauty,5.0,0.0,0.0,N,Y,Thank you Alba Bontanica!,great thing cream doesnt smell weird like chem...,2015-08-31,5,5.0
2,US,19242472.0,R3LIDG2Q4LJBAO,B00HU6UQAG,375449471.0,"Elysee Infusion Skin Therapy Elixir, 2oz.",Beauty,5.0,0.0,0.0,N,Y,Five Stars,great product im year old claim,2015-08-31,5,5.0
3,US,19551372.0,R3KSZHPAEVPEAL,B002HWS7RM,255651889.0,"Diane D722 Color, Perm And Conditioner Process...",Beauty,5.0,0.0,0.0,N,Y,GOOD DEAL!,use shower cap condit cap like theyr bulk save...,2015-08-31,4,4.5
4,US,14802407.0,RAI2OIG50KZ43,B00SM99KWU,116158747.0,Biore UV Aqua Rich Watery Essence SPF50+/PA+++...,Beauty,5.0,0.0,0.0,N,Y,this soaks in quick and provides a nice base f...,goto daili sunblock leav white cast clean plea...,2015-08-31,5,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,US,6985401.0,RLWQNDS7I46ST,B00TDJ5VTO,959170259.0,"Waterproof Bluetooth Speaker, Keedox Wireless ...",Mobile_Electronics,5.0,0.0,0.0,N,Y,cool,work greatbr get wet reason tobr gal water proof,2015-06-14,4,4.5
29996,US,127390.0,RIDE7LEW92C1U,B00S015EMK,231869191.0,"Game Day w/ GPLC & Pure PF3, Blue Bomb-Sicle 250g",Health & Personal Care,1.0,0.0,0.0,N,Y,One Star,old batch gameday color tast differ ive bought...,2015-08-31,3,2.0
29997,US,13150882.0,R2M3HM87H61ETP,B007A1XG5I,730919193.0,New Chapter Every Woman's One Daily 40 Plus Bo...,Health & Personal Care,5.0,3.0,4.0,N,Y,Amazon Prime is the best and most cost effecti...,take vitamin year make differ mental physicall...,2015-08-31,5,5.0
29998,US,4722303.0,RW8FGV1OTGXMK,B005NJLN86,70378667.0,Massachusetts Engineers Arch and Logo Short Sl...,Sports,5.0,0.0,0.0,N,Y,Five Stars,good cotton flawlessli sewb togeth good job,2015-08-31,4,4.5


In [7]:
# df.to_csv('amazon_dataset_w_avgRating.csv')
df.to_csv('amazon_dataset_w_avgRating_v2.csv')

### Finding best performing category

In [16]:
# getting the mean score of the predicted sentiment score across all unique categories
df2 = df.groupby(['product_category'])['avg_rating'].mean().reset_index()
df2 = df2.sort_values(by=['avg_rating'], ascending=False)
df2.reset_index(drop=True)

Unnamed: 0,product_category,avg_rating
0,Beauty,4.131
1,Sports,4.1126
2,Health & Personal Care,4.0716
3,Pet Products,4.0175
4,Mobile_Electronics,3.7291
5,Major Appliances,3.6741


From above, we can see that the 'Personal_Care_Appliances' has the highest predicted sentiment score. Overall this field has performed good and scored highest in customer contentment level

In [9]:
df2 = df.groupby(['product_category', 'product_id'])['avg_rating'].max().reset_index(name='max_rating')
df2

Unnamed: 0,product_category,product_id,max_rating
0,Beauty,604113452X,5.0
1,Beauty,9510213861,5.0
2,Beauty,9780123458,3.0
3,Beauty,9790790961,5.0
4,Beauty,B000050B75,3.5
...,...,...,...
21210,Sports,B014B94BEQ,5.0
21211,Sports,B014G7RG9K,4.0
21212,Sports,B014GB81AY,5.0
21213,Sports,B014PX90WQ,5.0


### Finding best performing products in each category

In [207]:
df[df['product_id'] == 'B000052YOB']

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,predicted_sentiment,avg_rating
2106,US,21945309.0,R1RLSJV0L7Y7IY,B000052YOB,112605937.0,"Mederma Skin Care for Scars, 1.76 oz",Beauty,4.0,0.0,0.0,N,Y,this seems to work better than anything else I...,far seem work better anyth els tri sever ach scar,2015-08-31,2,3.0


In [10]:
for category in df2.product_category.unique():
    temp_df2 = df2[df2['product_category']==category]
    highest_rating = temp_df2.max_rating.max()
    temp_df2 = temp_df2[temp_df2['max_rating']==highest_rating]
    temp_df2 = temp_df2.reset_index(drop=True)
    print(temp_df2)

     product_category  product_id  max_rating
0              Beauty  604113452X         5.0
1              Beauty  9510213861         5.0
2              Beauty  9790790961         5.0
3              Beauty  B000052WYL         5.0
4              Beauty  B000052YBV         5.0
...               ...         ...         ...
1980           Beauty  B013T7MEZO         5.0
1981           Beauty  B0140EEAX4         5.0
1982           Beauty  B0141IK4U2         5.0
1983           Beauty  B0146LZ0XU         5.0
1984           Beauty  B0147R9UMA         5.0

[1985 rows x 3 columns]
            product_category  product_id  max_rating
0     Health & Personal Care  614847932X         5.0
1     Health & Personal Care  B00005LOGX         5.0
2     Health & Personal Care  B00005RL5E         5.0
3     Health & Personal Care  B00006AFDC         5.0
4     Health & Personal Care  B00008MOQA         5.0
...                      ...         ...         ...
1757  Health & Personal Care  B013TH8LJC         5.0

### Analytical Result

total best performing product across all 6 chosen categories = 1985 + 1762 + 827 + 800 + 1830 + 2007 = 9211

best performing out of total unique products = 9211/21215

In [11]:
df3 = df.copy()
df3 = df3.groupby(['customer_id'])['product_id'].count().reset_index(name='count of purchased products')
df3

Unnamed: 0,customer_id,count of purchased products
0,10182.0,1
1,12598.0,2
2,16511.0,1
3,20968.0,1
4,21087.0,3
...,...,...
24386,53089047.0,1
24387,53090280.0,1
24388,53090974.0,1
24389,53094442.0,1


In [31]:
df3[df3['count of purchased products']>1].count()

customer_id                    715
count of purchased products    715
dtype: int64

Only 715 out of 10935 have purchased more than 1 product.

In [32]:
df3[df3['count of purchased products']>1]

Unnamed: 0,customer_id,count of purchased products
3,21087.0,2
26,106879.0,3
38,109750.0,3
42,110642.0,2
52,113520.0,2
...,...,...
10853,52813756.0,2
10861,52830468.0,2
10875,52856050.0,3
10878,52884970.0,5


In [35]:
df[df['customer_id']==21087.0]

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,predicted_sentiment,avg_rating
3340,US,21087.0,RDLUND3YZGY27,B007EI1II4,919220526.0,Zoo Med Repticare Day Night Timer,Pet Products,5.0,0.0,0.0,N,Y,Five Star,bought work great thank,31/8/15,5,5.0
9283,US,21087.0,R1EXRNE8I7H3TH,B004NYWR7S,105653985.0,KLEANCOLOR Techno High - Metallic Nail Lacquer...,Beauty,5.0,0.0,0.0,N,Y,Five Star,birthday gift cute set nail polish bottl small...,31/8/15,5,5.0


## Product Recommendation System - Cosine similarity score

### sports category

In [4]:
df_sports = df[df['product_category']=='Sports'].reset_index(drop=True)
df_sports

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,predicted_sentiment,avg_rating
0,US,48945260.0,R1WBPB8MDCCN8F,B012P7UPSM,409940130.0,Chicago Blackhawks Adult Cuff Knit Beanie w/ P...,Sports,5.0,0.0,0.0,N,N,LOVE IT. 6 stars!,bought last winter love im femal hat comfi fit...,2015-08-31,5,5.0
1,US,5782091.0,R32M0YEWV77XG8,B001GQ3VHG,657746679.0,Copag Poker Size Regular Index 1546 Playing Ca...,Sports,5.0,1.0,1.0,N,Y,Shipped fast.,best plastic play card ive ever own came anoth...,2015-08-31,3,3.0
2,US,45813853.0,RR8V7WR27NXJ5,B008VS8M58,962442336.0,Baoer 223 5.56x45mm Caliber Cartridge Laser Bo...,Sports,1.0,0.0,0.0,N,Y,Good idea if it would work.,look good fit finish realli good doesnt work t...,2015-08-31,1,1.0
3,US,1593730.0,R1MHO5V9Z932AY,B005F06F4U,74305227.0,All Terrain Tackle Jig - Grass Master - June B...,Sports,5.0,0.0,0.0,N,Y,Five Stars,great jig,2015-08-31,5,5.0
4,US,29605511.0,R16PD71086BD2V,B010T4IE2C,787185588.0,"Swim Cap - 3 Pack (Blue, Black & Red)",Sports,5.0,0.0,1.0,N,N,Great quality silicon and very comfortable,love swim pool treat blond hair get realli gre...,2015-08-31,5,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,US,33499699.0,R2GFNOR34VJX2R,B00PCIYC1I,784203524.0,RDX MMA Shin Foam Pads Support Boxing Leg Guar...,Sports,4.0,0.0,0.0,N,Y,It's coverage was better than adidas with strapt,light protect coverag better adida strapt onki...,2015-08-31,3,3.5
4996,US,536936.0,R262YI26ABHLM6,B00NZAWABO,336234039.0,Madjax 04-009 Front Clays Steel Basket for Clu...,Sports,4.0,0.0,0.0,N,Y,Four Stars,basket look good great hold blind bag shotgun,2015-08-31,5,4.5
4997,US,35462469.0,R1J894D08ETEXV,B00PDU1EWU,116499873.0,Licensed NFL Dallas Cowboys Team Logo Glitter ...,Sports,3.0,0.0,0.0,N,Y,chain fades,chain fade coupl day,2015-08-31,3,3.0
4998,US,4722303.0,RW8FGV1OTGXMK,B005NJLN86,70378667.0,Massachusetts Engineers Arch and Logo Short Sl...,Sports,5.0,0.0,0.0,N,Y,Five Stars,good cotton flawlessli sewb togeth good job,2015-08-31,4,4.5


### Generation of dataframes for other categories

In [5]:
df_beauty = df[df['product_category']=='Beauty'].reset_index(drop=True)
df_health = df[df['product_category']=='Health & Personal Care'].reset_index(drop=True)
df_pet = df[df['product_category']=='Pet Products'].reset_index(drop=True)
df_mobile = df[df['product_category']=='Mobile_Electronics'].reset_index(drop=True)
df_majorapp = df[df['product_category']=='Major Appliances'].reset_index(drop=True)

### EDA on the df_sports dataset before feeding into the recommendation system

In [6]:
df_sports['product_id'].nunique()

4642

In [14]:
matrx = df_sports.groupby('customer_id')['product_id'].apply(lambda x: list(np.unique(x)))
matrx

customer_id
12598.0                               [B0094K7MM6]
20968.0                               [B00H1RC3RS]
26310.0                               [B00Q9DCK5K]
35356.0                               [B00BTJIMXI]
41717.0       [B0041FVR9I, B007SQ0LT4, B00BNTZMCS]
                              ...                 
53049514.0                            [B00SFIFY7S]
53053627.0                            [B00B4NK8FO]
53065854.0                            [B004TGWUPE]
53087868.0                            [B00JQHWT52]
53094442.0                            [B013AP49XU]
Name: product_id, Length: 4109, dtype: object

In [7]:
# product_avg_rating = df_sports.groupby(['customer_id', 'product_id'], as_index=False)['avg_rating'].prod()
# product_avg_rating

### checking for repeated purchases

In [8]:
check =[]
repeated_purchase = []
for i in range(df_sports.shape[0]):
    if df_sports.iloc[i,3] not in check:
        check.append(df_sports.iloc[i,3])
    else:
        repeated_purchase.append(df_sports.iloc[i,3])
print(len(repeated_purchase))

358


In [9]:
repeated_purchase

['B00SM93ECC',
 'B00T53O4AK',
 'B00SSLDP6E',
 'B00BIZPXWQ',
 'B00JWXTV7E',
 'B00W9D9UAW',
 'B00BMFEXJQ',
 'B00EE79EZM',
 'B000PW64JY',
 'B002E6RERU',
 'B0017LEUCU',
 'B00UYMA8NI',
 'B00J5LS02U',
 'B001GQ3VHG',
 'B0011N6I7O',
 'B00GLX4KZ6',
 'B004S7PPMO',
 'B00K0MR2YU',
 'B011CCGRX8',
 'B000UVVX28',
 'B00GN72Q2Y',
 'B00HA2ZKFQ',
 'B007O5B0LC',
 'B012FICZRK',
 '7245456313',
 'B00ZIRFVGS',
 'B000JJL8VW',
 'B007SXKVY2',
 'B00HMN7LDC',
 'B00WR30A08',
 'B00WRD9N7Y',
 'B013HPGHLK',
 'B004NKSPR8',
 'B00E3VH3DO',
 'B00092CKN4',
 'B00ASJQGDS',
 'B012FICZRK',
 'B012G6ZO60',
 'B012FICZRK',
 'B00K5503O8',
 'B001O0DITA',
 'B00065X222',
 'B00SJE8H1I',
 'B00X691L7I',
 'B003TJH3LI',
 'B00R9M2WJ4',
 'B005BINV84',
 'B00KAEJ3V8',
 'B00CICGI40',
 'B00NG1ECBS',
 'B00DH7DRG2',
 'B00XASHSNQ',
 'B00GLX4KZ6',
 'B007V5CCY4',
 'B00KQSVRF8',
 'B003TJH3LI',
 'B010MRIPTS',
 'B00DGQQC6Q',
 'B00FX0S4DC',
 'B002HR7K3O',
 'B003RXFHBO',
 'B004ERKCIA',
 'B00HPQPXVI',
 'B012FICZRK',
 'B00L5HKN54',
 'B012BFEC9Q',
 'B00WJESQ

### Creation of Recommendation matrix 'recMat'

In [10]:
columns = ['customer_id'] + list(df_sports.product_id.unique())
recMat = pd.DataFrame(columns=columns)
recMat

Unnamed: 0,customer_id,B012P7UPSM,B001GQ3VHG,B008VS8M58,B005F06F4U,B010T4IE2C,B004RKJGLS,B005V3DCBU,B00MHT9WN8,B001CSIRQ8,...,B001EYDN2I,B00DTVCGAE,B003D6F5HQ,B001QFADXC,B00OI66KGU,B00PCIYC1I,B00NZAWABO,B00PDU1EWU,B005NJLN86,B01068GCB6


### Steps to create recommendation matrix
    1. Loop through the list of unique customers ids, 
    2. Create a temporary dataframes for each unique user, 
    3. Append the customer_id to the recMat when the length of temporary dataframes > 2 
       This means that the unique customer has repeated purchases)
    4. Also add the 'avg_rating' score in the cell under each product_id

In [11]:
for i in range(len(list(df_sports.customer_id.unique()))):
    df_temp = df_sports[df_sports.customer_id == list(df_sports.customer_id.unique())[i]]
    df_temp = df_temp.reset_index(drop=True)
    if len(df_temp) >= 2:
        print(df_temp)
        addedRow = len(recMat)
        recMat.loc[addedRow] = [0 for i in range(recMat.shape[1])]
        for j in range(len(df_temp)):
            recMat.iloc[addedRow, 0] = df_temp.loc[j,'customer_id']
            recMat.iloc[addedRow, columns.index(df_temp.loc[j, 'product_id'])] = df_temp.loc[j, 'avg_rating']

  marketplace  customer_id       review_id  product_id  product_parent  \
0          US    5782091.0  R32M0YEWV77XG8  B001GQ3VHG     657746679.0   
1          US    5782091.0  R3DUOOVW36I0X1  B00D2CSUHS     392989246.0   
2          US    5782091.0  R1447W45DF2MM6  B00Q3I0KZI     164926538.0   
3          US    5782091.0  R24BK3U0S1WAJE  B0098JQCOW     595377176.0   
4          US    5782091.0  R2CZN5OUFSFG2F  B0001W0E8A     416001899.0   
5          US    5782091.0  R34XRN8M438TOT  B00D8HFWOG     590836218.0   
6          US    5782091.0   RJNF13U2DXO6F  B004135WRS     846903481.0   

                                       product_title product_category  \
0  Copag Poker Size Regular Index 1546 Playing Ca...           Sports   
1  Robert Baraban Chrome Spring Chrome Handle Gri...           Sports   
2             25 Kg Black Gym Hand Grip Strengthener           Sports   
3  Panther Slingshot Powerful Outdoor Hunting Sli...           Sports   
4                 Trumark Slingshots Ammo 

In [12]:
recMat

Unnamed: 0,customer_id,B012P7UPSM,B001GQ3VHG,B008VS8M58,B005F06F4U,B010T4IE2C,B004RKJGLS,B005V3DCBU,B00MHT9WN8,B001CSIRQ8,...,B001EYDN2I,B00DTVCGAE,B003D6F5HQ,B001QFADXC,B00OI66KGU,B00PCIYC1I,B00NZAWABO,B00PDU1EWU,B005NJLN86,B01068GCB6
0,5782091,0,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0,0,0
1,29605511,0,0,0,0,5,0,0,0,0,...,0,0,0,0,0,0,0.0,0,0,0
2,13981540,0,0,0,0,0,0,0,5,0,...,0,0,0,0,0,0,0.0,0,0,0
3,26040213,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0,0,0
4,27138575,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
535,32741689,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0,0,0
536,50636148,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0,0,0
537,15031175,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0,0,0
538,16028747,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0,0,0


In [13]:
len(df_sports) - df_sports.customer_id.nunique()

891

In [14]:
from sklearn.metrics.pairwise import pairwise_distances 
user_similarity = pairwise_distances(recMat, metric='cosine')
print(user_similarity)
item_similarity = pairwise_distances(recMat.T, metric='cosine')
print(item_similarity)

[[0.00000000e+00 2.13729034e-12 2.23665531e-12 ... 2.15572005e-12
  2.20601315e-12 2.12552198e-12]
 [2.13729034e-12 0.00000000e+00 1.56430424e-13 ... 7.54951657e-14
  1.25788269e-13 4.52970994e-14]
 [2.23665531e-12 1.56430424e-13 0.00000000e+00 ... 1.74860126e-13
  2.25153229e-13 1.44662060e-13]
 ...
 [2.15572005e-12 7.54951657e-14 1.74860126e-13 ... 0.00000000e+00
  1.44217971e-13 6.37268016e-14]
 [2.20601315e-12 1.25788269e-13 2.25153229e-13 ... 1.44217971e-13
  0.00000000e+00 1.14019905e-13]
 [2.12552198e-12 4.52970994e-14 1.44662060e-13 ... 6.37268016e-14
  1.14019905e-13 1.11022302e-16]]
[[4.44089210e-16 1.00000000e+00 9.91656761e-01 ... 1.00000000e+00
  1.00000000e+00 1.00000000e+00]
 [1.00000000e+00 1.00000000e+00 1.00000000e+00 ... 1.00000000e+00
  1.00000000e+00 1.00000000e+00]
 [9.91656761e-01 1.00000000e+00 0.00000000e+00 ... 1.00000000e+00
  1.00000000e+00 1.00000000e+00]
 ...
 [1.00000000e+00 1.00000000e+00 1.00000000e+00 ... 1.00000000e+00
  1.00000000e+00 1.00000000e+00]

In [15]:
print(np.array_str(user_similarity, precision=4))

[[0.0000e+00 2.1373e-12 2.2367e-12 ... 2.1557e-12 2.2060e-12 2.1255e-12]
 [2.1373e-12 0.0000e+00 1.5643e-13 ... 7.5495e-14 1.2579e-13 4.5297e-14]
 [2.2367e-12 1.5643e-13 0.0000e+00 ... 1.7486e-13 2.2515e-13 1.4466e-13]
 ...
 [2.1557e-12 7.5495e-14 1.7486e-13 ... 0.0000e+00 1.4422e-13 6.3727e-14]
 [2.2060e-12 1.2579e-13 2.2515e-13 ... 1.4422e-13 0.0000e+00 1.1402e-13]
 [2.1255e-12 4.5297e-14 1.4466e-13 ... 6.3727e-14 1.1402e-13 1.1102e-16]]


In [77]:
### testing
k=3
userIdx = 500
np.argpartition(user_similarity[:, userIdx], k)[:k]
# np.argpartition(user_similarity[:, userIdx], len(user_similarity[:, userIdx])-k)[-k:]

array([528, 500,  76])

In [71]:
def getUserProd(userIdx, kNearestUsers, minRating):
    userprod = recMat.iloc[userIdx, :].sort_values(ascending=False)
    userprod = pd.DataFrame(userprod)
    userprod = userprod.iloc[1:, :]
    userprod = userprod[userprod[userIdx] > 0]
    
    closestUserIdxes = np.argpartition(user_similarity[:, userIdx], kNearestUsers)[:kNearestUsers]
    dfOut = pd.DataFrame()
    for closestUserIdx in closestUserIdxes:
        closestUserprod = recMat.iloc[closestUserIdx, :].sort_values(ascending=False)
        closestUserprod = pd.DataFrame(closestUserprod)
        closestUserprod = closestUserprod.iloc[1:, :]
        closestUserprod = closestUserprod[closestUserprod[closestUserIdx] > 0]
        closestUserprod = closestUserprod.rename(columns={closestUserIdx: 'Rating'})
        dfOut = dfOut.append(closestUserprod)
        
    dfOut = dfOut[dfOut['Rating']>=minRating]
    dfOut = dfOut[~dfOut.index.isin(userprod[userIdx].to_list())]
    return dfOut

In [84]:
getUserProd(20, 3, 1)

  dfOut = dfOut.append(closestUserprod)
  dfOut = dfOut.append(closestUserprod)
  dfOut = dfOut.append(closestUserprod)


Unnamed: 0,Rating
B002HWNVZK,5.0
B000LEF8BG,4.5
B001287JIA,4.5
B00ANSGLUC,4.5
7245456313,4.5
B00SNBG6Y2,4.0
B008JA2Y0W,3.5
B00345XX5W,3.0
B00YOUPCT6,2.0
B009LA8ZDY,2.0


## Product Recommendation System - Matrix Factorization
### using full sized data with all categories

In [90]:
import pandas as pd
# df=pd.read_csv('amazon_dataset_w_avgRating2.csv')
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,predicted_sentiment,avg_rating
0,US,1797882.0,R3I2DHQBR577SS,B001ANOOOE,2102612.0,The Naked Bee Vitmin C Moisturizing Sunscreen ...,Beauty,5.0,0.0,0.0,N,Y,Five Stars,love excel sun block,2015-08-31,5,5.0
1,US,18381298.0,R1QNE9NQFJC2Y4,B0016J22EQ,106393691.0,"Alba Botanica Sunless Tanning Lotion, 4 Ounce",Beauty,5.0,0.0,0.0,N,Y,Thank you Alba Bontanica!,great thing cream doesnt smell weird like chem...,2015-08-31,5,5.0
2,US,19242472.0,R3LIDG2Q4LJBAO,B00HU6UQAG,375449471.0,"Elysee Infusion Skin Therapy Elixir, 2oz.",Beauty,5.0,0.0,0.0,N,Y,Five Stars,great product im year old claim,2015-08-31,5,5.0
3,US,19551372.0,R3KSZHPAEVPEAL,B002HWS7RM,255651889.0,"Diane D722 Color, Perm And Conditioner Process...",Beauty,5.0,0.0,0.0,N,Y,GOOD DEAL!,use shower cap condit cap like theyr bulk save...,2015-08-31,4,4.5
4,US,14802407.0,RAI2OIG50KZ43,B00SM99KWU,116158747.0,Biore UV Aqua Rich Watery Essence SPF50+/PA+++...,Beauty,5.0,0.0,0.0,N,Y,this soaks in quick and provides a nice base f...,goto daili sunblock leav white cast clean plea...,2015-08-31,5,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,US,6985401.0,RLWQNDS7I46ST,B00TDJ5VTO,959170259.0,"Waterproof Bluetooth Speaker, Keedox Wireless ...",Mobile_Electronics,5.0,0.0,0.0,N,Y,cool,work greatbr get wet reason tobr gal water proof,2015-06-14,4,4.5
29996,US,127390.0,RIDE7LEW92C1U,B00S015EMK,231869191.0,"Game Day w/ GPLC & Pure PF3, Blue Bomb-Sicle 250g",Health & Personal Care,1.0,0.0,0.0,N,Y,One Star,old batch gameday color tast differ ive bought...,2015-08-31,3,2.0
29997,US,13150882.0,R2M3HM87H61ETP,B007A1XG5I,730919193.0,New Chapter Every Woman's One Daily 40 Plus Bo...,Health & Personal Care,5.0,3.0,4.0,N,Y,Amazon Prime is the best and most cost effecti...,take vitamin year make differ mental physicall...,2015-08-31,5,5.0
29998,US,4722303.0,RW8FGV1OTGXMK,B005NJLN86,70378667.0,Massachusetts Engineers Arch and Logo Short Sl...,Sports,5.0,0.0,0.0,N,Y,Five Stars,good cotton flawlessli sewb togeth good job,2015-08-31,4,4.5


In [91]:
# df3 = df[['customer_id','product_id','avg_rating']]
# df3.rename(columns = {'avg_rating':'Amazingmeter'}, inplace = True)

In [92]:
df5 = df[['customer_id','product_id','avg_rating']]
from sklearn.model_selection import train_test_split
train_df, valid_df = train_test_split(df5, test_size=0.2)

#resetting indices to avoid indexing errors in the future
train_df = train_df.reset_index()[['customer_id', 'product_id', 'avg_rating']]
valid_df = valid_df.reset_index()[['customer_id', 'product_id', 'avg_rating']]

In [93]:
#We need continuous IDs to be able to index into the embedding matrix and access each user/item embedding.
import numpy as np

def encode_column(column):
    """ Encodes a pandas column with continous IDs"""
    keys = column.unique()
    key_to_id = {key:idx for idx,key in enumerate(keys)}
    return key_to_id, np.array([key_to_id[x] for x in column]), len(keys)

def encode_df(anime_df):
    """Encodes rating data with continuous user and anime ids"""
    
    customer_ids, anime_df['customer_id'], num_customer  = encode_column(anime_df['customer_id'])
    product_ids, anime_df['product_id'], num_product = encode_column(anime_df['product_id'])
    return anime_df, num_customer, num_product, customer_ids, product_ids

emb_df, num_customer, num_product, customer_ids, product_ids = encode_df(df5)
print("Number of product :", num_customer)
print("Number of customer :", num_product)
emb_df.head()

Number of product : 24391
Number of customer : 21209


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customer_ids, anime_df['customer_id'], num_customer  = encode_column(anime_df['customer_id'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  product_ids, anime_df['product_id'], num_product = encode_column(anime_df['product_id'])


Unnamed: 0,customer_id,product_id,avg_rating
0,0,0,5.0
1,1,1,5.0
2,2,2,5.0
3,3,3,4.5
4,4,4,5.0


### initialize item and user embeddings, create sparse utility matrix

In [94]:
from scipy import sparse
from scipy.sparse import csc_matrix

def create_embeddings(n, K):
    """
    Creates a random numpy matrix of shape n, K with uniform values in (0, 11/K)
    n: number of items/users
    K: number of factors in the embedding 
    """
    return 11*np.random.random((n, K)) / K

def create_sparse_matrix(df, rows, cols, column_name="avg_rating"):
    """ Returns a sparse utility matrix""" 
    return sparse.csc_matrix((df[column_name].values,(df['customer_id'].values, df['product_id'].values)),shape=(rows, cols))
   
Y = create_sparse_matrix(emb_df, num_customer, num_product)
# to view matrix
Y.todense()

matrix([[5. , 0. , 0. , ..., 0. , 0. , 0. ],
        [0. , 5. , 0. , ..., 0. , 0. , 0. ],
        [0. , 0. , 5. , ..., 0. , 0. , 0. ],
        ...,
        [0. , 0. , 0. , ..., 5. , 0. , 0. ],
        [0. , 0. , 0. , ..., 0. , 4.5, 0. ],
        [0. , 0. , 0. , ..., 0. , 0. , 4. ]])

### prediction using product and consumer embeddings

In [95]:
def predict(df, emb_user, emb_anime):
    """ This function computes df["prediction"] without doing (U*V^T).
    
    Computes df["prediction"] by using elementwise multiplication of the corresponding embeddings and then 
    sum to get the prediction u_i*v_j. This avoids creating the dense matrix U*V^T.
    """
    df['prediction'] = np.sum(np.multiply(emb_anime[df['product_id']],emb_user[df['customer_id']]), axis=1)
    return df

lmbda=0.0002

def cost(df, emb_user, emb_anime):
    """ Computes mean square error"""
    Y = create_sparse_matrix(df, emb_user.shape[0], emb_anime.shape[0])
    predicted = create_sparse_matrix(predict(df, emb_user, emb_anime), emb_user.shape[0], emb_anime.shape[0], 'prediction')
    return np.sum((Y-predicted).power(2))/df.shape[0] 

def gradient(df, emb_user, emb_anime):
    """ Computes the gradient for user and anime embeddings"""
    Y = create_sparse_matrix(df, emb_user.shape[0], emb_anime.shape[0])
    predicted = create_sparse_matrix(predict(df, emb_user, emb_anime), emb_user.shape[0], emb_anime.shape[0], 'prediction')
    delta =(Y-predicted)
    grad_user = (-2/df.shape[0])*(delta*emb_anime) + 2*lmbda*emb_user
    grad_anime = (-2/df.shape[0])*(delta.T*emb_user) + 2*lmbda*emb_anime
    return grad_user, grad_anime

def gradient_descent(df, emb_user, emb_anime, iterations=2000, learning_rate=0.01, df_val=None):
    """ 
    Computes gradient descent with momentum (0.9) for given number of iterations.
    emb_user: the trained user embedding
    emb_anime: the trained anime embedding
    """
    Y = create_sparse_matrix(df, emb_user.shape[0], emb_anime.shape[0])
    beta = 0.9
    grad_user, grad_anime = gradient(df, emb_user, emb_anime)
    v_user = grad_user
    v_anime = grad_anime
    for i in range(iterations):
        grad_user, grad_anime = gradient(df, emb_user, emb_anime)
        v_user = beta*v_user + (1-beta)*grad_user
        v_anime = beta*v_anime + (1-beta)*grad_anime
        emb_user = emb_user - learning_rate*v_user
        emb_anime = emb_anime - learning_rate*v_anime
        if(not (i+1)%50):
            print("\niteration", i+1, ":")
            print("train mse:",  cost(df, emb_user, emb_anime))
            if df_val is not None:
                print("validation mse:",  cost(df_val, emb_user, emb_anime))
    return emb_user, emb_anime


In [96]:
emb_consumer = create_embeddings(num_customer, 3)
emb_product = create_embeddings(num_product, 3)
emb_consumer, emb_product = gradient_descent(emb_df, emb_consumer, emb_product, iterations=800, learning_rate=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['prediction'] = np.sum(np.multiply(emb_anime[df['product_id']],emb_user[df['customer_id']]), axis=1)



iteration 50 :
train mse: 37.451953279776795

iteration 100 :
train mse: 25.013524507403933

iteration 150 :
train mse: 17.531243194902032

iteration 200 :
train mse: 12.6759232414639

iteration 250 :
train mse: 9.389004607255055

iteration 300 :
train mse: 7.102488988833598

iteration 350 :
train mse: 5.483073242167422

iteration 400 :
train mse: 4.32323212412958

iteration 450 :
train mse: 3.488056881191735

iteration 500 :
train mse: 2.886882624249143

iteration 550 :
train mse: 2.4571387157287234

iteration 600 :
train mse: 2.154691954301085

iteration 650 :
train mse: 1.9478407486731637

iteration 700 :
train mse: 1.813457596907495

iteration 750 :
train mse: 1.7344436158422705

iteration 800 :
train mse: 1.6980097902012574


In [98]:
def encode_new_data(valid_df, user_ids, anime_ids):
    """ Encodes valid_df with the same encoding as train_df.
    """
    df_val_chosen = valid_df['product_id'].isin(product_ids.keys()) & valid_df['customer_id'].isin(customer_ids.keys())
    valid_df = valid_df[df_val_chosen]
    valid_df['product_id'] =  np.array([product_ids[x] for x in valid_df['product_id']])
    valid_df['customer_id'] = np.array([customer_ids[x] for x in valid_df['customer_id']])
    return valid_df

print("before encoding:", valid_df.shape)
valid_df = encode_new_data(valid_df, customer_ids, product_ids)
print("after encoding:", valid_df.shape)

before encoding: (6000, 3)
after encoding: (6000, 3)


In [99]:
valid_df.dtypes

customer_id      int64
product_id       int64
avg_rating     float64
dtype: object

In [100]:
train_mse = cost(encode_new_data(train_df,customer_ids, product_ids), emb_consumer, emb_product)
val_mse = cost(valid_df, emb_consumer, emb_product)
print('train_mse:', train_mse)
print('val_mse:', val_mse)

train_mse: 1.687134469809592
val_mse: 1.7415110717679207


In [102]:
valid_df.head()


Unnamed: 0,customer_id,product_id,avg_rating,prediction
0,21372,17417,5.0,4.900846
1,7025,4336,4.0,2.458602
2,24337,21159,4.5,4.02256
3,5584,5189,4.0,3.782374
4,8888,5401,5.0,3.727618
