In [114]:
import pandas as pd
import numpy as np
from random import choices
from lightfm import LightFM
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings("ignore")

# Loading Data

In [181]:
user_subset = pd.read_excel("../data/finerydata.xlsx", sheet_name = "7. user_subset")
item_subset = pd.read_excel("../data/finerydata.xlsx", sheet_name = "8. item_subset")
wishlist = pd.read_excel("../data/finerydata.xlsx", sheet_name = "12. wishlist_items")
influencers = pd.read_excel("../data/finerydata.xlsx", sheet_name = "13. user_influencer")
users_set_100 = pd.read_excel("../data/finerydata.xlsx", sheet_name = "14. 100_users_set")
item_set_100 = pd.read_excel("../data/finerydata.xlsx", sheet_name = "15. 100_users_item_set")
influencers_100 = pd.read_excel("../data/finerydata.xlsx", sheet_name = "16. 100_user_influencer")
category = pd.read_excel("../data/finerydata.xlsx", sheet_name = "9. category_ids")
brand_affinity = pd.read_excel("../data/finerydata.xlsx", sheet_name = "10. brands_affinity")
reference_color = pd.read_excel("../data/finerydata.xlsx", sheet_name = "2. reference_color")
occasion = pd.read_excel("../data/finerydata.xlsx", sheet_name = "4. sample_occasion")
color_rule = pd.read_excel("../data/finerydata.xlsx", sheet_name = "5. influencer_color_rules", header = None)
cat_subcat = pd.read_excel("../data/finerydata.xlsx", sheet_name = "6. category_and_subcategory", header = None)

# Combining the Data

In [182]:
print("User Subset: " + str(user_subset.shape))
print("Item Subset: " + str(item_subset.shape))
print("Wishlist: " + str(wishlist.shape))
print("Influencers: " + str(influencers.shape))
print("100 User Subset: " + str(users_set_100.shape))
print("100 Item Subset: " + str(item_set_100.shape))
print("100 Influencers: " + str(influencers_100.shape))
print("Category ID: " + str(category.shape))
print("Brand Affinity: " + str(brand_affinity.shape))
print("Reference Color: " + str(reference_color.shape))
print("Occasion: " + str(occasion.shape))
print("Influencer Color Rules: " + str(color_rule.shape))
print("Category and Subcategory: " + str(cat_subcat.shape))

User Subset: (20, 96)
Item Subset: (6221, 62)
Wishlist: (164, 31)
Influencers: (15, 2)
100 User Subset: (100, 96)
100 Item Subset: (8371, 62)
100 Influencers: (73, 2)
Category ID: (78, 2)
Brand Affinity: (31, 7)
Reference Color: (657, 7)
Occasion: (12, 1)
Influencer Color Rules: (81, 23)
Category and Subcategory: (56, 10)


In [172]:
brand_affinity1 = brand_affinity.iloc[1:,:3]
brand_affinity1.columns = ['brand_1', 'brand_2', 'score']
brand_affinity2 = brand_affinity.iloc[1:,4:]
brand_affinity2.columns = ['brand_1', 'brand_2', 'score']
brand_aff = brand_affinity1.append(brand_affinity2, ignore_index=True)
brand_aff.shape

(60, 3)

In [142]:
all_user = user_subset.append(users_set_100, ignore_index=True, sort=False)
all_user.shape

(120, 96)

In [143]:
all_influencer = influencers.append(influencers_100, ignore_index=True, sort=False)
all_influencer.shape

(88, 2)

In [144]:
all_item = item_subset.append(item_set_100, ignore_index=True, sort=False)
all_item['wishlist'] = 0
wishlist['wishlist'] = 1
all_item.shape

(14592, 63)

In [145]:
all_item['product_id'] = all_item['product_id'].fillna(all_item['item_id'])

In [146]:
wishlist.rename(columns={'userid': 'user_id', 
                         'itemid': 'product_id', 
                         'itemname': 'item_name',
                         'brandname_lower': 'brand_id',
                         'store_lower': 'store_id',
                         'itemname_lower': 'item_name_lower', 
                         'itemcategory': 'product_category_id',
                         'deleted': 'is_deleted',
                         'price': 'paid_price', 
                         'origprice': 'list_price'
                        }, inplace=True)

In [147]:
item_wishlist = all_item.append(wishlist, ignore_index=True, sort=False)
item_wishlist.shape

(14756, 81)

In [148]:
category = category.dropna(how='all') 
category.shape

(67, 2)

In [149]:
item_wishlist = item_wishlist.merge(category, how = "left", left_on = "product_category_id", right_on = 'Category ID')
item_wishlist.rename(columns={'Category Name': 'category_name', 'Category ID': 'category_id'}, inplace = True)
item_wishlist.shape

(14756, 83)

In [189]:
cat_subcat = cat_subcat.iloc[:,:1].dropna(how='all')
cat_subcat.head(10)

Unnamed: 0,0
0,Shoes
1,"Heels: Work, Formal, Night Life, Cocktail, Int..."
2,"Sneakers: School, Errands, Concert, Movies, Wo..."
3,"Sandals: Beach, School, Errands, Brunch, Movie..."
4,"Boots: Work, School, Errands, Dinner, Concert,..."
5,"Flats: Beach, Work, School, Errands, Dinner, C..."
7,Tops
8,"Blouses: Work, School, Dinner, Concert, Night ..."
9,"T-shirts: Beach, Work, School, Errands, Dinner..."
10,"Tanks: Beach, School, Errands, Concert, Night ..."


# User Data

In [151]:
all_user.head()

Unnamed: 0,user_id,first_name,last_name,address_zip,gender,signup_date,date_last_login,visited_wishlist,visited_shopping,finished_approval,...,context_device_model,context_device_type,context_network_wifi,device_advertising_id_blocked,context_network_cellular,context_network_carrier,context_os_version,locale_us,context_locale,style_push_permission
0,SBmFVmiyziheGcUXF1506291233259,NYCDSA,NYCDSA,,female,2017-09-24 17:13:53,2017-09-24 17:13:54,False,True,False,...,,,,,,,,,,False
1,DHvdjwEmqSXpYONGd1514587641548,NYCDSA,NYCDSA,,female,2017-12-29 17:47:21,2017-12-29 18:38:27,True,True,False,...,,,,,,,,,,False
2,rboNfQfmMjQrAzLAX1531423951169,NYCDSA,NYCDSA,,other,2018-07-12 14:32:31,2018-07-12 14:32:31,False,False,False,...,"iPhone9,2",iPhone9,1.0,1.0,0.0,Verizon,11.4,1.0,en-US,True
3,RZsOYGrUlkXWxYGSi1532103589490,NYCDSA,NYCDSA,,other,2018-07-20 11:19:49,2018-07-20 11:19:49,False,False,False,...,"iPhone10,5",iPhone10,1.0,0.0,0.0,AT&T,11.4.1,1.0,en-US,True
4,ZsFZyqyShAPBMwkLu1534274912488,NYCDSA,NYCDSA,,female,2018-08-14 14:28:32,2018-08-14 14:28:32,False,False,False,...,"iPhone8,1",iPhone8,1.0,0.0,0.0,AT&T,11.3,1.0,en-US,False


In [152]:
all_user.columns

Index(['user_id', 'first_name', 'last_name', 'address_zip', 'gender',
       'signup_date', 'date_last_login', 'visited_wishlist',
       'visited_shopping', 'finished_approval', 'has_shared', 'safari_ext',
       'chrome_ext', 'firefox_ext', 'registered', 'push_enabled',
       'subscribed_onboarding_emails', 'subscribed_onsale_alerts',
       'subscribed_return_reminders', 'subscribed_finery_updates',
       'subscribed_misc_emails', 'subscribed_no_emails',
       'wishlist_notifications', 'timezone', 'set_avatar', 'style_age_range',
       'style_age_range_group', 'items_in_wardrobe', 'items_in_wishlist',
       'location_latitude', 'location_longitude', 'notifications_enabled_ios',
       'style_brands_selected', 'style_size_preference_none',
       'style_size_preference_petite', 'style_size_preference_extra_long',
       'style_size_preference_plus', 'style_size_preference_maternity',
       'style_size_preference_skipped', 'email_provided',
       'notifications_enabled_desktop'

In [153]:
col_names = ['user_id', 'style_age_range_group', 'style_vibe', 
            'style_most_important_active', 'style_most_important_any',
            'style_most_important_beach', 'style_most_important_dress',
            'style_most_important_bags', 'style_most_important_jeans',
            'style_most_important_jump', 'style_most_important_nothing',
            'style_most_important_outwear', 'style_most_important_pants',
            'style_most_important_shoes', 'style_most_important_tops',
            'style_shopping_pref_gaps',
            'style_shopping_pref_other', 'style_shopping_pref_trips',
            'style_shopping_pref_events', 'style_shopping_pref_work',
            'style_shopping_pref_wish', 'style_shopping_pref_organize',
            'style_shopping_pref_inspo']
subset_user = all_user[col_names]
subset_user.sample(10)

Unnamed: 0,user_id,style_age_range_group,style_vibe,style_most_important_active,style_most_important_any,style_most_important_beach,style_most_important_dress,style_most_important_bags,style_most_important_jeans,style_most_important_jump,...,style_most_important_shoes,style_most_important_tops,style_shopping_pref_gaps,style_shopping_pref_other,style_shopping_pref_trips,style_shopping_pref_events,style_shopping_pref_work,style_shopping_pref_wish,style_shopping_pref_organize,style_shopping_pref_inspo
60,kApFwAhYdSINjyXfo1540229647916,2.0,Columbia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
92,dQfXMvWJyvUHVutfH1539659776338,3.0,Brentwood,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
81,tWbOJaDvCdPEyoKZQ1504447154564,,,,,,,,,,...,,,,,,,,,,
78,VjTpazlgFXyRQOHfR1533238049769,0.0,Lake Buena Vista,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
34,MCqqMgVnKRhkrvlQf1538703977615,0.0,San Antonio,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
73,AnTAJILkrweOQEOxt1539568378197,0.0,Paterson,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
25,lTbOURAMnUHxzFLDe1524543123801,,,,,,,,,,...,,,,,,,,,,
91,nFwMokjyOpgInoLMG1529282181454,2.0,Orlando,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0
97,EqntCbpopoKSFtPnw1528762526242,1.0,Dover,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
99,HXhDceGvGpoNlIsqu1538503539686,0.0,Buffalo,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [154]:
subset_user['style_vibe'] = subset_user['style_vibe'].str.split(",").str[0].str.lower()
subset_user["style_age_range_group"] = subset_user["style_age_range_group"].apply(lambda x: choices([0, 1, 2], [0.4, 0.3, 0.3])[0] if np.isnan(x) else x)
subset_user["style_vibe"] = subset_user["style_vibe"].fillna(choices(subset_user['style_vibe'].unique())[0])

col_names = subset_user.filter(like='style_shopping').columns
col_names = col_names.append(subset_user.filter(like='style_most_important').columns)
subset_user[col_names] = subset_user[col_names].fillna(choices([0,1])[0])

In [155]:
temp = subset_user.isnull().sum().sort_values(ascending = False)
temp[temp != 0]

Series([], dtype: int64)

In [156]:
subset_user.sample(10)

Unnamed: 0,user_id,style_age_range_group,style_vibe,style_most_important_active,style_most_important_any,style_most_important_beach,style_most_important_dress,style_most_important_bags,style_most_important_jeans,style_most_important_jump,...,style_most_important_shoes,style_most_important_tops,style_shopping_pref_gaps,style_shopping_pref_other,style_shopping_pref_trips,style_shopping_pref_events,style_shopping_pref_work,style_shopping_pref_wish,style_shopping_pref_organize,style_shopping_pref_inspo
48,IuHNuWCWAWmBNCnIf1550516903728,1.0,rockland,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
97,EqntCbpopoKSFtPnw1528762526242,1.0,dover,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
69,ekPVMkLjkwczXDFKi1548523614882,2.0,miami,0.0,1.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0
40,EepjfcrrjDCVkSKLV1543110769237,0.0,huntington beach,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
83,OZlgrfjTOrvkISNOG1543923705097,2.0,buffalo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
103,OQRzBejakqdnqxhKt1546970903292,0.0,winchester,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
38,fRFEOMmCdksyXgfVr1516818075393,0.0,buffalo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
70,uDlpcEvrISMoLvJKh1517962134688,0.0,buffalo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96,PPVeikiBAXzIeTrto1519233344185,0.0,buffalo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
84,XMKykzmfckGaIThFs1515379240540,1.0,buffalo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Item Data

In [157]:
item_wishlist.head()

Unnamed: 0,_id,item_id,user_id,brand_id,user_provided_brand_name,parsed_brand_name,store_id,user_provided_store_name,parsed_store_name,product_id,...,sharelinkshort,sale,saleprice,saleurl,share_item_id,saledate,collections,stylesdate,category_name,category_id
0,209561,58d5c57b6b21659a9bf9afc7,ekteZyevBdOMLFZds1490404459978,loft,,Loft,loft,,loft,58d5239899563e3bf98e7033,...,,,,,,,,,Jackets,152.0
1,209593,58d5c59f6b21659a9bf9affc,ekteZyevBdOMLFZds1490404459978,,,,nordstrom,,Nordstrom,58d5c59f7c8a96c9a044f19d,...,,,,,,,,,Miscellaneous,600.0
2,209657,58d5c54d6b21659a9bf9afa2,ekteZyevBdOMLFZds1490404459978,,,,nordstrom,,Nordstrom,58d5c54d5e4fefcabb201c3c,...,,,,,,,,,Miscellaneous,600.0
3,209881,58d5c6bc6b21659a9bf9b102,ekteZyevBdOMLFZds1490404459978,,,,sephora,,Sephora,58d56e9ec9f4693c00b913e1,...,,,,,,,,,Beauty,500.0
4,209913,58d5c7736b21659a9bf9b144,ekteZyevBdOMLFZds1490404459978,fabletics,,Fabletics,fabletics,,Fabletics,58d13a07b55a7764b1a607d4,...,,,,,,,,,Activewear,160.0


In [158]:
item_wishlist.columns

Index(['_id', 'item_id', 'user_id', 'brand_id', 'user_provided_brand_name',
       'parsed_brand_name', 'store_id', 'user_provided_store_name',
       'parsed_store_name', 'product_id', 'closet_id', 'is_deleted',
       'delete_reason', 'item_name', 'item_name_lower', 'indix_category',
       'product_category_id', 'order_number', 'currency', 'paid_price',
       'list_price', 'sale_price', 'price_range', 'price_filter',
       'order_total_amt', 'order_discount', 'shipping_total',
       'parsed_sale_price', 'size', 'is_vintage', 'is_favorite', 'email_id',
       'added_ts', 'email_ts', 'email_dt', 'sort_ts', 'approval_status',
       'approval_status_dt', 'delete_ts', 'is_for_sale', 'color_parsed',
       'is_returned', 'returned_ts', 'season_id', 'source', 'is_public',
       'sharelink_short', 'is_visible', 'return_deadline_dt', 'has_duplicates',
       'is_mozenda_item', 'indix_added_ts', 'indix_start_ts', 'donation_descr',
       'donation_dt', 'is_price_adjusted', 'is_preprocess

In [159]:
col_names = ['user_id', 'product_id', 'brand_id', 'store_id', 'is_deleted', 'item_name_lower', 
             'category_name', 'category_id', 'paid_price', 'color_parsed', 'wishlist']
subset_item = item_wishlist[col_names]
subset_item['brand_id'] = subset_item['brand_id'].str.replace(' ', '')
subset_item['brand_id'] = subset_item['brand_id'].str.replace('-', '')
subset_item['brand_id'] = subset_item['brand_id'].str.replace('&', '')
subset_item['store_id'] = subset_item['store_id'].str.replace(' ', '')
subset_item['store_id'] = subset_item['store_id'].str.replace('-', '')
subset_item['store_id'] = subset_item['store_id'].str.replace('&', '')
subset_item.sample(10)

Unnamed: 0,user_id,product_id,brand_id,store_id,is_deleted,item_name_lower,category_name,category_id,paid_price,color_parsed,wishlist
10611,hktGDcSbmApUzRhGi1495117820151,5a14a1700f952e002ab4e65f,,lucy,False,yoga girl tunic,Kids,610.0,0.0,,0
8014,rnGHYPxaUoXNNFgah1530899668174,595e6d318de2820049c6b7c8,athleticpropulsionlabs,saksfifthavenue,False,techloom phantom mesh sneakers,Sneakers,250.0,165.0,BLUSH-CREAM,0
10314,yZIcDJCRfeunHavTa1541215258673,gurMnASmRPSnjyQiM1542953051589,,walmart,False,3 year protection plan for electronics 8099 99,Kids,610.0,11.0,,0
11361,nFwMokjyOpgInoLMG1529282181454,HXLVCQEgbmxhrvWCW1526660549018,jcrewfactory,jcrewfactory,False,suede slides with pompoms navy fuchsia,Sandals,230.0,29.5,,0
9572,EeacyONVQCDWGfZxW1538675291370,MekXhbdrxmoojeGbN1538885317942,,,False,pale blue and lace singlet,Tops,110.0,0.0,,0
8974,mGVOSHGmoezYAFgmK1537259845783,5ac68c7b40382b002a3e0c64,,nordstrom,False,lush raw edge side slit tee,T Shirts,112.0,15.9,white,0
3145,KMHJBRMJIkrCcUcrA1539389830936,5a2c3770105f5c002a3c623b,victoriassecret,victoriassecret,False,bracelet and fragrance set,Miscellaneous,600.0,0.0,Rollerball And Bangle (099),0
13460,riHLmJzrhQHEOOOol1540519065537,NBNAYOufCeawnPtLE1540519297434,jcrewfactory,jcrewfactory,False,factory 4 chino short old red,Shorts,122.0,19.5,,0
9315,mGVOSHGmoezYAFgmK1537259845783,5ac9631ae44e39002a156116,,nordstrom,False,est e lauder renutriv ultimate lift regenerati...,Kids,610.0,0.0,no color,0
7438,fRFEOMmCdksyXgfVr1516818075393,5a68d060d900a1002a4c0670,puma,hautelook,False,low cut sock pack of 6,Miscellaneous,600.0,8.0,,0


In [160]:
print(subset_item.shape)
temp = subset_item.isnull().sum().sort_values(ascending = False)
temp[temp != 0]

(14756, 11)


color_parsed       6898
brand_id           5302
category_id        2106
category_name      2106
store_id            514
item_name_lower     154
paid_price          141
dtype: int64

# User Number of Item

In [82]:
subset_item["has_product"] = [0 if deleted == True else 1 for deleted in subset_item["is_deleted"]]
user_product = subset_item.groupby(["user_id", "product_id"])[["has_product"]].sum().reset_index()
user_product.sort_values(by = "has_product", ascending = False).head(10)

Unnamed: 0,user_id,product_id,has_product
12538,uDlpcEvrISMoLvJKh1517962134688,5a7f0678313a42002a11b600,12
8991,mGVOSHGmoezYAFgmK1537259845783,5a4eb9c3705b39002a5292a9,9
3736,YEHlhimKVpTnlltGw1514483609267,5abfbd7a4891e2002a81a23e,9
10773,nFwMokjyOpgInoLMG1529282181454,civwfgMjKafCWPRNf1541925435319,9
8843,mGVOSHGmoezYAFgmK1537259845783,59421bc64b76b30090ee0923,8
3995,ZLiGJwNkCDnQBMSQY1497978280616,586bc18f729f834bbecc0392,7
8672,kpuiZwKNzWruIrCea1531873923973,5aaff4609c5d63002a4ec016,7
9070,mGVOSHGmoezYAFgmK1537259845783,5aaa56c9158fd1002a25061e,7
10702,nFwMokjyOpgInoLMG1529282181454,IJCCuqOkkGgFftmNR1541843547485,6
3783,YEHlhimKVpTnlltGw1514483609267,fAJDvrAYjZrPOwZpO1532339843431,6


# User-Influencer Matrix

In [84]:
influencers = ['weworewhat', 'chiaraferragni', 'blaireadiebee', 'somethingnavy', 'hannahbronfman',
               'nicolettemason', 'manrepeller', 'jordynwoods', 'seaofshoes', 'ariellecharnas']

In [85]:
user_influencer = pd.DataFrame(data=0, index=all_user.user_id, columns=influencers)

for row in range(all_influencer.shape[0]):
    user = all_influencer['user_id'][row]
    influ = all_influencer['style_who_inspiries'][row]
    for influencer in influencers:
        if influ.find(influencer) != -1:
            user_influencer[influencer][user] += 1 

user_influencer.head(10)

Unnamed: 0_level_0,weworewhat,chiaraferragni,blaireadiebee,somethingnavy,hannahbronfman,nicolettemason,manrepeller,jordynwoods,seaofshoes,ariellecharnas
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
SBmFVmiyziheGcUXF1506291233259,0,0,0,0,0,0,0,0,0,0
DHvdjwEmqSXpYONGd1514587641548,0,0,0,0,0,0,0,0,0,0
rboNfQfmMjQrAzLAX1531423951169,0,1,1,1,0,0,0,0,0,0
RZsOYGrUlkXWxYGSi1532103589490,0,1,0,0,0,1,0,0,0,0
ZsFZyqyShAPBMwkLu1534274912488,1,1,1,1,0,0,0,0,1,0
wPSJznugsLxGTVzSt1535641042977,1,0,0,0,1,1,0,0,0,0
KMHJBRMJIkrCcUcrA1539389830936,0,1,0,0,0,0,0,0,0,0
zaUVbGzoQMyxwhxYe1532180138666,0,0,0,1,0,0,0,0,0,0
rZGWJDRvvWjBLKfGF1532437479456,1,0,0,0,0,0,0,0,0,0
bqnoAjxnAbwhntJuU1534200663687,1,0,0,0,0,0,0,1,0,0


In [86]:
user_user_byinfluencer = pd.DataFrame(data=cosine_similarity(user_influencer), 
                         index=all_user.user_id, columns=all_user.user_id)
user_user_byinfluencer.head(10)

user_id,SBmFVmiyziheGcUXF1506291233259,DHvdjwEmqSXpYONGd1514587641548,rboNfQfmMjQrAzLAX1531423951169,RZsOYGrUlkXWxYGSi1532103589490,ZsFZyqyShAPBMwkLu1534274912488,wPSJznugsLxGTVzSt1535641042977,KMHJBRMJIkrCcUcrA1539389830936,zaUVbGzoQMyxwhxYe1532180138666,rZGWJDRvvWjBLKfGF1532437479456,bqnoAjxnAbwhntJuU1534200663687,...,kIVuVbYAijKgchjBU1547080163262,IxwWDxsICtdFQonXO1538002122412,yDRpRuRkrutKtmZoH1531428926861,jFJkZZkzfcgdpYaWz1546915104530,bgfAtVdSyWtCCasub1535124717025,riTOVCwwBFvAjaBrX1504480921172,iDDoOGOZnwaAKmzpF1531966340209,aDkFJvBQABdodOhRe1485369350003,IUIwmBWjpuwFTuILv1541366448241,fcxfBirJmkApcCtCb1539277708892
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SBmFVmiyziheGcUXF1506291233259,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DHvdjwEmqSXpYONGd1514587641548,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
rboNfQfmMjQrAzLAX1531423951169,0.0,0.0,1.0,0.408248,0.774597,0.0,0.57735,0.57735,0.0,0.0,...,0.0,0.57735,0.57735,0.0,0.288675,0.0,0.0,0.0,0.471405,0.774597
RZsOYGrUlkXWxYGSi1532103589490,0.0,0.0,0.408248,1.0,0.316228,0.408248,0.707107,0.0,0.0,0.0,...,0.0,0.707107,0.0,0.0,0.707107,0.0,0.0,0.0,0.57735,0.632456
ZsFZyqyShAPBMwkLu1534274912488,0.0,0.0,0.774597,0.316228,1.0,0.258199,0.447214,0.447214,0.447214,0.316228,...,0.258199,0.447214,0.447214,0.0,0.447214,0.0,0.447214,0.0,0.547723,0.6
wPSJznugsLxGTVzSt1535641042977,0.0,0.0,0.0,0.408248,0.258199,1.0,0.0,0.0,0.57735,0.408248,...,0.333333,0.0,0.0,0.57735,0.288675,0.0,0.57735,0.0,0.707107,0.258199
KMHJBRMJIkrCcUcrA1539389830936,0.0,0.0,0.57735,0.707107,0.447214,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.5,0.0,0.0,0.0,0.408248,0.447214
zaUVbGzoQMyxwhxYe1532180138666,0.0,0.0,0.57735,0.0,0.447214,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.447214
rZGWJDRvvWjBLKfGF1532437479456,0.0,0.0,0.0,0.0,0.447214,0.57735,0.0,0.0,1.0,0.707107,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.408248,0.0
bqnoAjxnAbwhntJuU1534200663687,0.0,0.0,0.0,0.0,0.316228,0.408248,0.0,0.0,0.707107,1.0,...,0.0,0.0,0.0,0.0,0.353553,0.0,0.707107,0.0,0.57735,0.0


# Collaborative Filtering Model

In [101]:
import turicreate

ModuleNotFoundError: No module named 'turicreate'

In [90]:
user_product_matrix = user_product.pivot(index='user_id', columns='product_id', values='has_product')
print(user_product_matrix.shape)
user_product_matrix.sample(10)

(102, 13815)


product_id,585dfe712025900f7bf9b208,585dfe872025900f7bf9b423,585e3f7bf964040f7ca803ee,585e400b7bc3400f7eabf596,585e40257bc3400f7eabf756,585e402c7bc3400f7eabf777,585e57247bc3400f7eac2bce,585e6b0c4c0f100f75cd4947,585e6b50f964040f7ca84b85,585e6c52f964040f7ca8619d,...,zspDMRKVQpOgioDqe1528762750201,ztWJubYXsWwkTSOoy1538899220203,zwVmffrEcpApJzWOc1539757883825,zwVsAswExcRunQUGZ1528656933085,zwltgAQjUQkxKkQBd1540004033561,zxWJeAvipxLgHpbsq1530473551544,zyQFIWnriEdviqyfr1533737714982,zzMmAqKWqSRlMKEbU1533761869194,zzUHFhCNBrZKVdYBG1525876112406,zzxgLZEWzoPBJCdNO1545211406502
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
EqntCbpopoKSFtPnw1528762526242,,,,,,,,,,,...,1.0,,,,,,,,,
TBNIjHeYCmBfqivUF1530193158929,,,,,,,,,,,...,,,,,,,,,,
zpYxFnSEGEOUlTMAf1534260940406,,,,,,,,,,,...,,,,,,,,,,
tHhIUZVexXFTxILWn1538337292724,,,,,,,,,,,...,,,,,,,,,,
rnGHYPxaUoXNNFgah1530899668174,,,,,,,,,,,...,,,,,,,,,,
iDDoOGOZnwaAKmzpF1531966340209,,,,,,,,,,,...,,,,,,,,,,
yDRpRuRkrutKtmZoH1531428926861,,,,,,,,,,,...,,,,,,,,,,
rboNfQfmMjQrAzLAX1531423951169,,,,,,,,,,,...,,,,,,,,,,
ZcXluwdEXFlwqAXQK1527387850533,,,,,,,,,,,...,,,,,,,,,,
SXzhYRlUGUGYBivLx1534359866087,,,,,,,,,,,...,,,,,,,,,,


In [91]:
from sklearn.metrics.pairwise import pairwise_distances 
user_similarity = pairwise_distances(user_product_matrix, metric='cosine')
item_similarity = pairwise_distances(user_product_matrix.T, metric='cosine')

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

# Old Stuffs

In [None]:
def similarity_matrix(df, col_similar, variable):
    subset = df[[col_similar, variable]]
    subset = pd.get_dummies(subset, columns=[variable]).set_index(col_similar)
    return pd.DataFrame(data=cosine_similarity(subset), index=df[col_similar], columns=df[col_similar])

In [None]:
all_user["style_age_range_group"] = all_user["style_age_range_group"].apply(lambda x: choices([0, 1, 2], [0.4, 0.3, 0.3])[0] if np.isnan(x) else x)
user_user_byage = similarity_matrix(all_user, "user_id", "style_age_range_group")
user_user_byage.head(10)

In [None]:
print("Shape user-user matrix by influencer: " + str(user_user_byinfluencer.shape))
print("Shape user-user matrix by age: " + str(user_user_byage.shape))

In [None]:
user_user_byinfluencer.dot(user_user_byage)

In [None]:
all_item['is_deleted'].value_counts()

In [None]:
all_item["has_product"] = [0 if deleted == True else 1 for deleted in all_item["is_deleted"]]

In [None]:
user_product_matrix = all_item.groupby(["user_id", "product_id"])[["has_product"]].sum().reset_index()
max_value = user_product_matrix["has_product"].max()
min_value = user_product_matrix["has_product"].min()
user_product_matrix = user_product_matrix.pivot(index='user_id', columns='product_id', values='has_product')
user_product_matrix.head(10)

In [None]:
user_product_matrix = (user_product_matrix-min_value)/(max_value-min_value)
user_product_matrix.head(10)

In [None]:
all_data = item_wishlist.merge(user_influencer, how = "left", on = "user_id")
all_data.shape