### The purpose of this notebook is to build a collaborative filtering method for recommendations

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math
from datetime import datetime, timedelta
import collections

from util_transaction_data import *

# Raise pandas' display limits (up to 500 columns / 200 rows) so the wide
# category tables below are rendered without column truncation.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 200)


In [4]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.preprocessing import *
from mlxtend.frequent_patterns import association_rules
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import fpmax

### first, prepare the data
 - we will remove the "system accounts", i.e., merchants 191 and 11, and also remove the cigarette categories
 - we will use "top_cat" as items
 - we will use the category names instead of category IDs
 - need to download the category name data file from https://drive.google.com/file/d/1PmuZ9XsTWWi4tA_8_qaqSFscYVZPX7LJ/view?usp=share_link
 

In [14]:
# Load the raw transaction lines.
filename = './AwanTunai_transaction_data.csv'
baskets_raw = pd.read_csv(filename)

# Rows to keep: everything except the two system accounts (merchants 191 and 11)
# and the cigarette top category (27) / sub category (86).
not_system_account = (baskets_raw.merchant_id != 191) & (baskets_raw.merchant_id != 11)
not_cigarette = (baskets_raw.top_cat != 27) & (baskets_raw.sub_cat != 86)

# Apply both filters in one pass, then discard rows with any missing value.
baskets = baskets_raw[not_system_account & not_cigarette].dropna()

# Attach the English top-category name to every row; the duplicated merge key
# `top_cat_id` is dropped again afterwards.
cat_mapping_file = './top_category_mapping_new.csv'
top_cat_names = pd.read_csv(cat_mapping_file)
category_lookup = top_cat_names[["top_cat_id", "top_cat_en"]]
df = baskets.merge(category_lookup, left_on="top_cat", right_on="top_cat_id")
baskets = df.drop(columns=["top_cat_id"])

In [15]:
# One list of category names per order (orders come out in sorted order_id order);
# a category appears once per row of the order, so it can repeat within a basket.
transaction_data = [
    list(order_categories.to_numpy())
    for _, order_categories in baskets.groupby('order_id')['top_cat_en']
]
transaction_data[:10]

[['general medicine', 'instant food', 'instant food'],
 ['general medicine', 'powder drink'],
 ['sweetend condensed milk', 'packaged drink'],
 ['general medicine', 'instant food'],
 ['instant food'],
 ['general medicine'],
 ['general medicine', 'powder drink', 'packaged drink'],
 ['general medicine', 'powder drink'],
 ['sweetend condensed milk'],
 ['powder drink']]

In [16]:
def oneHotCoding(transaction_data):
    """One-hot encode a list of transactions.

    Parameters
    ----------
    transaction_data : list of lists
        One inner list of item labels per transaction.

    Returns
    -------
    pandas.DataFrame
        One row per transaction and one column per distinct item label found
        by the encoder, built from the encoder's transformed array.
    """
    encoder = TransactionEncoder()
    encoder.fit(transaction_data)
    encoded = encoder.transform(transaction_data)
    return pd.DataFrame(encoded, columns=encoder.columns_)

transaction_oneHot = oneHotCoding(transaction_data)

In [17]:
# Display the one-hot encoded transactions returned by oneHotCoding.
transaction_oneHot

Unnamed: 0,baby care,baby food,bath soap,body care,bottled water,breakfast food,canned food,carpentry tools,cream soap,dental care,dish soap,external medicine,floor washing soap,food materials,gas lighter,general medicine,hair care,household general supplis,household hygiene supplies,ingredients,instant food,laundry soap,medical supplies,packaged drink,packaged tea,playing cards,powder drink,sachet drink,snack,stationary,sweetend condensed milk,syrup,toys,womens hygiene supplies
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23955,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,True,False,False,False,True,False,False,False,False,True,False,False,False,False,False
23956,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,True,False,True,False,False,False,False,False
23957,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,True,False,True,True,True,False,False,False,False,False
23958,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False


In [18]:
# Column-wise sum of the boolean matrix: the number of transactions that
# contain each category.
transaction_oneHot.sum()

baby care                      1771
baby food                       493
bath soap                      3102
body care                      2926
bottled water                   119
breakfast food                   99
canned food                      31
carpentry tools                 168
cream soap                      182
dental care                    2260
dish soap                       400
external medicine              2195
floor washing soap              797
food materials                 1075
gas lighter                       8
general medicine               9489
hair care                      2848
household general supplis      2947
household hygiene supplies     3152
ingredients                   10910
instant food                   9294
laundry soap                   4798
medical supplies                374
packaged drink                 9104
packaged tea                   1917
playing cards                     8
powder drink                   7869
sachet drink                

### build user-item interaction matrix
 - use qty as the degree of "like", summed per merchant and category
 - fill in NA with zero for categories a merchant has not purchased before

In [23]:
# Merchant x category interaction matrix: each cell is the total quantity a
# merchant bought in that category. pivot_table already returns merchant_id as
# the index, so no reset_index/set_index round trip is needed.
# aggfunc is given as the string "sum": passing the callable np.sum is deprecated
# in recent pandas releases and emits a FutureWarning.
merchant_cat = (
    baskets
    .pivot_table(values='qty', index='merchant_id', columns='top_cat_en', aggfunc='sum')
    .fillna(0)  # merchant/category pairs with no purchase count as 0
)
merchant_cat.head(10)

top_cat_en,baby care,baby food,bath soap,body care,bottled water,breakfast food,canned food,carpentry tools,cream soap,dental care,dish soap,external medicine,floor washing soap,food materials,gas lighter,general medicine,hair care,household general supplis,household hygiene supplies,ingredients,instant food,laundry soap,medical supplies,packaged drink,packaged tea,playing cards,powder drink,sachet drink,snack,stationary,sweetend condensed milk,syrup,toys,womens hygiene supplies
merchant_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,1.0,0.0,0.0,6.0,2.0,0.0,5.0,0.0,0.0,3.0,5.0,9.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,6.0,4.0,4.0,0.0,8.0,0.0,0.0,2.0,21.0,9.0,0.0,0.0,0.0,0.0,2.0
3,2.0,1.0,10.0,10.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,9.0,0.0,7.0,0.0,40.0,56.0,16.0,0.0,13.0,16.0,0.0,15.0,45.0,35.0,1.0,32.0,6.0,5.0,11.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,6.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,10.0,0.0,0.0,0.0,0.0,2.0,2.0,3.0,0.0,0.0,0.0,0.0,1.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,20.0,0.0,0.0,0.0,33.0,1.0,10.0,0.0,5.0,2.0,0.0,7.0,6.0,2.0,0.0,3.0,0.0,0.0,0.0
10,0.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,10.0,0.0,0.0,0.0,35.0,0.0,2.0,0.0,6.0,8.0,8.0,0.0,2.0,0.0,0.0,6.0,10.0,18.0,0.0,0.0,0.0,0.0,2.0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,7.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0


In [24]:
# Inspect the last 10 merchants of the interaction matrix.
merchant_cat.tail(10)

top_cat_en,baby care,baby food,bath soap,body care,bottled water,breakfast food,canned food,carpentry tools,cream soap,dental care,dish soap,external medicine,floor washing soap,food materials,gas lighter,general medicine,hair care,household general supplis,household hygiene supplies,ingredients,instant food,laundry soap,medical supplies,packaged drink,packaged tea,playing cards,powder drink,sachet drink,snack,stationary,sweetend condensed milk,syrup,toys,womens hygiene supplies
merchant_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1
348,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,14.0,0.0,0.0,0.0,9.0,1.0,1.0,0.0,4.0,0.0,0.0,0.0,0.0,6.0,0.0,1.0,0.0,0.0,2.0
349,4.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,6.0,5.0,0.0,0.0,1.0,0.0,0.0,12.0,7.0,14.0,0.0,0.0,0.0,0.0,1.0
350,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,3.0,0.0,2.0,0.0,7.0,0.0,2.0,1.0,14.0,41.0,0.0,0.0,0.0,0.0,0.0,13.0,11.0,6.0,0.0,0.0,0.0,0.0,0.0
351,0.0,0.0,19.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,16.0,0.0,0.0,0.0,11.0,0.0,1.0,0.0,16.0,13.0,0.0,1.0,9.0,0.0,0.0,5.0,12.0,1.0,0.0,0.0,0.0,0.0,2.0
352,0.0,0.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,0.0,3.0,0.0,8.0,6.0,9.0,0.0,14.0,2.0,0.0,29.0,7.0,4.0,0.0,0.0,0.0,0.0,5.0
353,3.0,0.0,10.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,9.0,12.0,1.0,0.0,21.0,13.0,0.0,5.0,13.0,0.0,0.0,5.0,21.0,2.0,0.0,0.0,0.0,0.0,0.0
354,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,18.0,15.0,2.0,0.0,8.0,3.0,10.0,0.0,1.0,0.0,0.0,0.0,5.0,12.0,0.0,1.0,0.0,0.0,0.0
355,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,5.0,0.0,0.0,0.0,0.0,0.0
356,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,3.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,0.0,0.0,0.0,0.0,0.0
357,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,3.0,2.0,0.0,0.0,11.0,0.0,0.0,5.0,4.0,6.0,0.0,0.0,0.0,0.0,0.0


 - notice that merchant_id is the index, so rows are looked up by merchant id (e.g. `merchant_cat.loc[3]`)

In [113]:
# Label-based lookup: the row whose merchant_id is 3.
merchant_cat.loc[3]

top_cat_en
baby care                      2.0
baby food                      1.0
bath soap                     10.0
body care                     10.0
bottled water                  0.0
breakfast food                 0.0
canned food                    0.0
carpentry tools                0.0
cream soap                     1.0
dental care                    0.0
dish soap                      0.0
external medicine              3.0
floor washing soap             0.0
food materials                 0.0
gas lighter                    0.0
general medicine               9.0
hair care                      0.0
household general supplis      7.0
household hygiene supplies     0.0
ingredients                   40.0
instant food                  56.0
laundry soap                  16.0
medical supplies               0.0
packaged drink                13.0
packaged tea                  16.0
playing cards                  0.0
powder drink                  15.0
sachet drink                  45.0
snack    

In [86]:
# Correlation of every category column with the "baby food" column, computed
# across merchants (axis=0 compares columns).
merchant_cat.corrwith(merchant_cat["baby food"],axis=0)

top_cat_en
baby care                     0.396975
baby food                     1.000000
bath soap                     0.256129
body care                     0.554916
bottled water                -0.032959
breakfast food                0.057388
canned food                   0.034068
carpentry tools               0.211680
cream soap                    0.213045
dental care                   0.450265
dish soap                     0.273086
external medicine             0.500080
floor washing soap            0.431750
food materials                0.295532
gas lighter                  -0.024515
general medicine              0.406682
hair care                     0.539239
household general supplis     0.397212
household hygiene supplies    0.351180
ingredients                   0.405955
instant food                  0.288383
laundry soap                  0.475457
medical supplies              0.195857
packaged drink                0.351627
packaged tea                  0.432563
playing cards 

In [89]:
# argmin returns the integer position of the smallest correlation, not the
# category label; index merchant_cat.columns with it to get the name.
merchant_cat.corrwith(merchant_cat["baby food"]).argmin()

4

In [118]:
# Zero-filled square frame (category x category), labelled on both axes with
# the category names; used as a placeholder for the correlation matrix.
n_categories = merchant_cat.shape[1]
cat_cat_corr = pd.DataFrame(
    np.zeros((n_categories, n_categories)),
    index=merchant_cat.columns,
    columns=merchant_cat.columns,
)


In [120]:
# Inspect the "baby food" column of cat_cat_corr.
cat_cat_corr["baby food"]

top_cat_en
baby care                     0.0
baby food                     0.0
bath soap                     0.0
body care                     0.0
bottled water                 0.0
breakfast food                0.0
canned food                   0.0
carpentry tools               0.0
cream soap                    0.0
dental care                   0.0
dish soap                     0.0
external medicine             0.0
floor washing soap            0.0
food materials                0.0
gas lighter                   0.0
general medicine              0.0
hair care                     0.0
household general supplis     0.0
household hygiene supplies    0.0
ingredients                   0.0
instant food                  0.0
laundry soap                  0.0
medical supplies              0.0
packaged drink                0.0
packaged tea                  0.0
playing cards                 0.0
powder drink                  0.0
sachet drink                  0.0
snack                         0.0
sta

In [98]:
# Pairwise correlation between all category columns, computed across merchants.
# DataFrame.corr() builds the full matrix in one call, so no zero-filled frame
# has to be preallocated and then overwritten column by column with corrwith().
cat_cat_corr = merchant_cat.corr()
cat_cat_corr

top_cat_en,baby care,baby food,bath soap,body care,bottled water,breakfast food,canned food,carpentry tools,cream soap,dental care,dish soap,external medicine,floor washing soap,food materials,gas lighter,general medicine,hair care,household general supplis,household hygiene supplies,ingredients,instant food,laundry soap,medical supplies,packaged drink,packaged tea,playing cards,powder drink,sachet drink,snack,stationary,sweetend condensed milk,syrup,toys,womens hygiene supplies
top_cat_en,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1
baby care,1.0,0.396975,0.42912,0.634286,-0.028287,0.121747,0.038023,0.254697,0.280109,0.506486,0.394041,0.481889,0.333864,0.276506,-0.012051,0.561693,0.675722,0.461543,0.597172,0.47121,0.369565,0.635651,0.366956,0.445339,0.286175,0.245426,0.24693,0.534393,0.607664,0.240415,0.620062,0.436453,0.292105,0.632554
baby food,0.396975,1.0,0.256129,0.554916,-0.032959,0.057388,0.034068,0.21168,0.213045,0.450265,0.273086,0.50008,0.43175,0.295532,-0.024515,0.406682,0.539239,0.397212,0.35118,0.405955,0.288383,0.475457,0.195857,0.351627,0.432563,0.028049,0.163759,0.4742,0.472162,0.220575,0.490765,0.48614,0.081129,0.463957
bath soap,0.42912,0.256129,1.0,0.586018,0.31525,0.146864,0.271885,0.382762,0.339904,0.550965,0.502567,0.356403,0.48671,0.23644,0.381966,0.536326,0.4435,0.608718,0.418073,0.572721,0.446195,0.610728,0.281052,0.495931,0.448329,0.125664,0.439868,0.608294,0.600862,0.464209,0.652132,0.166001,0.097428,0.357376
body care,0.634286,0.554916,0.586018,1.0,0.034469,0.077678,0.095329,0.299216,0.418053,0.715938,0.502836,0.581634,0.501509,0.392706,0.056159,0.662165,0.785091,0.661178,0.659819,0.693779,0.563526,0.752631,0.316498,0.601675,0.511609,0.12519,0.493223,0.821326,0.782928,0.521343,0.704383,0.458364,0.32738,0.599876
bottled water,-0.028287,-0.032959,0.31525,0.034469,1.0,-0.016208,0.532108,-0.011692,-0.014162,0.008883,-0.022018,0.051882,-0.007415,-0.009434,0.815594,0.329878,0.009537,-0.007316,0.144469,0.098299,0.031553,0.056648,-0.008572,0.148457,0.015465,-0.011033,0.352462,0.115301,0.012034,0.227201,0.040603,-0.038009,-0.018108,-0.029099
breakfast food,0.121747,0.057388,0.146864,0.077678,-0.016208,1.0,-0.010645,0.16122,0.167128,0.084663,0.004858,0.094138,0.055343,0.094568,-0.010651,0.059587,0.084509,0.12578,0.151087,0.126466,0.070119,0.133741,0.101196,0.072014,0.080583,-0.029637,0.022489,0.124043,0.104,0.059091,0.093816,0.176451,-0.033631,0.063098
canned food,0.038023,0.034068,0.271885,0.095329,0.532108,-0.010645,1.0,-0.007687,0.022882,0.135017,0.069147,0.180663,0.037071,0.102421,0.656921,0.317651,0.038012,0.010351,0.225363,0.196452,0.052761,0.134321,0.109081,0.326119,0.053204,-0.008975,0.299692,0.193493,0.0834,0.182632,0.096834,-0.020562,-0.00858,0.07842
carpentry tools,0.254697,0.21168,0.382762,0.299216,-0.011692,0.16122,-0.007687,1.0,0.16343,0.534703,0.198858,0.388714,0.373845,0.245213,-0.012212,0.303479,0.175515,0.574169,0.232623,0.39128,0.329444,0.429717,0.157089,0.276287,0.102037,-0.018428,0.409426,0.375094,0.377912,0.406239,0.402863,0.135589,-0.01685,0.152312
cream soap,0.280109,0.213045,0.339904,0.418053,-0.014162,0.167128,0.022882,0.16343,1.0,0.387082,0.355452,0.30091,0.300514,0.278256,-0.009027,0.230772,0.38101,0.355522,0.269526,0.388788,0.276017,0.441454,0.157689,0.296533,0.219419,0.132559,0.137046,0.37012,0.322394,0.229128,0.373508,0.233154,0.062268,0.270975
dental care,0.506486,0.450265,0.550965,0.715938,0.008883,0.084663,0.135017,0.534703,0.387082,1.0,0.450743,0.570244,0.609413,0.566984,0.018501,0.506871,0.544103,0.751676,0.509367,0.686967,0.57676,0.746731,0.330415,0.513747,0.362973,0.075787,0.48943,0.779102,0.772121,0.498541,0.660625,0.269385,0.132642,0.408846


In [90]:
def cosine_similarity(v1, v2):
    """Return the cosine of the angle between two equal-length numeric vectors.

    Parameters
    ----------
    v1, v2 : sequence of numbers
        Anything ``np.dot`` accepts (lists, numpy arrays, pandas Series).

    Returns
    -------
    float
        ``dot(v1, v2) / sqrt(dot(v1, v1) * dot(v2, v2))``; NaN when either
        vector has zero norm, because the cosine is undefined in that case.
    """
    # Convert each squared norm to a Python float before multiplying: for
    # integer input np.dot returns a fixed-width integer, and the product of two
    # such values can overflow silently and yield a wrong similarity.
    squared_norm_product = float(np.dot(v1, v1)) * float(np.dot(v2, v2))
    if squared_norm_product == 0.0:
        # Zero vector: return NaN explicitly instead of dividing by zero
        # (which produced NaN plus a RuntimeWarning).
        return float("nan")
    return np.dot(v1, v2) / math.sqrt(squared_norm_product)
cosine_similarity(v1=[1, 20.0, 9.0], v2=[4, 5.0, 6])

0.8201403991329932

In [121]:
# Cosine similarity between two category columns of the merchant x category matrix.
cosine_similarity(merchant_cat["baby food"], merchant_cat["body care"])

0.643400424822213

In [92]:
# Zero-filled square frame (category x category) that will hold the pairwise
# cosine similarities; both axes are labelled with the category names.
n_categories = merchant_cat.shape[1]
cat_cat_cos = pd.DataFrame(
    np.zeros((n_categories, n_categories)),
    index=merchant_cat.columns,
    columns=merchant_cat.columns,
)
cat_cat_cos.head(10)

top_cat_en,baby care,baby food,bath soap,body care,bottled water,breakfast food,canned food,carpentry tools,cream soap,dental care,dish soap,external medicine,floor washing soap,food materials,gas lighter,general medicine,hair care,household general supplis,household hygiene supplies,ingredients,instant food,laundry soap,medical supplies,packaged drink,packaged tea,playing cards,powder drink,sachet drink,snack,stationary,sweetend condensed milk,syrup,toys,womens hygiene supplies
top_cat_en,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1
baby care,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
baby food,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
bath soap,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
body care,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
bottled water,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
breakfast food,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
canned food,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
carpentry tools,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cream soap,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
dental care,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [93]:
# Fill the preallocated similarity frame cell by cell.
# Each value is written with a single .loc[row, column] assignment: chained
# indexing (frame[col][row] = value) assigns into an intermediate object and,
# under pandas Copy-on-Write, leaves cat_cat_cos unchanged.
categories = merchant_cat.columns
for col_cat in categories:
    for row_cat in categories:
        cat_cat_cos.loc[row_cat, col_cat] = cosine_similarity(merchant_cat[col_cat], merchant_cat[row_cat])
cat_cat_cos

top_cat_en,baby care,baby food,bath soap,body care,bottled water,breakfast food,canned food,carpentry tools,cream soap,dental care,dish soap,external medicine,floor washing soap,food materials,gas lighter,general medicine,hair care,household general supplis,household hygiene supplies,ingredients,instant food,laundry soap,medical supplies,packaged drink,packaged tea,playing cards,powder drink,sachet drink,snack,stationary,sweetend condensed milk,syrup,toys,womens hygiene supplies
top_cat_en,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1
baby care,1.0,0.518093,0.518121,0.720958,0.023034,0.202164,0.085742,0.346284,0.362601,0.606885,0.48687,0.600046,0.445397,0.356043,0.016845,0.672577,0.752105,0.582102,0.692899,0.592674,0.481504,0.72918,0.490131,0.566703,0.405192,0.281708,0.3897,0.655856,0.698843,0.360301,0.71931,0.559574,0.361997,0.718333
baby food,0.518093,1.0,0.360655,0.6434,0.010896,0.134911,0.075877,0.298256,0.295087,0.547127,0.372953,0.596808,0.51473,0.364641,0.001135,0.532664,0.630524,0.512339,0.48189,0.521924,0.398917,0.587562,0.331325,0.47306,0.514558,0.084723,0.300552,0.587768,0.576005,0.327876,0.600893,0.5821,0.169163,0.569839
bath soap,0.518121,0.360655,1.0,0.64695,0.328282,0.203722,0.29099,0.438741,0.396921,0.613369,0.558978,0.455469,0.54858,0.300618,0.37772,0.607723,0.529539,0.664754,0.509128,0.635347,0.516351,0.667264,0.3793,0.569603,0.514251,0.165538,0.51402,0.665138,0.658874,0.524933,0.699644,0.292447,0.168662,0.457994
body care,0.720958,0.6434,0.64695,1.0,0.077526,0.164359,0.135457,0.38363,0.478925,0.772586,0.576399,0.67688,0.582224,0.454247,0.076302,0.747314,0.835642,0.736779,0.740547,0.763967,0.638549,0.815582,0.44967,0.688489,0.589451,0.177731,0.587403,0.866666,0.833285,0.591577,0.780983,0.576516,0.391911,0.693163
bottled water,0.023034,0.010896,0.328282,0.077526,1.0,0.003867,0.537029,0.015327,0.011741,0.049365,0.011937,0.091417,0.027771,0.015453,0.815896,0.331111,0.055575,0.038542,0.173194,0.132375,0.065678,0.099725,0.031737,0.175942,0.048765,0.003092,0.360972,0.149365,0.057177,0.245763,0.087043,0.010194,0.005349,0.02144
breakfast food,0.202164,0.134911,0.203722,0.164359,0.003867,1.0,0.011412,0.206597,0.210291,0.160263,0.071577,0.175738,0.122048,0.140178,0.00088,0.154707,0.169749,0.202007,0.22734,0.204151,0.138941,0.217937,0.172039,0.153629,0.144613,0.0,0.099305,0.21091,0.185547,0.121624,0.186483,0.24488,0.015045,0.150688
canned food,0.085742,0.075877,0.29099,0.135457,0.537029,0.011412,1.0,0.022019,0.050043,0.167414,0.1009,0.209384,0.072849,0.125733,0.658108,0.325903,0.085418,0.058971,0.248377,0.223264,0.089124,0.170646,0.142956,0.3382,0.08756,0.006628,0.316501,0.220351,0.124518,0.207253,0.139634,0.030441,0.017032,0.120305
carpentry tools,0.346284,0.298256,0.438741,0.38363,0.015327,0.206597,0.022019,1.0,0.222298,0.581121,0.270931,0.45767,0.432673,0.296581,0.003502,0.390735,0.279088,0.615509,0.327825,0.460081,0.395557,0.494573,0.248041,0.359668,0.187319,0.021199,0.468992,0.450395,0.449147,0.45908,0.472966,0.238635,0.047724,0.258737
cream soap,0.362601,0.295087,0.396921,0.478925,0.011741,0.210291,0.050043,0.222298,1.0,0.448459,0.410125,0.378263,0.363352,0.32524,0.005858,0.325409,0.447506,0.424212,0.35375,0.453432,0.34422,0.498989,0.244187,0.372528,0.289546,0.163749,0.223611,0.44091,0.397395,0.29507,0.443758,0.317783,0.11915,0.354
dental care,0.606885,0.547127,0.613369,0.772586,0.049365,0.160263,0.167414,0.581121,0.448459,1.0,0.526115,0.654698,0.666342,0.603501,0.040491,0.612486,0.636114,0.799394,0.609272,0.748429,0.642892,0.797988,0.444929,0.606196,0.456597,0.128384,0.574025,0.822624,0.816709,0.566893,0.732396,0.408323,0.215833,0.527914


In [94]:
# Name of the category least correlated with "baby food": argmin yields a
# position, which is mapped back to the column label.
baby_food_corr = merchant_cat.corrwith(merchant_cat["baby food"])
least_correlated_pos = baby_food_corr.argmin()
merchant_cat.columns[least_correlated_pos]

'bottled water'

In [122]:
# For each row, the column label holding the largest similarity.
# NOTE(review): the diagonal (self-similarity) is included, so each category
# can simply return itself; mask the diagonal to find the closest *other* category.
cat_cat_cos.idxmax(axis=1)

top_cat_en
baby care                                      baby care
baby food                                      baby food
bath soap                                      bath soap
body care                                      body care
bottled water                              bottled water
breakfast food                            breakfast food
canned food                                  canned food
carpentry tools                          carpentry tools
cream soap                                    cream soap
dental care                                  dental care
dish soap                                      dish soap
external medicine                      external medicine
floor washing soap                    floor washing soap
food materials                            food materials
gas lighter                                  gas lighter
general medicine                        general medicine
hair care                                      hair care
household general su

### need to drop "gas lighter" category

In [124]:
# Ten largest cosine similarities to "sachet drink" (the category itself is included).
cat_cat_cos["sachet drink"].nlargest(n=10)

top_cat_en
sachet drink                  1.000000
body care                     0.866666
snack                         0.850739
dental care                   0.822624
ingredients                   0.810975
laundry soap                  0.801544
sweetend condensed milk       0.800809
general medicine              0.757674
household general supplis     0.742071
household hygiene supplies    0.728934
Name: sachet drink, dtype: float64

In [125]:
# Ten largest correlations with "baby food".
# NOTE(review): cat_cat_corr must hold the computed correlations at this point;
# if the zero-filled placeholder cell was the last one to assign it, this shows only zeros.
cat_cat_corr["baby food"].nlargest(n=10)

top_cat_en
baby care          0.0
baby food          0.0
bath soap          0.0
body care          0.0
bottled water      0.0
breakfast food     0.0
canned food        0.0
carpentry tools    0.0
cream soap         0.0
dental care        0.0
Name: baby food, dtype: float64

In [54]:
# Name(s) of the index level(s) of the interaction matrix.
merchant_cat.index.names

FrozenList(['merchant_id'])

In [51]:
# Array of the merchant ids stored in the index.
merchant_id = merchant_cat.index.values
# Positional (0-based) lookup into that array; the position is not the merchant id
# itself (label lookup goes through merchant_cat.loc[...]).
merchant_id[348]

357

In [70]:
# Label-based lookup: the row whose merchant_id is 357.
merchant_cat.loc[357]

top_cat_en
baby care                      1.0
baby food                      0.0
bath soap                      0.0
body care                      0.0
bottled water                  0.0
breakfast food                 0.0
canned food                    0.0
carpentry tools                0.0
cream soap                     0.0
dental care                    0.0
dish soap                      0.0
external medicine              0.0
floor washing soap             0.0
food materials                 0.0
gas lighter                    0.0
general medicine               0.0
hair care                      0.0
household general supplis      0.0
household hygiene supplies    10.0
ingredients                    3.0
instant food                   2.0
laundry soap                   0.0
medical supplies               0.0
packaged drink                11.0
packaged tea                   0.0
playing cards                  0.0
powder drink                   5.0
sachet drink                   4.0
snack    

In [73]:
# Merchant to inspect.
m_id = 357
# Boolean-mask selection on the index: the result stays a DataFrame holding the
# matching row(s).
merchant_cat[merchant_cat.index == m_id]

top_cat_en,baby care,baby food,bath soap,body care,bottled water,breakfast food,canned food,carpentry tools,cream soap,dental care,dish soap,external medicine,floor washing soap,food materials,gas lighter,general medicine,hair care,household general supplis,household hygiene supplies,ingredients,instant food,laundry soap,medical supplies,packaged drink,packaged tea,playing cards,powder drink,sachet drink,snack,stationary,sweetend condensed milk,syrup,toys,womens hygiene supplies
merchant_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1
357,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,3.0,2.0,0.0,0.0,11.0,0.0,0.0,5.0,4.0,6.0,0.0,0.0,0.0,0.0,0.0


In [99]:
# Flatten the selected row(s) into a 1-D numpy array, the form passed to
# cosine_similarity below.
merchant_cat[merchant_cat.index == m_id].values.ravel()

array([ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0., 10.,  3.,  2.,  0.,  0., 11.,  0.,  0.,
        5.,  4.,  6.,  0.,  0.,  0.,  0.,  0.])

In [100]:
# Cosine similarity between the purchase profiles (rows) of merchants 357 and 1.
profile_357 = merchant_cat[merchant_cat.index == 357].values.ravel()
profile_1 = merchant_cat[merchant_cat.index == 1].values.ravel()
cosine_similarity(profile_357, profile_1)

0.5798335076138013

### in class: compute the similarity matrix for merchant-merchant
 - using cosine similarity
 - double check with pandas corrwith