# Customers Who Bought This Frequently Buy This!
In this notebook we will explore which items were frequently purchased together. Using this information, we can predict which items a customer will buy after we observe what they have already bought!

In [1]:
import cudf, gc
import numpy as np
import cv2, matplotlib.pyplot as plt
from os.path import exists
# Print the installed cuDF version, labelled as the RAPIDS version.
print('RAPIDS version',cudf.__version__)

RAPIDS version 21.10.01


# Load Transactions

In [2]:
# Cutoff date for the transaction log: only rows whose t_dat is strictly
# earlier than log_date are kept when the transactions are loaded.
# The commented-out values are alternative cutoff dates.
#log_date = '2020-09-23'
#log_date = '2020-09-16'
#log_date = '2020-09-09'
log_date = '2020-09-02'
# Number of most frequently co-purchased articles stored for each article.
Ntop = 2

In [3]:
# READ THE TRANSACTION LOG
# The shape printed here is that of the full file, before any date filtering.
df = cudf.read_csv('../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv')
print('Transactions shape',df.shape)

# Parse the purchase date, then keep only rows strictly before the cutoff date.
df['t_dat'] = cudf.to_datetime(df['t_dat'])
log_date_dt = cudf.to_datetime(log_date)
is_before_cutoff = df['t_dat'] < log_date_dt
df = df[is_before_cutoff]

Transactions shape (31788324, 5)


In [4]:
# SHRINK THE DATAFRAME
# Only the customer and article identifiers are used from here on.
df = df[['customer_id','article_id']]
# Replace the string customer_id with a 64-bit integer parsed from its last
# 16 hexadecimal characters.
customer_hex_tail = df['customer_id'].str[-16:]
df['customer_id'] = customer_hex_tail.str.hex_to_int().astype('int64')
# Store article ids as 32-bit integers.
df['article_id'] = df['article_id'].astype('int32')
display(df.head())

# Trigger a garbage-collection pass; the return value is discarded.
_ = gc.collect()

Unnamed: 0,customer_id,article_id
0,-6846340800584936,663713001
1,-6846340800584936,541518023
2,-8334631767138808638,505221004
3,-8334631767138808638,685687003
4,-8334631767138808638,685687004


# Find Items Purchased Together
We will use RAPIDS cuDF to speed up the dataframe search commands below.

In [5]:
# COUNT PURCHASES PER ARTICLE
# An article is kept only if it appears in more than this many purchase rows.
MIN_PURCHASES = 100
vc = df.article_id.value_counts()
# Drop rarely bought articles. An earlier note recorded that a cutoff of 50
# leaves roughly half of the articles; the cutoff actually applied is 100.
vc = vc[vc>MIN_PURCHASES]
vc

706016001    48822
706016002    34298
372860001    30956
610776002    29497
759871002    26238
             ...  
894674006      101
695166019      101
708274002      101
720800001      101
694131021      101
Name: article_id, Length: 43421, dtype: int32

In [6]:
# FIND ITEMS PURCHASED TOGETHER
# pairs maps each frequently bought article_id to an index holding up to Ntop
# other article_ids, taken from the head of the value counts of everything
# else bought by the customers who bought that article.
pairs = {}
# Intended as a copy that can be read without cuDF; it is not populated here.
pairs_np = {}
# Move the article ids to the host once, so the loop works with plain Python
# ints instead of converting a device scalar several times per iteration.
article_ids = vc.index.to_pandas().tolist()
for step, article in enumerate(article_ids):
    # Progress marker every 100 articles.
    if step%100==0: print(step,', ',end='')
    # Customers who bought this article.
    buyers = df.loc[df.article_id==article,'customer_id'].unique()
    # Purchase counts of the other articles bought by those customers (the article itself is excluded).
    co_counts = df.loc[(df.customer_id.isin(buyers))&(df.article_id!=article),'article_id'].value_counts()
    pairs[article] = co_counts.index[:Ntop]

0 , 100 , 200 , 300 , 400 , 500 , 600 , 700 , 800 , 900 , 1000 , 1100 , 1200 , 1300 , 1400 , 1500 , 1600 , 1700 , 1800 , 1900 , 2000 , 2100 , 2200 , 2300 , 2400 , 2500 , 2600 , 2700 , 2800 , 2900 , 3000 , 3100 , 3200 , 3300 , 3400 , 3500 , 3600 , 3700 , 3800 , 3900 , 4000 , 4100 , 4200 , 4300 , 4400 , 4500 , 4600 , 4700 , 4800 , 4900 , 5000 , 5100 , 5200 , 5300 , 5400 , 5500 , 5600 , 5700 , 5800 , 5900 , 6000 , 6100 , 6200 , 6300 , 6400 , 6500 , 6600 , 6700 , 6800 , 6900 , 7000 , 7100 , 7200 , 7300 , 7400 , 7500 , 7600 , 7700 , 7800 , 7900 , 8000 , 8100 , 8200 , 8300 , 8400 , 8500 , 8600 , 8700 , 8800 , 8900 , 9000 , 9100 , 9200 , 9300 , 9400 , 9500 , 9600 , 9700 , 9800 , 9900 , 10000 , 10100 , 10200 , 10300 , 10400 , 10500 , 10600 , 10700 , 10800 , 10900 , 11000 , 11100 , 11200 , 11300 , 11400 , 11500 , 11600 , 11700 , 11800 , 11900 , 12000 , 12100 , 12200 , 12300 , 12400 , 12500 , 12600 , 12700 , 12800 , 12900 , 13000 , 13100 , 13200 , 13300 , 13400 , 13500 , 13600 , 13700 , 13800 , 

In [7]:
# Mean number of paired articles stored per key of `pairs`.
total_paired = sum(len(items) for items in pairs.values())
print(total_paired/len(pairs))

2.0


In [8]:
# Write the co-purchase dictionary to disk; the file name records the cutoff
# date and the number of paired articles kept per article.
# NOTE(review): np.save stores a dict as a pickled object, so reading it back
# needs np.load(..., allow_pickle=True); because the values are cuDF index
# objects, loading presumably also requires cuDF to be installed - confirm.
pairs_path = f'pairs_cudf_{log_date}_{Ntop}.npy'
np.save(pairs_path, pairs)