### Generate Training data
- 学習データ作成用notebook
- 素性も同時に生成する


In [53]:
import numpy as np
import pandas as pd
import gc
import os
import time
import random
from tqdm.auto import tqdm
import datetime

In [54]:
def visualize_df(df):
    print(df.shape)
    display(df.head())

## Parameters

In [55]:
train_start_date = '2020-09-09' # 学習データの開始日
negative_num = 100 # negative samplingの数

In [118]:
# 外部から回す場合はここでパラメータ上書き

In [57]:
from datetime import datetime, date, timedelta
train_end_date = pd.to_datetime(train_start_date) + timedelta(days=7)
feature_date = pd.to_datetime(train_start_date) + timedelta(days=-7)

In [58]:
train_end_date = str(train_end_date.strftime('%Y-%m-%d'))
feature_date = str(feature_date.strftime('%Y-%m-%d'))

In [59]:
print(train_start_date, train_end_date, feature_date)

2020-09-09 2020-09-16 2020-09-02


## Read Data

In [60]:
# データの読み込み
dir = 'h-and-m-personalized-fashion-recommendations/'
path = '../input/' + dir 

transactions_train = pd.read_csv(path + 'transactions_train.csv')

In [61]:
# save memory https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
# 64文字もあるが、末尾16文字だけでcustomerを一意にid可能
# 使われている文字は0-9, a-fなので16進数で変換してintで扱える
# 16進数 = 4bit, 4bit * 16 / 8 = 8 byte
# 64 byte -> 8 byteに削減, int64で表現可能
transactions_train['customer_id'] = transactions_train['customer_id'].apply(lambda x: int(x[-16:],16) ).astype('int64')

In [62]:
# 提出の形式に合わせてarticle_idの最初に0を加える
#transactions_train['article_id'] = transactions_train['article_id'].map(lambda x: '0' + str(x))
# メモリ削減、復元するには上記の式
transactions_train['article_id'] = transactions_train['article_id'].astype('int32')

In [63]:
transactions_train['t_dat'] = pd.to_datetime(transactions_train['t_dat'])
transactions_train.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,-6846340800584936,663713001,0.050831,2
1,2018-09-20,-6846340800584936,541518023,0.030492,2
2,2018-09-20,-8334631767138808638,505221004,0.015237,2
3,2018-09-20,-8334631767138808638,685687003,0.016932,2
4,2018-09-20,-8334631767138808638,685687004,0.016932,2


In [64]:
# transactions_train['t_dat'].unique()[-7:]

# array(['2020-09-16', '2020-09-17', '2020-09-18', '2020-09-19',
#       '2020-09-20', '2020-09-21', '2020-09-22'], dtype=object)

In [65]:
train_data = transactions_train.query(f"'{train_start_date}' <= t_dat and t_dat < '{train_end_date}'").reset_index(drop=True)
visualize_df(train_data)

(255241, 5)


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2020-09-09,-7604547624187760215,399136061,0.08339,2
1,2020-09-09,-7604547624187760215,732842014,0.066712,2
2,2020-09-09,-7604547624187760215,556255001,0.01,2
3,2020-09-09,-7604547624187760215,852219003,0.008322,2
4,2020-09-09,-7604547624187760215,732842021,0.066712,2


In [66]:
feature_data = transactions_train.query(f" t_dat < '{train_start_date}'").reset_index(drop=True) # train feature

In [67]:
del transactions_train; gc.collect()

6

In [68]:
train_data

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2020-09-09,-7604547624187760215,399136061,0.083390,2
1,2020-09-09,-7604547624187760215,732842014,0.066712,2
2,2020-09-09,-7604547624187760215,556255001,0.010000,2
3,2020-09-09,-7604547624187760215,852219003,0.008322,2
4,2020-09-09,-7604547624187760215,732842021,0.066712,2
...,...,...,...,...,...
255236,2020-09-15,-977760742639762210,850917001,0.025407,1
255237,2020-09-15,38700952482392720,853316001,0.008458,1
255238,2020-09-15,38700952482392720,296366006,0.000847,1
255239,2020-09-15,38700952482392720,789769001,0.013542,1


In [69]:
feature_data

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,-6846340800584936,663713001,0.050831,2
1,2018-09-20,-6846340800584936,541518023,0.030492,2
2,2018-09-20,-8334631767138808638,505221004,0.015237,2
3,2018-09-20,-8334631767138808638,685687003,0.016932,2
4,2018-09-20,-8334631767138808638,685687004,0.016932,2
...,...,...,...,...,...
31292767,2020-09-08,4685485978980270934,919786002,0.042356,2
31292768,2020-09-08,795398326275572276,765308002,0.033881,2
31292769,2020-09-08,-8286316756823862684,689365050,0.010017,2
31292770,2020-09-08,-8286316756823862684,884081001,0.012898,2


## Add labels

### generate positive data

In [70]:
# 正解データのsetを作成
train_group = train_data.groupby('customer_id')['article_id'].apply(set).reset_index(name='rec_set')
visualize_df(train_group)

(72019, 2)


Unnamed: 0,customer_id,rec_set
0,-9223343869995384291,"{908292002, 903926002, 910601003, 865929007}"
1,-9223290575350349271,"{905518001, 912204009, 852584006, 757904007}"
2,-9223120303940804728,{866792001}
3,-9222973878991087276,"{835704001, 832884002, 783517002, 843777005, 7..."
4,-9222798684148120334,"{851374001, 806388018, 896064004, 854663002}"


### negative sampling

In [71]:
row_num = train_group.shape[0]
row_num

72019

In [72]:
negative = train_data.sample(negative_num * row_num, replace=True).reset_index(drop=True)
negative['index2'] = negative.index.astype(int)
negative['index2'] = negative['index2'] % row_num

In [73]:
negative = negative[['index2', 'article_id']]
negative

Unnamed: 0,index2,article_id
0,0,500435128
1,1,836142002
2,2,762143001
3,3,834906013
4,4,936012001
...,...,...
7201895,72014,903868001
7201896,72015,158340001
7201897,72016,929165002
7201898,72017,783346001


In [74]:
negative = negative.groupby('index2')['article_id'].apply(set).reset_index(name='negative_set')
negative = negative.drop('index2', axis='columns')
visualize_df(negative)

(72019, 1)


Unnamed: 0,negative_set
0,"{827968001, 878293001, 892309002, 885077003, 7..."
1,"{868053003, 892309003, 845610003, 905492001, 9..."
2,"{827968001, 739819010, 861803009, 804992014, 9..."
3,"{640001027, 579541001, 884245001, 792469001, 8..."
4,"{927957001, 909802001, 903359001, 913087001, 7..."


In [75]:
negative['negative_set'].apply(len).describe()

count    72019.000000
mean        97.951402
std          1.420884
min         89.000000
25%         97.000000
50%         98.000000
75%         99.000000
max        100.000000
Name: negative_set, dtype: float64

### Join positive and negative

In [76]:
train_group['rec_set'].apply(len).describe()

count    72019.000000
mean         3.164582
std          2.929903
min          1.000000
25%          1.000000
50%          2.000000
75%          4.000000
max         47.000000
Name: rec_set, dtype: float64

In [77]:
train_group = pd.concat([train_group, negative], axis=1)
del train_data, negative; gc.collect()
visualize_df(train_group)

(72019, 3)


Unnamed: 0,customer_id,rec_set,negative_set
0,-9223343869995384291,"{908292002, 903926002, 910601003, 865929007}","{827968001, 878293001, 892309002, 885077003, 7..."
1,-9223290575350349271,"{905518001, 912204009, 852584006, 757904007}","{868053003, 892309003, 845610003, 905492001, 9..."
2,-9223120303940804728,{866792001},"{827968001, 739819010, 861803009, 804992014, 9..."
3,-9222973878991087276,"{835704001, 832884002, 783517002, 843777005, 7...","{640001027, 579541001, 884245001, 792469001, 8..."
4,-9222798684148120334,"{851374001, 806388018, 896064004, 854663002}","{927957001, 909802001, 903359001, 913087001, 7..."


In [78]:
train_group_pos = train_group[['customer_id', 'rec_set']].explode('rec_set')
train_group_pos['label'] = 1
train_group_pos

Unnamed: 0,customer_id,rec_set,label
0,-9223343869995384291,908292002,1
0,-9223343869995384291,903926002,1
0,-9223343869995384291,910601003,1
0,-9223343869995384291,865929007,1
1,-9223290575350349271,905518001,1
...,...,...,...
72017,9221907720696052119,892558005,1
72018,9221980340157146437,767577045,1
72018,9221980340157146437,781758057,1
72018,9221980340157146437,395127001,1


In [79]:
train_group_neg = train_group[['customer_id', 'negative_set']].explode('negative_set')
train_group_neg = train_group_neg.rename(columns={'negative_set':'rec_set'})
train_group_neg['label'] = 0
train_group_neg

Unnamed: 0,customer_id,rec_set,label
0,-9223343869995384291,827968001,0
0,-9223343869995384291,878293001,0
0,-9223343869995384291,892309002,0
0,-9223343869995384291,885077003,0
0,-9223343869995384291,750422031,0
...,...,...,...
72018,9221980340157146437,898818003,0
72018,9221980340157146437,882882002,0
72018,9221980340157146437,908417001,0
72018,9221980340157146437,918443002,0


In [80]:
train_group = pd.concat([train_group_pos, train_group_neg])
del train_group_pos, train_group_neg; gc.collect()

0

In [81]:
train_group = train_group.rename(columns={'rec_set':'article_id'})

In [82]:
train_group

Unnamed: 0,customer_id,article_id,label
0,-9223343869995384291,908292002,1
0,-9223343869995384291,903926002,1
0,-9223343869995384291,910601003,1
0,-9223343869995384291,865929007,1
1,-9223290575350349271,905518001,1
...,...,...,...
72018,9221980340157146437,898818003,0
72018,9221980340157146437,882882002,0
72018,9221980340157146437,908417001,0
72018,9221980340157146437,918443002,0


In [83]:
# negative samplingでたまたま選ばれた正解データはlabel=1にまとめる
train_group = train_group.groupby(['customer_id', 'article_id'])['label'].agg(sum).reset_index(name='label')
train_group

Unnamed: 0,customer_id,article_id,label
0,-9223343869995384291,215589002,0
1,-9223343869995384291,499334001,0
2,-9223343869995384291,500435128,0
3,-9223343869995384291,610776002,0
4,-9223343869995384291,673475003,0
...,...,...,...
7272989,9221980340157146437,923037003,0
7272990,9221980340157146437,924243001,0
7272991,9221980340157146437,932365008,0
7272992,9221980340157146437,933989002,0


## Customer features

- customer_idごとに素性を作成

In [84]:
customers = pd.read_csv(path + 'customers.csv')
customers

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,,,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,,,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,,,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,,,ACTIVE,NONE,54.0,5d36574f52495e81f019b680c843c443bd343d5ca5b1c2...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,ACTIVE,Regularly,52.0,25fa5ddee9aac01b35208d01736e57942317d756b32ddd...
...,...,...,...,...,...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,,,ACTIVE,NONE,24.0,7aa399f7e669990daba2d92c577b52237380662f36480b...
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,,,ACTIVE,NONE,21.0,3f47f1279beb72215f4de557d950e0bfa73789d24acb5e...
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,1.0,1.0,ACTIVE,Regularly,21.0,4563fc79215672cd6a863f2b4bf56b8f898f2d96ed590e...
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,1.0,1.0,ACTIVE,Regularly,18.0,8892c18e9bc3dca6aa4000cb8094fc4b51ee8db2ed14d7...


In [85]:
print(customers.memory_usage())
print(customers.dtypes)

Index                          128
customer_id               10975840
FN                        10975840
Active                    10975840
club_member_status        10975840
fashion_news_frequency    10975840
age                       10975840
postal_code               10975840
dtype: int64
customer_id                object
FN                        float64
Active                    float64
club_member_status         object
fashion_news_frequency     object
age                       float64
postal_code                object
dtype: object


In [86]:
customers['customer_id'] = customers['customer_id'].apply(lambda x: int(x[-16:],16) ).astype('int64')
customers['age'] = customers['age'].fillna(0).astype('int8')
customers['FN'] = customers['FN'].fillna(0).astype('bool')
customers['Active'] = customers['Active'].fillna(0).astype('bool')
customers['club_member_status'] = customers['club_member_status'].fillna('NA').astype('category').cat.codes
customers['fashion_news_frequency'] = customers['fashion_news_frequency'].fillna('NA').astype('category').cat.codes
# 長いのでuniqueになるようcategory型にした後に番号をふる
customers['postal_code'] = customers['postal_code'].fillna('NA').astype('category').cat.codes
#customers['postal_code'] = customers['postal_code'].fillna('NA').astype('category')

In [87]:
print(customers.memory_usage())
print(customers.dtypes)

Index                          128
customer_id               10975840
FN                         1371980
Active                     1371980
club_member_status         1371980
fashion_news_frequency     1371980
age                        1371980
postal_code                5487920
dtype: int64
customer_id               int64
FN                         bool
Active                     bool
club_member_status         int8
fashion_news_frequency     int8
age                        int8
postal_code               int32
dtype: object


In [88]:
print(customers['club_member_status'].value_counts())
print(customers['club_member_status'].value_counts())
print(customers['fashion_news_frequency'].value_counts())
print(customers['postal_code'].value_counts)

0    1272491
3      92960
2       6062
1        467
Name: club_member_status, dtype: int64
0    1272491
3      92960
2       6062
1        467
Name: club_member_status, dtype: int64
2    877711
4    477416
1     16009
0       842
3         2
Name: fashion_news_frequency, dtype: int64
<bound method IndexOpsMixin.value_counts of 0          112978
1           57312
2          139156
3          128529
4           52371
            ...  
1371975    169171
1371976     87255
1371977     95707
1371978    188279
1371979     13927
Name: postal_code, Length: 1371980, dtype: int32>


In [89]:
customers

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,6883939031699146327,False,False,0,2,49,112978
1,-7200416642310594310,False,False,0,2,25,57312
2,-6846340800584936,False,False,0,2,24,139156
3,-94071612138601410,False,False,0,2,54,128529
4,-283965518499174310,True,True,0,4,52,52371
...,...,...,...,...,...,...,...
1371975,7551062398649767985,False,False,0,2,24,169171
1371976,-9141402131989464905,False,False,0,2,21,87255
1371977,-8286316756823862684,True,True,0,4,21,95707
1371978,2551401172826382186,True,True,0,4,18,188279


## Article features

In [90]:
articles = pd.read_csv(path + 'articles.csv')
articles.head()

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


In [91]:
print(articles.memory_usage())
print(articles.dtypes)

Index                              128
article_id                      844336
product_code                    844336
prod_name                       844336
product_type_no                 844336
product_type_name               844336
product_group_name              844336
graphical_appearance_no         844336
graphical_appearance_name       844336
colour_group_code               844336
colour_group_name               844336
perceived_colour_value_id       844336
perceived_colour_value_name     844336
perceived_colour_master_id      844336
perceived_colour_master_name    844336
department_no                   844336
department_name                 844336
index_code                      844336
index_name                      844336
index_group_no                  844336
index_group_name                844336
section_no                      844336
section_name                    844336
garment_group_no                844336
garment_group_name              844336
detail_desc              

In [92]:
articles['article_id'] = articles['article_id'].astype('int32')

In [93]:
# idがあるカラムのnameは消す
drop_list = ['product_type_name', 
             'graphical_appearance_name', 
             'colour_group_name', 
             'perceived_colour_value_name', 
             'perceived_colour_master_name',
             'department_name',
             'index_name',
             'index_group_name',
             'section_name',
             'garment_group_name',
             'product_group_name',
             'prod_name',
             'detail_desc'
            ]

articles = articles.drop(drop_list, axis='columns')

In [94]:
articles['product_code']                = articles['product_code'].fillna(0).astype('int32')                
articles['product_type_no']             = articles['product_type_no'].astype('int32')               
articles['graphical_appearance_no']     = articles['graphical_appearance_no'].astype('int32') 
articles['colour_group_code']           = articles['colour_group_code'].astype('int32')   
articles['perceived_colour_value_id']   = articles['perceived_colour_value_id'].astype('int32')     
articles['perceived_colour_master_id']  = articles['perceived_colour_master_id'].astype('int32')    
articles['department_no']               = articles['department_no'].astype('category').cat.codes                 
articles['index_code']                  = articles['index_code'].astype('category').cat.codes                 
articles['index_group_no']              = articles['index_group_no'].astype('int8')                
articles['section_no']                  = articles['section_no'].astype('int8')                    
articles['garment_group_no']            = articles['garment_group_no'].astype('int16')              

In [95]:
articles

Unnamed: 0,article_id,product_code,product_type_no,graphical_appearance_no,colour_group_code,perceived_colour_value_id,perceived_colour_master_id,department_no,index_code,index_group_no,section_no,garment_group_no
0,108775015,108775,253,1010016,9,4,5,47,0,1,16,1002
1,108775044,108775,253,1010016,10,3,9,47,0,1,16,1002
2,108775051,108775,253,1010017,11,1,9,47,0,1,16,1002
3,110065001,110065,306,1010016,9,4,5,12,1,1,61,1017
4,110065002,110065,306,1010016,10,3,9,12,1,1,61,1017
...,...,...,...,...,...,...,...,...,...,...,...,...
105537,953450001,953450,302,1010014,9,4,5,224,4,3,26,1021
105538,953763001,953763,253,1010016,9,4,5,66,0,1,2,1005
105539,956217002,956217,265,1010016,9,4,5,34,0,1,18,1005
105540,957375001,957375,72,1010016,9,4,5,120,3,2,52,1019


## dynamic features

### train

In [96]:
feature_data.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,-6846340800584936,663713001,0.050831,2
1,2018-09-20,-6846340800584936,541518023,0.030492,2
2,2018-09-20,-8334631767138808638,505221004,0.015237,2
3,2018-09-20,-8334631767138808638,685687003,0.016932,2
4,2018-09-20,-8334631767138808638,685687004,0.016932,2


In [97]:
joined_feature = pd.merge(feature_data, customers, on='customer_id', how='left')
joined_feature = pd.merge(joined_feature, articles, on='article_id', how='left')
del feature_data; gc.collect()

0

In [98]:
joined_feature.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,FN,Active,club_member_status,fashion_news_frequency,age,...,product_type_no,graphical_appearance_no,colour_group_code,perceived_colour_value_id,perceived_colour_master_id,department_no,index_code,index_group_no,section_no,garment_group_no
0,2018-09-20,-6846340800584936,663713001,0.050831,2,False,False,0,2,24,...,283,1010016,9,4,5,11,1,1,61,1017
1,2018-09-20,-6846340800584936,541518023,0.030492,2,False,False,0,2,24,...,306,1010016,51,1,4,9,1,1,61,1017
2,2018-09-20,-8334631767138808638,505221004,0.015237,2,True,True,0,4,32,...,252,1010010,52,2,4,203,3,2,58,1003
3,2018-09-20,-8334631767138808638,685687003,0.016932,2,True,True,0,4,32,...,252,1010010,52,7,4,87,0,1,15,1023
4,2018-09-20,-8334631767138808638,685687004,0.016932,2,True,True,0,4,32,...,252,1010010,93,4,19,87,0,1,15,1023


In [99]:
joined_recent = joined_feature.query(f"'{feature_date}' <= t_dat").reset_index(drop=True)

### dynamic customer features

In [100]:
#joined_feature.groupby('customer_id')['article_id'].apply(set).reset_index(name='rec_set') # ユーザーごとの履歴

In [101]:
# customerごとの平均購入価格, key: customer_id
c_mean_price = joined_feature.groupby('customer_id')['price'].agg('mean').reset_index(name='c_mean_price')
customers = pd.merge(customers, c_mean_price, on='customer_id', how='left')
customers['c_mean_price'] = customers['c_mean_price'].fillna(0).astype('float32')                
del c_mean_price; gc.collect()
customers['c_mean_price'].head()

0    0.030904
1    0.030255
2    0.037869
3    0.030492
4    0.036130
Name: c_mean_price, dtype: float32

In [102]:
# customerごとの合計cv数, key: customer_id
c_cv_total = joined_feature.groupby('customer_id').size().reset_index(name='c_cv_total')
customers = pd.merge(customers, c_cv_total, on='customer_id', how='left')
del c_cv_total; gc.collect()
customers['c_cv_total'] = customers['c_cv_total'].fillna(0).astype('int32')
customers['c_cv_total'].head()

0    21
1    86
2    17
3     2
4    13
Name: c_cv_total, dtype: int32

In [103]:
# 直近1週間、customerごとの合計cv数, key: customer_id
c_cv_recent = joined_recent.groupby('customer_id').size().reset_index(name='c_cv_recent')
customers = pd.merge(customers, c_cv_recent, on='customer_id', how='left')
del c_cv_recent; gc.collect()
customers['c_cv_recent'] = customers['c_cv_recent'].fillna(0).astype('int32')
customers['c_cv_recent'].head()

0    1
1    0
2    0
3    0
4    0
Name: c_cv_recent, dtype: int32

In [104]:
# customerごとの平均sales_channel_id, key: customer_id
c_mean_sales_channel_id = joined_feature.groupby('customer_id')['sales_channel_id'].agg('mean').reset_index(name='c_mean_sales_channel_id')
customers = pd.merge(customers, c_mean_sales_channel_id, on='customer_id', how='left')
customers['c_mean_sales_channel_id'] = customers['c_mean_sales_channel_id'].fillna(0).astype('float32')                
del c_mean_sales_channel_id; gc.collect()
customers['c_mean_sales_channel_id'].head()

0    1.571429
1    1.941860
2    2.000000
3    2.000000
4    1.846154
Name: c_mean_sales_channel_id, dtype: float32

### dynamic article features1

In [107]:
# articleごとの平均購入価格, key: article_id
a_mean_price = joined_feature.groupby('article_id')['price'].agg('mean').reset_index(name='a_mean_price')
articles = pd.merge(articles, a_mean_price, on='article_id', how='left')
articles['a_mean_price'] = articles['a_mean_price'].fillna(0).astype('float32')                
del a_mean_price; gc.collect()
articles['a_mean_price'].head()

0    0.008142
1    0.008114
2    0.004980
3    0.020219
4    0.018205
Name: a_mean_price, dtype: float32

In [110]:
# articleごとの合計cv数, key: article_id
a_cv_total = joined_feature.groupby('article_id').size().reset_index(name='a_cv_total')
articles = pd.merge(articles, a_cv_total, on='article_id', how='left')
del a_cv_total; gc.collect()
articles['a_cv_total'] = articles['a_cv_total'].fillna(0).astype('int32')
articles['a_cv_total'].head()

0    10841
1     7244
2      215
3     1044
4      539
Name: a_cv_total, dtype: int32

In [111]:
# 直近1週間、articleごとの合計cv数, key: article_id
a_cv_recent = joined_recent.groupby('article_id').size().reset_index(name='a_cv_recent')
articles = pd.merge(articles, a_cv_recent, on='article_id', how='left')
del a_cv_recent; gc.collect()
articles['a_cv_recent'] = articles['a_cv_recent'].fillna(0).astype('int32')                
articles['a_cv_recent'].head()

0    0
1    4
2    0
3    0
4    0
Name: a_cv_recent, dtype: int32

In [113]:
# articleごとの平均sales_channel_id, key: article_id
a_mean_sales_channel_id = joined_feature.groupby('article_id')['sales_channel_id'].agg('mean').reset_index(name='a_mean_sales_channel_id')
articles = pd.merge(articles, a_mean_sales_channel_id, on='article_id', how='left')
articles['a_mean_sales_channel_id'] = articles['a_mean_sales_channel_id'].fillna(0).astype('float32')                
del a_mean_sales_channel_id; gc.collect()
articles['a_mean_sales_channel_id'].head()

0    1.770778
1    1.709967
2    1.995349
3    1.375479
4    1.654917
Name: a_mean_sales_channel_id, dtype: float32

## Join features1 and Save

In [114]:
del joined_feature, joined_recent; gc.collect()

0

In [115]:
joined_train = pd.merge(train_group, customers, on='customer_id', how='left')
del train_group, customers; gc.collect()

0

In [116]:
joined_train = pd.merge(joined_train, articles, on='article_id', how='left')
del articles; gc.collect()

0

In [117]:
joined_train.to_csv(f'train_{train_start_date}.csv', index=False)
del joined_train; gc.collect()

0