### Generate Training data
- 学習データ作成用notebook
- 素性も同時に生成する


In [1]:
import numpy as np
import pandas as pd
import gc
import os
import time
import random
from tqdm.auto import tqdm
import datetime

In [2]:
def visualize_df(df):
    print(df.shape)
    display(df.head())

## Parameters

In [3]:
Nsub = 40
#train_start_date = '2020-09-16' # labelに使うデータ開始日
train_start_date = '2020-09-09' 
full_flag = False # 基本False

In [4]:
# 外部から回す場合はここでパラメータ上書き

In [5]:
from datetime import datetime, date, timedelta
train_start_date_dt = pd.to_datetime(train_start_date) 
train_end_date_dt = train_start_date_dt + timedelta(days=7)
feature_date_dt = train_start_date_dt + timedelta(days=-7)
feature_date2_dt = train_start_date_dt + timedelta(days=-14)

In [6]:
train_end_date = str(train_end_date_dt.strftime('%Y-%m-%d'))
feature_date = str(feature_date_dt.strftime('%Y-%m-%d'))
feature_date2 = str(feature_date2_dt.strftime('%Y-%m-%d'))

In [7]:
print(train_start_date, train_end_date, feature_date, feature_date2)

2020-09-09 2020-09-16 2020-09-02 2020-08-26


## Read Data

In [8]:
# データの読み込み
dir = 'h-and-m-personalized-fashion-recommendations/'
path = '../input/' + dir 

transactions_train = pd.read_csv(path + 'transactions_train.csv')

In [9]:
# save memory https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
# 64文字もあるが、末尾16文字だけでcustomerを一意にid可能
# 使われている文字は0-9, a-fなので16進数で変換してintで扱える
# 16進数 = 4bit, 4bit * 16 / 8 = 8 byte
# 64 byte -> 8 byteに削減, int64で表現可能
transactions_train['customer_id'] = transactions_train['customer_id'].apply(lambda x: int(x[-16:],16) ).astype('int64')

In [10]:
# 提出の形式に合わせてarticle_idの最初に0を加える
#transactions_train['article_id'] = transactions_train['article_id'].map(lambda x: '0' + str(x))
# メモリ削減、復元するには上記の式
transactions_train['article_id'] = transactions_train['article_id'].astype('int32')

In [11]:
transactions_train['t_dat'] = pd.to_datetime(transactions_train['t_dat'])
transactions_train.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,-6846340800584936,663713001,0.050831,2
1,2018-09-20,-6846340800584936,541518023,0.030492,2
2,2018-09-20,-8334631767138808638,505221004,0.015237,2
3,2018-09-20,-8334631767138808638,685687003,0.016932,2
4,2018-09-20,-8334631767138808638,685687004,0.016932,2


In [12]:
train_data = transactions_train.query(f"'{train_start_date}' <= t_dat and t_dat < '{train_end_date}'").reset_index(drop=True)
visualize_df(train_data)

(255241, 5)


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2020-09-09,-7604547624187760215,399136061,0.08339,2
1,2020-09-09,-7604547624187760215,732842014,0.066712,2
2,2020-09-09,-7604547624187760215,556255001,0.01,2
3,2020-09-09,-7604547624187760215,852219003,0.008322,2
4,2020-09-09,-7604547624187760215,732842021,0.066712,2


In [13]:
feature_data = transactions_train.query(f" t_dat < '{train_start_date}'").reset_index(drop=True) # train feature

In [14]:
del transactions_train; gc.collect()

3

In [15]:
train_data

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2020-09-09,-7604547624187760215,399136061,0.083390,2
1,2020-09-09,-7604547624187760215,732842014,0.066712,2
2,2020-09-09,-7604547624187760215,556255001,0.010000,2
3,2020-09-09,-7604547624187760215,852219003,0.008322,2
4,2020-09-09,-7604547624187760215,732842021,0.066712,2
...,...,...,...,...,...
255236,2020-09-15,-977760742639762210,850917001,0.025407,1
255237,2020-09-15,38700952482392720,853316001,0.008458,1
255238,2020-09-15,38700952482392720,296366006,0.000847,1
255239,2020-09-15,38700952482392720,789769001,0.013542,1


In [16]:
feature_data

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,-6846340800584936,663713001,0.050831,2
1,2018-09-20,-6846340800584936,541518023,0.030492,2
2,2018-09-20,-8334631767138808638,505221004,0.015237,2
3,2018-09-20,-8334631767138808638,685687003,0.016932,2
4,2018-09-20,-8334631767138808638,685687004,0.016932,2
...,...,...,...,...,...
31292767,2020-09-08,4685485978980270934,919786002,0.042356,2
31292768,2020-09-08,795398326275572276,765308002,0.033881,2
31292769,2020-09-08,-8286316756823862684,689365050,0.010017,2
31292770,2020-09-08,-8286316756823862684,884081001,0.012898,2


## Read Matching Phase data

In [17]:
# データの読み込み
match = pd.read_csv(f'../input/submission_{train_start_date}_30.csv')
match['customer_id'] = match['customer_id'].apply(lambda x: int(x[-16:],16) ).astype('int64')
match.head()

Unnamed: 0,customer_id,prediction
0,-4345663252774903357,0695632002 0695632001 0706016001
1,-291594535764411411,0737221004 0737221005 0372860001 0736923006 05...
2,8267039440814068109,0734287002 0734287003 0734287001
3,-1181812018918977698,0865073005 0803324005 0875350002 0865073002 07...
4,-4999317557084312486,0599719024 0734215002 0759901001 0468480025 05...


In [18]:
match.size

2743960

In [19]:
# 購入がないユーザーを入れるかどうか
# GBDTの場合はいらなが、分類タスクの場合は正負の割合が変わってしまうのでTrueにする
if not full_flag:
    tmp = pd.Series(train_data['customer_id'].unique(), name='customer_id')
    match = pd.merge(match, tmp, on='customer_id', how='inner')

In [20]:
match['prediction'] = match['prediction'].fillna(' ')
match.head()

Unnamed: 0,customer_id,prediction
0,-4345663252774903357,0695632002 0695632001 0706016001
1,-291594535764411411,0737221004 0737221005 0372860001 0736923006 05...
2,8267039440814068109,0734287002 0734287003 0734287001
3,-1181812018918977698,0865073005 0803324005 0875350002 0865073002 07...
4,-4999317557084312486,0599719024 0734215002 0759901001 0468480025 05...


In [21]:
match['prediction'] = match['prediction'].str.split(' ')
match['prediction'] = match['prediction'].map(lambda x: [int(i) for i in x if i != ''])
match

Unnamed: 0,customer_id,prediction
0,-4345663252774903357,"[695632002, 695632001, 706016001]"
1,-291594535764411411,"[737221004, 737221005, 372860001, 736923006, 5..."
2,8267039440814068109,"[734287002, 734287003, 734287001]"
3,-1181812018918977698,"[865073005, 803324005, 875350002, 865073002, 7..."
4,-4999317557084312486,"[599719024, 734215002, 759901001, 468480025, 5..."
...,...,...
1371975,3104641584622543304,"[399256003, 446224024, 493995001, 613246004, 3..."
1371976,8327270052008857523,"[680263013, 680265005, 683848001, 702118004, 7..."
1371977,9087355039826620500,"[703199003, 783098001, 728162001, 464297007, 1..."
1371978,-133214323572869577,"[685811002, 578630022, 685811022, 816591010, 8..."


### ランキング csvの読み込み

In [22]:
# 事前に出力しておいたrankingデータの読み込み
#simple_ranking = pd.read_csv(f'../input/simple_ranking_{feature_date}_{Nsub}.csv')
simple_ranking = pd.read_csv(f'../input/ranking_{feature_date}_{Nsub}_ch.csv')

In [23]:
simple_ranking = simple_ranking.rename(columns={'prediction':'ranking'})

In [24]:
simple_ranking['ranking'].apply(len).describe()

count    1371980.0
mean         399.0
std            0.0
min          399.0
25%          399.0
50%          399.0
75%          399.0
max          399.0
Name: ranking, dtype: float64

In [25]:
simple_ranking['ranking'] = simple_ranking['ranking'].str.split(' ')
simple_ranking

Unnamed: 0,customer_id,ranking
0,6883939031699146327,"[915529003, 706016001, 909916001, 751471001, 8..."
1,-7200416642310594310,"[915526001, 919365008, 706016001, 448509014, 7..."
2,-6846340800584936,"[915526001, 919365008, 706016001, 448509014, 7..."
3,-94071612138601410,"[751471043, 896152002, 751471001, 893059005, 8..."
4,-283965518499174310,"[751471043, 896152002, 751471001, 893059005, 8..."
...,...,...
1371975,7551062398649767985,"[915526001, 919365008, 706016001, 448509014, 7..."
1371976,-9141402131989464905,"[915526001, 919365008, 706016001, 448509014, 7..."
1371977,-8286316756823862684,"[915526001, 919365008, 706016001, 448509014, 7..."
1371978,2551401172826382186,"[915526001, 919365008, 706016001, 448509014, 7..."


In [26]:
simple_ranking['ranking'].apply(len).describe()

count    1371980.0
mean          40.0
std            0.0
min           40.0
25%           40.0
50%           40.0
75%           40.0
max           40.0
Name: ranking, dtype: float64

### ランキングのデータをjoin

In [27]:
match = pd.merge(match, simple_ranking, on='customer_id', how='left')
match.head()

Unnamed: 0,customer_id,prediction,ranking
0,-4345663252774903357,"[695632002, 695632001, 706016001]","[915529003, 706016001, 909916001, 751471001, 8..."
1,-291594535764411411,"[737221004, 737221005, 372860001, 736923006, 5...","[751471043, 762846031, 884319003, 896152002, 8..."
2,8267039440814068109,"[734287002, 734287003, 734287001]","[916468001, 915526001, 372860001, 918292001, 7..."
3,-1181812018918977698,"[865073005, 803324005, 875350002, 865073002, 7...","[915526001, 919365008, 706016001, 448509014, 7..."
4,-4999317557084312486,"[599719024, 734215002, 759901001, 468480025, 5...","[915526001, 919365008, 706016001, 448509014, 7..."


In [28]:
Nmax = 25
match['match_len'] = match['prediction'].apply(len)
match['match_len'] = match['match_len'].astype('int8')

match.loc[match['match_len'] > Nmax, ['match_len']] = Nmax

In [29]:
match['ranking'] = match.apply(lambda x:  [i for i in x['ranking'] if i not in x['prediction']], axis = 1)
match['ranking'].apply(len).describe()

count    1371980.0
mean          40.0
std            0.0
min           40.0
25%           40.0
50%           40.0
75%           40.0
max           40.0
Name: ranking, dtype: float64

In [30]:
#match['rec_list'] = match.apply(lambda x: (x['prediction']+x['ranking'])[:Nsub], axis = 1)
match['rec_list'] = match.apply(lambda x: (x['prediction'][:Nmax]+x['ranking'])[:Nsub], axis = 1)
match = match.drop(['prediction', 'ranking'], axis='columns')
match['match_rank'] = match.apply(lambda x: [i for i in range(Nsub)], axis = 1)

In [31]:
match_customer = match['customer_id']
match.head()

Unnamed: 0,customer_id,match_len,rec_list,match_rank
0,-4345663252774903357,3,"[695632002, 695632001, 706016001, 915529003, 7...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
1,-291594535764411411,6,"[737221004, 737221005, 372860001, 736923006, 5...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
2,8267039440814068109,3,"[734287002, 734287003, 734287001, 916468001, 9...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
3,-1181812018918977698,8,"[865073005, 803324005, 875350002, 865073002, 7...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
4,-4999317557084312486,12,"[599719024, 734215002, 759901001, 468480025, 5...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."


In [32]:
match = match.explode(['rec_list','match_rank'])
match = match.rename(columns={'rec_list':'article_id'})
match['article_id'] = match['article_id'].astype('int32')
match['match_rank'] = match['match_rank'].astype('int8')
match['label'] = 0
match

Unnamed: 0,customer_id,match_len,article_id,match_rank,label
0,-4345663252774903357,3,695632002,0,0
0,-4345663252774903357,3,695632001,1,0
0,-4345663252774903357,3,706016001,2,0
0,-4345663252774903357,3,915529003,3,0
0,-4345663252774903357,3,706016001,4,0
...,...,...,...,...,...
1371979,-2661682485847144637,0,781613006,35,0
1371979,-2661682485847144637,0,759814009,36,0
1371979,-2661682485847144637,0,850917001,37,0
1371979,-2661682485847144637,0,158340001,38,0


## Add ALS as a feature

In [33]:
# ALSのデータも素性用に追加する
als = pd.read_csv(f'../input/als_{train_start_date}.csv')
als['customer_id'] = als['customer_id'].apply(lambda x: int(x[-16:],16) ).astype('int64')
als = pd.merge(als, match_customer, on='customer_id', how='inner')
als['prediction'] = als['prediction'].str.split(' ')
als['prediction'] = als['prediction'].map(lambda x: [int(i) for i in x if i != ''])
als['als_rank'] = als.apply(lambda x: [i for i in range(30)], axis = 1)
als

Unnamed: 0,customer_id,prediction,als_rank
0,6883939031699146327,"[770315007, 770315017, 568601043, 716672001, 7...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
1,-7200416642310594310,"[187949032, 187949031, 187949030, 187949029, 1...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
2,-6846340800584936,"[187949032, 187949031, 187949030, 187949029, 1...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
3,-94071612138601410,"[187949032, 187949031, 187949030, 187949029, 1...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
4,-283965518499174310,"[896152002, 730683050, 791587015, 868195004, 7...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
...,...,...,...
1371975,7551062398649767985,"[557599022, 822344001, 822344010, 822311001, 8...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
1371976,-9141402131989464905,"[762846031, 762846027, 762846028, 568601043, 7...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
1371977,-8286316756823862684,"[778064028, 636323001, 832458002, 717490010, 7...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
1371978,2551401172826382186,"[187949032, 187949031, 187949030, 187949029, 1...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."


In [34]:
als = als.explode(['prediction','als_rank'])
als = als.rename(columns={'prediction':'article_id'})
als['article_id'] = als['article_id'].astype('int32')
als['als_rank'] = als['als_rank'].astype('int8')
als.head()

Unnamed: 0,customer_id,article_id,als_rank
0,6883939031699146327,770315007,0
0,6883939031699146327,770315017,1
0,6883939031699146327,568601043,2
0,6883939031699146327,716672001,3
0,6883939031699146327,770315005,4


In [35]:
match = pd.merge(match, als, on=['customer_id', 'article_id'], how='left')
match['als_rank'] = match['als_rank'].fillna(99)
match['als_rank'] = match['als_rank'].astype('int8')

## Add labels

In [36]:
unique_train = train_data[['customer_id','article_id']].drop_duplicates()

In [37]:
unique_train['label'] = 1
unique_train['label'] = unique_train['label'].astype('int32')

In [38]:
match = pd.merge(match, unique_train, on=['customer_id', 'article_id'], how='left')

In [39]:
match['label_y'] = match['label_y'].fillna(0)
match['label'] = match['label_x'] + match['label_y']

In [40]:
match

Unnamed: 0,customer_id,match_len,article_id,match_rank,label_x,als_rank,label_y,label
0,-4345663252774903357,3,695632002,0,0,99,0.0,0.0
1,-4345663252774903357,3,695632001,1,0,99,0.0,0.0
2,-4345663252774903357,3,706016001,2,0,99,0.0,0.0
3,-4345663252774903357,3,915529003,3,0,99,0.0,0.0
4,-4345663252774903357,3,706016001,4,0,99,0.0,0.0
...,...,...,...,...,...,...,...,...
54879195,-2661682485847144637,0,781613006,35,0,99,0.0,0.0
54879196,-2661682485847144637,0,759814009,36,0,99,0.0,0.0
54879197,-2661682485847144637,0,850917001,37,0,99,0.0,0.0
54879198,-2661682485847144637,0,158340001,38,0,99,0.0,0.0


In [41]:
match = match.drop(['label_x', 'label_y'], axis='columns')
match['label'] = match['label'].astype('bool')
match.head()

Unnamed: 0,customer_id,match_len,article_id,match_rank,als_rank,label
0,-4345663252774903357,3,695632002,0,99,False
1,-4345663252774903357,3,695632001,1,99,False
2,-4345663252774903357,3,706016001,2,99,False
3,-4345663252774903357,3,915529003,3,99,False
4,-4345663252774903357,3,706016001,4,99,False


In [42]:
# check label dist.
match['label'].value_counts()

False    54861671
True        17529
Name: label, dtype: int64

In [43]:
match.groupby('customer_id')['label'].agg('mean')

customer_id
-9223352921020755230    0.0
-9223343869995384291    0.0
-9223321797620987725    0.0
-9223319430705797669    0.0
-9223308614576639426    0.0
                       ... 
 9223319300843860958    0.0
 9223333063893176977    0.0
 9223345314868180224    0.0
 9223357421094039679    0.0
 9223370729206611574    0.0
Name: label, Length: 1371980, dtype: float64

In [44]:
match.groupby('match_rank')['label'].agg('mean')

match_rank
0     0.001318
1     0.000991
2     0.000719
3     0.000617
4     0.000507
5     0.000451
6     0.000415
7     0.000363
8     0.000349
9     0.000327
10    0.000303
11    0.000300
12    0.000278
13    0.000272
14    0.000260
15    0.000249
16    0.000243
17    0.000233
18    0.000224
19    0.000227
20    0.000233
21    0.000213
22    0.000234
23    0.000233
24    0.000241
25    0.000216
26    0.000190
27    0.000235
28    0.000232
29    0.000189
30    0.000204
31    0.000211
32    0.000198
33    0.000194
34    0.000202
35    0.000157
36    0.000186
37    0.000189
38    0.000176
39    0.000197
Name: label, dtype: float64

In [45]:
match.groupby('match_rank')['label'].agg('mean').sum() # matcingの精度の目安

0.012776425312322337

In [46]:
match['match_len'].mean()

8.302085307365996

In [47]:
train_group = match
del match, unique_train, simple_ranking; gc.collect()

0

In [48]:
train_group

Unnamed: 0,customer_id,match_len,article_id,match_rank,als_rank,label
0,-4345663252774903357,3,695632002,0,99,False
1,-4345663252774903357,3,695632001,1,99,False
2,-4345663252774903357,3,706016001,2,99,False
3,-4345663252774903357,3,915529003,3,99,False
4,-4345663252774903357,3,706016001,4,99,False
...,...,...,...,...,...,...
54879195,-2661682485847144637,0,781613006,35,99,False
54879196,-2661682485847144637,0,759814009,36,99,False
54879197,-2661682485847144637,0,850917001,37,99,False
54879198,-2661682485847144637,0,158340001,38,99,False


## Customer features

- customer_idごとに素性を作成

In [49]:
customers = pd.read_csv(path + 'customers.csv')
customers

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,,,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,,,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,,,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,,,ACTIVE,NONE,54.0,5d36574f52495e81f019b680c843c443bd343d5ca5b1c2...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,ACTIVE,Regularly,52.0,25fa5ddee9aac01b35208d01736e57942317d756b32ddd...
...,...,...,...,...,...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,,,ACTIVE,NONE,24.0,7aa399f7e669990daba2d92c577b52237380662f36480b...
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,,,ACTIVE,NONE,21.0,3f47f1279beb72215f4de557d950e0bfa73789d24acb5e...
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,1.0,1.0,ACTIVE,Regularly,21.0,4563fc79215672cd6a863f2b4bf56b8f898f2d96ed590e...
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,1.0,1.0,ACTIVE,Regularly,18.0,8892c18e9bc3dca6aa4000cb8094fc4b51ee8db2ed14d7...


In [50]:
print(customers.memory_usage())
print(customers.dtypes)

Index                          128
customer_id               10975840
FN                        10975840
Active                    10975840
club_member_status        10975840
fashion_news_frequency    10975840
age                       10975840
postal_code               10975840
dtype: int64
customer_id                object
FN                        float64
Active                    float64
club_member_status         object
fashion_news_frequency     object
age                       float64
postal_code                object
dtype: object


In [51]:
customers['customer_id'] = customers['customer_id'].apply(lambda x: int(x[-16:],16) ).astype('int64')
customers['age'] = customers['age'].fillna(0).astype('int8')
customers['FN'] = customers['FN'].fillna(0).astype('bool')
customers['Active'] = customers['Active'].fillna(0).astype('bool')
customers['club_member_status'] = customers['club_member_status'].fillna('NA').astype('category').cat.codes
customers['fashion_news_frequency'] = customers['fashion_news_frequency'].fillna('NA').astype('category').cat.codes
# 長いのでuniqueになるようcategory型にした後に番号をふる
customers['postal_code'] = customers['postal_code'].fillna('NA').astype('category').cat.codes
#customers['postal_code'] = customers['postal_code'].fillna('NA').astype('category')

In [52]:
print(customers.memory_usage())
print(customers.dtypes)

Index                          128
customer_id               10975840
FN                         1371980
Active                     1371980
club_member_status         1371980
fashion_news_frequency     1371980
age                        1371980
postal_code                5487920
dtype: int64
customer_id               int64
FN                         bool
Active                     bool
club_member_status         int8
fashion_news_frequency     int8
age                        int8
postal_code               int32
dtype: object


In [53]:
print(customers['club_member_status'].value_counts())
print(customers['club_member_status'].value_counts())
print(customers['fashion_news_frequency'].value_counts())
print(customers['postal_code'].value_counts)

0    1272491
3      92960
2       6062
1        467
Name: club_member_status, dtype: int64
0    1272491
3      92960
2       6062
1        467
Name: club_member_status, dtype: int64
2    877711
4    477416
1     16009
0       842
3         2
Name: fashion_news_frequency, dtype: int64
<bound method IndexOpsMixin.value_counts of 0          112978
1           57312
2          139156
3          128529
4           52371
            ...  
1371975    169171
1371976     87255
1371977     95707
1371978    188279
1371979     13927
Name: postal_code, Length: 1371980, dtype: int32>


In [54]:
customers

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,6883939031699146327,False,False,0,2,49,112978
1,-7200416642310594310,False,False,0,2,25,57312
2,-6846340800584936,False,False,0,2,24,139156
3,-94071612138601410,False,False,0,2,54,128529
4,-283965518499174310,True,True,0,4,52,52371
...,...,...,...,...,...,...,...
1371975,7551062398649767985,False,False,0,2,24,169171
1371976,-9141402131989464905,False,False,0,2,21,87255
1371977,-8286316756823862684,True,True,0,4,21,95707
1371978,2551401172826382186,True,True,0,4,18,188279


## Article features

In [55]:
articles = pd.read_csv(path + 'articles.csv')
articles.head()

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


In [56]:
print(articles.memory_usage())
print(articles.dtypes)

Index                              128
article_id                      844336
product_code                    844336
prod_name                       844336
product_type_no                 844336
product_type_name               844336
product_group_name              844336
graphical_appearance_no         844336
graphical_appearance_name       844336
colour_group_code               844336
colour_group_name               844336
perceived_colour_value_id       844336
perceived_colour_value_name     844336
perceived_colour_master_id      844336
perceived_colour_master_name    844336
department_no                   844336
department_name                 844336
index_code                      844336
index_name                      844336
index_group_no                  844336
index_group_name                844336
section_no                      844336
section_name                    844336
garment_group_no                844336
garment_group_name              844336
detail_desc              

In [57]:
articles['article_id'] = articles['article_id'].astype('int32')

In [58]:
# add gender features
# https://www.kaggle.com/code/lichtlab/h-m-data-deep-dive-chap-1-understand-article
def set_gender_flg(x):
    x['is_for_male'] = 0
    x['is_for_female'] = 0
    x['is_for_mama'] = 0
    if x['index_group_name'] in ['Ladieswear','Divided']:
        x['is_for_female'] = 1
    if x['index_group_name'] == 'Menswear':
        x['is_for_male'] = 1
    if x['index_group_name'] in ['Baby/Children','Sport']:
        if 'boy' in x['department_name'].lower() or 'men' in x['department_name'].lower():
            x['is_for_male'] = 1
        if 'girl' in x['department_name'].lower() or 'ladies' in x['department_name'].lower():
            x['is_for_female'] = 1
    if x['section_name'] == 'Mama':
        x['is_for_mama'] = 1
    return x

In [59]:
articles = articles.apply(set_gender_flg, axis=1)
articles.head()

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc,is_for_male,is_for_female,is_for_mama
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.,0,1,0
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.,0,1,0
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.,0,1,0
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde...",0,1,0
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde...",0,1,0


In [60]:
# idがあるカラムのnameは消す
drop_list = ['product_type_name', 
             'graphical_appearance_name', 
             'colour_group_name', 
             'perceived_colour_value_name', 
             'perceived_colour_master_name',
             'department_name',
             'index_name',
             'index_group_name',
             'section_name',
             'garment_group_name',
             'prod_name',
             'detail_desc'
            ]

articles = articles.drop(drop_list, axis='columns')

In [61]:
articles['product_code']                = articles['product_code'].fillna(0).astype('int32')                
articles['product_type_no']             = articles['product_type_no'].astype('int32')               
articles['graphical_appearance_no']     = articles['graphical_appearance_no'].astype('int32') 
articles['colour_group_code']           = articles['colour_group_code'].astype('int32')   
articles['perceived_colour_value_id']   = articles['perceived_colour_value_id'].astype('int32')     
articles['perceived_colour_master_id']  = articles['perceived_colour_master_id'].astype('int32')    
articles['department_no']               = articles['department_no'].astype('category').cat.codes                 
articles['index_code']                  = articles['index_code'].astype('category').cat.codes                 
articles['index_group_no']              = articles['index_group_no'].astype('int8')                
articles['section_no']                  = articles['section_no'].astype('int8')                    
articles['garment_group_no']            = articles['garment_group_no'].astype('int16')              
articles['product_group_no']            = articles['product_group_name'].fillna('NA').astype('category').cat.codes.astype('int32') # nameからnoを生成
articles['is_for_male']                 = articles['is_for_male'].astype('bool')              
articles['is_for_female']               = articles['is_for_female'].astype('bool')              
articles['is_for_mama']                 = articles['is_for_mama'].astype('bool')              
articles = articles.drop('product_group_name', axis='columns')

In [62]:
articles

Unnamed: 0,article_id,product_code,product_type_no,graphical_appearance_no,colour_group_code,perceived_colour_value_id,perceived_colour_master_id,department_no,index_code,index_group_no,section_no,garment_group_no,is_for_male,is_for_female,is_for_mama,product_group_no
0,108775015,108775,253,1010016,9,4,5,47,0,1,16,1002,False,True,False,7
1,108775044,108775,253,1010016,10,3,9,47,0,1,16,1002,False,True,False,7
2,108775051,108775,253,1010017,11,1,9,47,0,1,16,1002,False,True,False,7
3,110065001,110065,306,1010016,9,4,5,12,1,1,61,1017,False,True,False,16
4,110065002,110065,306,1010016,10,3,9,12,1,1,61,1017,False,True,False,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105537,953450001,953450,302,1010014,9,4,5,224,4,3,26,1021,True,False,False,13
105538,953763001,953763,253,1010016,9,4,5,66,0,1,2,1005,False,True,False,7
105539,956217002,956217,265,1010016,9,4,5,34,0,1,18,1005,False,True,False,5
105540,957375001,957375,72,1010016,9,4,5,120,3,2,52,1019,False,True,False,0


## dynamic features

### train

In [63]:
feature_data.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,-6846340800584936,663713001,0.050831,2
1,2018-09-20,-6846340800584936,541518023,0.030492,2
2,2018-09-20,-8334631767138808638,505221004,0.015237,2
3,2018-09-20,-8334631767138808638,685687003,0.016932,2
4,2018-09-20,-8334631767138808638,685687004,0.016932,2


In [64]:
joined_feature = pd.merge(feature_data, customers, on='customer_id', how='left')
joined_feature = pd.merge(joined_feature, articles, on='article_id', how='left')
del feature_data; gc.collect()

0

In [65]:
joined_feature.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,FN,Active,club_member_status,fashion_news_frequency,age,...,perceived_colour_master_id,department_no,index_code,index_group_no,section_no,garment_group_no,is_for_male,is_for_female,is_for_mama,product_group_no
0,2018-09-20,-6846340800584936,663713001,0.050831,2,False,False,0,2,24,...,5,11,1,1,61,1017,False,True,False,16
1,2018-09-20,-6846340800584936,541518023,0.030492,2,False,False,0,2,24,...,4,9,1,1,61,1017,False,True,False,16
2,2018-09-20,-8334631767138808638,505221004,0.015237,2,True,True,0,4,32,...,4,203,3,2,58,1003,False,True,False,7
3,2018-09-20,-8334631767138808638,685687003,0.016932,2,True,True,0,4,32,...,4,87,0,1,15,1023,False,True,False,7
4,2018-09-20,-8334631767138808638,685687004,0.016932,2,True,True,0,4,32,...,19,87,0,1,15,1023,False,True,False,7


In [66]:
tmp = joined_feature.groupby('customer_id').t_dat.max().reset_index()
tmp = tmp.rename(columns={'t_dat':'max_dat'})
tmp['diff_dat_last_buy'] = (train_start_date_dt - tmp['max_dat']).dt.days
tmp.columns = ['customer_id','max_dat', 'diff_dat_last_buy']
joined_feature = joined_feature.merge(tmp,on=['customer_id'],how='left')
#del tmp; gc.collect()

In [67]:
joined_feature['diff_dat'] = (joined_feature['max_dat'] - joined_feature['t_dat']).dt.days
joined_feature = joined_feature.drop('max_dat', axis='columns')
joined_last_week = joined_feature.loc[joined_feature['diff_dat']<=6]

In [68]:
joined_feature.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,FN,Active,club_member_status,fashion_news_frequency,age,...,index_code,index_group_no,section_no,garment_group_no,is_for_male,is_for_female,is_for_mama,product_group_no,diff_dat_last_buy,diff_dat
0,2018-09-20,-6846340800584936,663713001,0.050831,2,False,False,0,2,24,...,1,1,61,1017,False,True,False,16,144,576
1,2018-09-20,-6846340800584936,541518023,0.030492,2,False,False,0,2,24,...,1,1,61,1017,False,True,False,16,144,576
2,2018-09-20,-8334631767138808638,505221004,0.015237,2,True,True,0,4,32,...,3,2,58,1003,False,True,False,7,119,601
3,2018-09-20,-8334631767138808638,685687003,0.016932,2,True,True,0,4,32,...,0,1,15,1023,False,True,False,7,119,601
4,2018-09-20,-8334631767138808638,685687004,0.016932,2,True,True,0,4,32,...,0,1,15,1023,False,True,False,7,119,601


In [69]:
joined_last_week.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,FN,Active,club_member_status,fashion_news_frequency,age,...,index_code,index_group_no,section_no,garment_group_no,is_for_male,is_for_female,is_for_mama,product_group_no,diff_dat_last_buy,diff_dat
49,2018-09-20,-5667465470176998279,649356002,0.027441,1,True,True,0,4,41,...,6,4,46,1003,True,False,False,7,720,0
50,2018-09-20,-5667465470176998279,579941002,0.019814,1,True,True,0,4,41,...,5,4,41,1006,True,False,False,6,720,0
51,2018-09-20,-5667465470176998279,629760002,0.015237,1,True,True,0,4,41,...,5,4,41,1006,True,False,False,6,720,0
52,2018-09-20,-5667465470176998279,625229004,0.019814,1,True,True,0,4,41,...,5,4,41,1005,True,False,False,6,720,0
61,2018-09-20,-232048505547517783,613456009,0.016932,2,True,True,0,4,28,...,0,1,15,1005,False,True,False,7,720,0


In [70]:
joined_recent = joined_feature.query(f"'{feature_date}' <= t_dat").reset_index(drop=True) # 1 week ago
joined_recent2 = joined_feature.query(f"'{feature_date2}' <= t_dat and t_dat < '{feature_date}'").reset_index(drop=True) # 2 weeks ago

### dynamic customer features

In [71]:
# customerごとの平均購入価格, key: customer_id
c_mean_price = joined_feature.groupby('customer_id')['price'].agg('mean').reset_index(name='c_mean_price')
customers = pd.merge(customers, c_mean_price, on='customer_id', how='left')
customers['c_mean_price'] = customers['c_mean_price'].fillna(0).astype('float16')                
del c_mean_price; gc.collect()
customers['c_mean_price'].head()

0    0.030899
1    0.030258
2    0.037872
3    0.030487
4    0.036133
Name: c_mean_price, dtype: float16

In [72]:
# customerごとの購入価格の標準偏差, key: customer_id
c_std_price = joined_feature.groupby('customer_id')['price'].agg('std').reset_index(name='c_std_price')
customers = pd.merge(customers, c_std_price, on='customer_id', how='left')
customers['c_std_price'] = customers['c_std_price'].fillna(0).astype('float16')                
del c_std_price; gc.collect()
customers['c_std_price'].head()

0    0.015717
1    0.016953
2    0.016449
3    0.000000
4    0.012634
Name: c_std_price, dtype: float16

In [73]:
# customerごとの合計cv数, key: customer_id
c_cv_total = joined_feature.groupby('customer_id').size().reset_index(name='c_cv_total')
customers = pd.merge(customers, c_cv_total, on='customer_id', how='left')
del c_cv_total; gc.collect()
customers['c_cv_total'] = customers['c_cv_total'].fillna(0).astype('int32')
customers['c_cv_total'].head()

0    21
1    86
2    17
3     2
4    13
Name: c_cv_total, dtype: int32

In [74]:
# 直近1週間、customerごとの合計cv数, key: customer_id
c_cv_recent = joined_recent.groupby('customer_id').size().reset_index(name='c_cv_recent')
customers = pd.merge(customers, c_cv_recent, on='customer_id', how='left')
del c_cv_recent; gc.collect()
customers['c_cv_recent'] = customers['c_cv_recent'].fillna(0).astype('int32')
customers['c_cv_recent'].head()

0    1
1    0
2    0
3    0
4    0
Name: c_cv_recent, dtype: int32

In [75]:
# customerごとの平均sales_channel_id, key: customer_id
c_mean_sales_channel_id = joined_feature.groupby('customer_id')['sales_channel_id'].agg('mean').reset_index(name='c_mean_sales_channel_id')
customers = pd.merge(customers, c_mean_sales_channel_id, on='customer_id', how='left')
customers['c_mean_sales_channel_id'] = customers['c_mean_sales_channel_id'].fillna(0).astype('float16')                
del c_mean_sales_channel_id; gc.collect()
customers['c_mean_sales_channel_id'].head()

0    1.571289
1    1.941406
2    2.000000
3    2.000000
4    1.845703
Name: c_mean_sales_channel_id, dtype: float16

In [76]:
# customerごとの平均is_for_male, key: customer_id
c_mean_is_for_male = joined_feature.groupby('customer_id')['is_for_male'].agg('mean').reset_index(name='c_mean_is_for_male')
customers = pd.merge(customers, c_mean_is_for_male, on='customer_id', how='left')
customers['c_mean_is_for_male'] = customers['c_mean_is_for_male'].fillna(0).astype('float16')                
del c_mean_is_for_male; gc.collect()
customers['c_mean_is_for_male'].head()

0    0.142822
1    0.011627
2    0.235352
3    0.000000
4    0.000000
Name: c_mean_is_for_male, dtype: float16

In [77]:
# customerごとの平均is_for_female, key: customer_id
c_mean_is_for_female = joined_feature.groupby('customer_id')['is_for_female'].agg('mean').reset_index(name='c_mean_is_for_female')
customers = pd.merge(customers, c_mean_is_for_female, on='customer_id', how='left')
customers['c_mean_is_for_female'] = customers['c_mean_is_for_female'].fillna(0).astype('float16')                
del c_mean_is_for_female; gc.collect()
customers['c_mean_is_for_female'].head()

0    0.856934
1    0.976562
2    0.764648
3    1.000000
4    1.000000
Name: c_mean_is_for_female, dtype: float16

In [78]:
# customerごとの平均is_for_mama, key: customer_id
c_mean_is_for_mama = joined_feature.groupby('customer_id')['is_for_mama'].agg('mean').reset_index(name='c_mean_is_for_mama')
customers = pd.merge(customers, c_mean_is_for_mama, on='customer_id', how='left')
customers['c_mean_is_for_mama'] = customers['c_mean_is_for_mama'].fillna(0).astype('float16')                
del c_mean_is_for_mama; gc.collect()
customers['c_mean_is_for_mama'].head()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: c_mean_is_for_mama, dtype: float16

In [79]:
# customerごとの最後の購入が何日前か, key: customer_id
tmp = tmp[['customer_id','diff_dat_last_buy']] # 以前計算したものを再利用
customers = pd.merge(customers, tmp, on='customer_id', how='left')
customers['diff_dat_last_buy'] = customers['diff_dat_last_buy'].fillna(0).astype('int16')                
del tmp; gc.collect()
customers['diff_dat_last_buy'].head()

0      4
1     63
2    144
3    458
4     28
Name: diff_dat_last_buy, dtype: int16

### dynamic article features

In [80]:
# articleごとの平均購入価格, key: article_id
a_mean_price = joined_feature.groupby('article_id')['price'].agg('mean').reset_index(name='a_mean_price')
articles = pd.merge(articles, a_mean_price, on='article_id', how='left')
articles['a_mean_price'] = articles['a_mean_price'].fillna(0).astype('float32')                
del a_mean_price; gc.collect()
articles['a_mean_price'].head()

0    0.008142
1    0.008114
2    0.004980
3    0.020219
4    0.018205
Name: a_mean_price, dtype: float32

In [81]:
# articleごとの合計cv数, key: article_id
a_cv_total = joined_feature.groupby('article_id').size().reset_index(name='a_cv_total')
articles = pd.merge(articles, a_cv_total, on='article_id', how='left')
del a_cv_total; gc.collect()
articles['a_cv_total'] = articles['a_cv_total'].fillna(0).astype('int32')
articles['a_cv_total'].head()

0    10841
1     7244
2      215
3     1044
4      539
Name: a_cv_total, dtype: int32

In [82]:
# 直近1週間、articleごとの合計cv数, key: article_id
a_cv_recent = joined_recent.groupby('article_id').size().reset_index(name='a_cv_recent')
articles = pd.merge(articles, a_cv_recent, on='article_id', how='left')
del a_cv_recent; gc.collect()
articles['a_cv_recent'] = articles['a_cv_recent'].fillna(0).astype('int32')                
articles['a_cv_recent'].head()

0    0
1    4
2    0
3    0
4    0
Name: a_cv_recent, dtype: int32

In [83]:
# 直近1週間、articleごとの合計cv数, online
a_cv_recent_on = joined_recent.query('sales_channel_id == 2').groupby('article_id').size().reset_index(name='a_cv_recent_on')
articles = pd.merge(articles, a_cv_recent_on, on='article_id', how='left')
del a_cv_recent_on; gc.collect()
articles['a_cv_recent_on'] = articles['a_cv_recent_on'].fillna(0).astype('int32')                
articles['a_cv_recent_on'].head()

0    0
1    4
2    0
3    0
4    0
Name: a_cv_recent_on, dtype: int32

In [84]:
# 直近1週間、articleごとの合計cv数, offline
a_cv_recent_off = joined_recent.query('sales_channel_id == 1').groupby('article_id').size().reset_index(name='a_cv_recent_off')
articles = pd.merge(articles, a_cv_recent_off, on='article_id', how='left')
del a_cv_recent_off; gc.collect()
articles['a_cv_recent_off'] = articles['a_cv_recent_off'].fillna(0).astype('int32')                
articles['a_cv_recent_off'].head()

0    0
1    0
2    0
3    0
4    0
Name: a_cv_recent_off, dtype: int32

In [85]:
# 2 weeks ago、articleごとの合計cv数, key: article_id
a_cv_recent2 = joined_recent2.groupby('article_id').size().reset_index(name='a_cv_recent2')
articles = pd.merge(articles, a_cv_recent2, on='article_id', how='left')
del a_cv_recent2; gc.collect()
articles['a_cv_recent2'] = articles['a_cv_recent2'].fillna(0).astype('int32')                
articles['a_cv_recent2'].head()

0    0
1    8
2    0
3    0
4    0
Name: a_cv_recent2, dtype: int32

In [86]:
# 直近1週間、articleごとの合計cv数ranking, key: article_id
articles['a_cv_recent_ranking'] = articles['a_cv_recent'].rank(method='min', ascending=False).astype('int16')
articles.head()

Unnamed: 0,article_id,product_code,product_type_no,graphical_appearance_no,colour_group_code,perceived_colour_value_id,perceived_colour_master_id,department_no,index_code,index_group_no,...,is_for_female,is_for_mama,product_group_no,a_mean_price,a_cv_total,a_cv_recent,a_cv_recent_on,a_cv_recent_off,a_cv_recent2,a_cv_recent_ranking
0,108775015,108775,253,1010016,9,4,5,47,0,1,...,True,False,7,0.008142,10841,0,0,0,0,19363
1,108775044,108775,253,1010016,10,3,9,47,0,1,...,True,False,7,0.008114,7244,4,4,0,8,8260
2,108775051,108775,253,1010017,11,1,9,47,0,1,...,True,False,7,0.00498,215,0,0,0,0,19363
3,110065001,110065,306,1010016,9,4,5,12,1,1,...,True,False,16,0.020219,1044,0,0,0,0,19363
4,110065002,110065,306,1010016,10,3,9,12,1,1,...,True,False,16,0.018205,539,0,0,0,0,19363


In [87]:
# 直近1週間、articleごとの合計cv数ranking, key: article_id
articles['a_cv_recent_ranking2'] = articles['a_cv_recent2'].rank(method='min', ascending=False).astype('int16')
articles.head()

Unnamed: 0,article_id,product_code,product_type_no,graphical_appearance_no,colour_group_code,perceived_colour_value_id,perceived_colour_master_id,department_no,index_code,index_group_no,...,is_for_mama,product_group_no,a_mean_price,a_cv_total,a_cv_recent,a_cv_recent_on,a_cv_recent_off,a_cv_recent2,a_cv_recent_ranking,a_cv_recent_ranking2
0,108775015,108775,253,1010016,9,4,5,47,0,1,...,False,7,0.008142,10841,0,0,0,0,19363,19615
1,108775044,108775,253,1010016,10,3,9,47,0,1,...,False,7,0.008114,7244,4,4,0,8,8260,5879
2,108775051,108775,253,1010017,11,1,9,47,0,1,...,False,7,0.00498,215,0,0,0,0,19363,19615
3,110065001,110065,306,1010016,9,4,5,12,1,1,...,False,16,0.020219,1044,0,0,0,0,19363,19615
4,110065002,110065,306,1010016,10,3,9,12,1,1,...,False,16,0.018205,539,0,0,0,0,19363,19615


In [88]:
# 2週前と1週前を比べたcvの比
articles['a_cv_recent_ratio'] = (articles['a_cv_recent'] + 0.01) / (articles['a_cv_recent2'] + 0.01)
articles['a_cv_recent_ratio'] = articles['a_cv_recent_ratio'].astype('float16')                
articles[['a_cv_recent', 'a_cv_recent2','a_cv_recent_ratio']].head()

Unnamed: 0,a_cv_recent,a_cv_recent2,a_cv_recent_ratio
0,0,0,1.0
1,4,8,0.500488
2,0,0,1.0
3,0,0,1.0
4,0,0,1.0


In [89]:
# articleごとの平均sales_channel_id, key: article_id
a_mean_sales_channel_id = joined_feature.groupby('article_id')['sales_channel_id'].agg('mean').reset_index(name='a_mean_sales_channel_id')
articles = pd.merge(articles, a_mean_sales_channel_id, on='article_id', how='left')
articles['a_mean_sales_channel_id'] = articles['a_mean_sales_channel_id'].fillna(0).astype('float16')                
del a_mean_sales_channel_id; gc.collect()
articles['a_mean_sales_channel_id'].head()

0    1.770508
1    1.709961
2    1.995117
3    1.375000
4    1.655273
Name: a_mean_sales_channel_id, dtype: float16

In [90]:
# articleごとの平均age, key: article_id
a_mean_age = joined_feature.groupby('article_id')['age'].agg('mean').reset_index(name='a_mean_age')
articles = pd.merge(articles, a_mean_age, on='article_id', how='left')
articles['a_mean_age'] = articles['a_mean_age'].fillna(0).astype('int8')                
del a_mean_age; gc.collect()
articles['a_mean_age'].head()

0    34
1    35
2    35
3    37
4    39
Name: a_mean_age, dtype: int8

In [91]:
# articleごとのage標準偏差, key: article_id
a_std_age = joined_feature.groupby('article_id')['age'].agg('std').reset_index(name='a_std_age')
articles = pd.merge(articles, a_std_age, on='article_id', how='left')
articles['a_std_age'] = articles['a_std_age'].fillna(0).astype('float16')                
del a_std_age; gc.collect()
articles['a_std_age'].head()

0    11.968750
1    12.812500
2    12.351562
3    12.289062
4    12.796875
Name: a_std_age, dtype: float16

### join train

In [92]:
joined_train = pd.merge(train_group, customers, on='customer_id', how='left')
del train_group, customers; gc.collect()

0

In [93]:
joined_train = pd.merge(joined_train, articles, on='article_id', how='left')
del articles; gc.collect()

0

In [94]:
joined_train.head()

Unnamed: 0,customer_id,match_len,article_id,match_rank,als_rank,label,FN,Active,club_member_status,fashion_news_frequency,...,a_cv_recent,a_cv_recent_on,a_cv_recent_off,a_cv_recent2,a_cv_recent_ranking,a_cv_recent_ranking2,a_cv_recent_ratio,a_mean_sales_channel_id,a_mean_age,a_std_age
0,-4345663252774903357,3,695632002,0,99,False,False,False,0,2,...,49,3,46,49,1177,1235,1.0,1.679688,36,14.304688
1,-4345663252774903357,3,695632001,1,99,False,False,False,0,2,...,59,2,57,64,961,914,0.921875,1.583984,37,13.96875
2,-4345663252774903357,3,706016001,2,99,False,False,False,0,2,...,683,619,64,701,4,6,0.974121,1.845703,32,12.46875
3,-4345663252774903357,3,915529003,3,99,False,False,False,0,2,...,606,518,88,750,6,5,0.808105,1.80957,35,12.945312
4,-4345663252774903357,3,706016001,4,99,False,False,False,0,2,...,683,619,64,701,4,6,0.974121,1.845703,32,12.46875


### dymanic interactive features

In [95]:
# customer, index_group_no (article素性)ごとの購入数
tmp_df = joined_feature.groupby(['customer_id', 'index_group_no']).size().reset_index(name='ca_index_group_no_num')
joined_train = pd.merge(joined_train, tmp_df, on=['customer_id', 'index_group_no'], how='left')
joined_train['ca_index_group_no_num'] = joined_train['ca_index_group_no_num'].fillna(0).astype('int32')                

# customerの全購入数で規格化、similarityみたいな量
joined_train['index_group_no_similarity'] = (joined_train['ca_index_group_no_num']/joined_train['c_cv_total']).astype('float32')
joined_train.head()

Unnamed: 0,customer_id,match_len,article_id,match_rank,als_rank,label,FN,Active,club_member_status,fashion_news_frequency,...,a_cv_recent_off,a_cv_recent2,a_cv_recent_ranking,a_cv_recent_ranking2,a_cv_recent_ratio,a_mean_sales_channel_id,a_mean_age,a_std_age,ca_index_group_no_num,index_group_no_similarity
0,-4345663252774903357,3,695632002,0,99,False,False,False,0,2,...,46,49,1177,1235,1.0,1.679688,36,14.304688,10,1.0
1,-4345663252774903357,3,695632001,1,99,False,False,False,0,2,...,57,64,961,914,0.921875,1.583984,37,13.96875,10,1.0
2,-4345663252774903357,3,706016001,2,99,False,False,False,0,2,...,64,701,4,6,0.974121,1.845703,32,12.46875,0,0.0
3,-4345663252774903357,3,915529003,3,99,False,False,False,0,2,...,88,750,6,5,0.808105,1.80957,35,12.945312,10,1.0
4,-4345663252774903357,3,706016001,4,99,False,False,False,0,2,...,64,701,4,6,0.974121,1.845703,32,12.46875,0,0.0


In [96]:
# customer, index_code (article素性)ごとの購入数
tmp_df = joined_feature.groupby(['customer_id', 'index_code']).size().reset_index(name='ca_index_code_num')
joined_train = pd.merge(joined_train, tmp_df, on=['customer_id', 'index_code'], how='left')
joined_train['ca_index_code_num'] = joined_train['ca_index_code_num'].fillna(0).astype('int32')                

# customerの全購入数で規格化、similalityみたいな量
joined_train['index_code_similarity'] = (joined_train['ca_index_code_num']/joined_train['c_cv_total']).fillna(0).astype('float32')
joined_train.head()

Unnamed: 0,customer_id,match_len,article_id,match_rank,als_rank,label,FN,Active,club_member_status,fashion_news_frequency,...,a_cv_recent_ranking,a_cv_recent_ranking2,a_cv_recent_ratio,a_mean_sales_channel_id,a_mean_age,a_std_age,ca_index_group_no_num,index_group_no_similarity,ca_index_code_num,index_code_similarity
0,-4345663252774903357,3,695632002,0,99,False,False,False,0,2,...,1177,1235,1.0,1.679688,36,14.304688,10,1.0,7,0.7
1,-4345663252774903357,3,695632001,1,99,False,False,False,0,2,...,961,914,0.921875,1.583984,37,13.96875,10,1.0,7,0.7
2,-4345663252774903357,3,706016001,2,99,False,False,False,0,2,...,4,6,0.974121,1.845703,32,12.46875,0,0.0,0,0.0
3,-4345663252774903357,3,915529003,3,99,False,False,False,0,2,...,6,5,0.808105,1.80957,35,12.945312,10,1.0,7,0.7
4,-4345663252774903357,3,706016001,4,99,False,False,False,0,2,...,4,6,0.974121,1.845703,32,12.46875,0,0.0,0,0.0


In [97]:
# customer, product_gropu_no (article素性)ごとの購入数
tmp_df = joined_feature.groupby(['customer_id', 'product_group_no']).size().reset_index(name='ca_product_group_no_num')
joined_train = pd.merge(joined_train, tmp_df, on=['customer_id', 'product_group_no'], how='left')
joined_train['ca_product_group_no_num'] = joined_train['ca_product_group_no_num'].fillna(0).astype('int32')                

# customerの全購入数で規格化、similalityみたいな量
joined_train['product_group_no_similarity'] = (joined_train['ca_product_group_no_num']/joined_train['c_cv_total']).astype('float32')
joined_train.head()

Unnamed: 0,customer_id,match_len,article_id,match_rank,als_rank,label,FN,Active,club_member_status,fashion_news_frequency,...,a_cv_recent_ratio,a_mean_sales_channel_id,a_mean_age,a_std_age,ca_index_group_no_num,index_group_no_similarity,ca_index_code_num,index_code_similarity,ca_product_group_no_num,product_group_no_similarity
0,-4345663252774903357,3,695632002,0,99,False,False,False,0,2,...,1.0,1.679688,36,14.304688,10,1.0,7,0.7,4,0.4
1,-4345663252774903357,3,695632001,1,99,False,False,False,0,2,...,0.921875,1.583984,37,13.96875,10,1.0,7,0.7,4,0.4
2,-4345663252774903357,3,706016001,2,99,False,False,False,0,2,...,0.974121,1.845703,32,12.46875,0,0.0,0,0.0,0,0.0
3,-4345663252774903357,3,915529003,3,99,False,False,False,0,2,...,0.808105,1.80957,35,12.945312,10,1.0,7,0.7,4,0.4
4,-4345663252774903357,3,706016001,4,99,False,False,False,0,2,...,0.974121,1.845703,32,12.46875,0,0.0,0,0.0,0,0.0


In [98]:
# 同じ商品を何回購入したことがあるか
tmp_df = joined_feature.groupby(['customer_id', 'article_id']).size().reset_index(name='buy_same_before')
tmp_df['buy_same_before'].fillna(0).astype('int16')
joined_train = pd.merge(joined_train, tmp_df, on=['customer_id', 'article_id'], how='left')
joined_train['buy_same_before'] = joined_train['buy_same_before'].fillna(0).astype('int16')                
joined_train.head()

Unnamed: 0,customer_id,match_len,article_id,match_rank,als_rank,label,FN,Active,club_member_status,fashion_news_frequency,...,a_mean_sales_channel_id,a_mean_age,a_std_age,ca_index_group_no_num,index_group_no_similarity,ca_index_code_num,index_code_similarity,ca_product_group_no_num,product_group_no_similarity,buy_same_before
0,-4345663252774903357,3,695632002,0,99,False,False,False,0,2,...,1.679688,36,14.304688,10,1.0,7,0.7,4,0.4,2
1,-4345663252774903357,3,695632001,1,99,False,False,False,0,2,...,1.583984,37,13.96875,10,1.0,7,0.7,4,0.4,1
2,-4345663252774903357,3,706016001,2,99,False,False,False,0,2,...,1.845703,32,12.46875,0,0.0,0,0.0,0,0.0,0
3,-4345663252774903357,3,915529003,3,99,False,False,False,0,2,...,1.80957,35,12.945312,10,1.0,7,0.7,4,0.4,0
4,-4345663252774903357,3,706016001,4,99,False,False,False,0,2,...,1.845703,32,12.46875,0,0.0,0,0.0,0,0.0,0


In [99]:
# 直近1週間で同じ商品を直近何回購入したことがあるか
tmp_df = joined_recent.groupby(['customer_id', 'article_id']).size().reset_index(name='buy_same_before_recent')
joined_train = pd.merge(joined_train, tmp_df, on=['customer_id', 'article_id'], how='left')
joined_train['buy_same_before_recent'] = joined_train['buy_same_before_recent'].fillna(0).astype('int16')                
joined_train.head()

Unnamed: 0,customer_id,match_len,article_id,match_rank,als_rank,label,FN,Active,club_member_status,fashion_news_frequency,...,a_mean_age,a_std_age,ca_index_group_no_num,index_group_no_similarity,ca_index_code_num,index_code_similarity,ca_product_group_no_num,product_group_no_similarity,buy_same_before,buy_same_before_recent
0,-4345663252774903357,3,695632002,0,99,False,False,False,0,2,...,36,14.304688,10,1.0,7,0.7,4,0.4,2,0
1,-4345663252774903357,3,695632001,1,99,False,False,False,0,2,...,37,13.96875,10,1.0,7,0.7,4,0.4,1,0
2,-4345663252774903357,3,706016001,2,99,False,False,False,0,2,...,32,12.46875,0,0.0,0,0.0,0,0.0,0,0
3,-4345663252774903357,3,915529003,3,99,False,False,False,0,2,...,35,12.945312,10,1.0,7,0.7,4,0.4,0,0
4,-4345663252774903357,3,706016001,4,99,False,False,False,0,2,...,32,12.46875,0,0.0,0,0.0,0,0.0,0,0


In [100]:
# 最後の購入から7日以内に同じ商品を直近何回購入したことがあるか
tmp_df = joined_last_week.groupby(['customer_id', 'article_id']).size().reset_index(name='buy_same_last_week')
joined_train = pd.merge(joined_train, tmp_df, on=['customer_id', 'article_id'], how='left')
joined_train['buy_same_last_week'] = joined_train['buy_same_last_week'].fillna(0).astype('int16')                
joined_train.head()

Unnamed: 0,customer_id,match_len,article_id,match_rank,als_rank,label,FN,Active,club_member_status,fashion_news_frequency,...,a_std_age,ca_index_group_no_num,index_group_no_similarity,ca_index_code_num,index_code_similarity,ca_product_group_no_num,product_group_no_similarity,buy_same_before,buy_same_before_recent,buy_same_last_week
0,-4345663252774903357,3,695632002,0,99,False,False,False,0,2,...,14.304688,10,1.0,7,0.7,4,0.4,2,0,1
1,-4345663252774903357,3,695632001,1,99,False,False,False,0,2,...,13.96875,10,1.0,7,0.7,4,0.4,1,0,0
2,-4345663252774903357,3,706016001,2,99,False,False,False,0,2,...,12.46875,0,0.0,0,0.0,0,0.0,0,0,0
3,-4345663252774903357,3,915529003,3,99,False,False,False,0,2,...,12.945312,10,1.0,7,0.7,4,0.4,0,0,0
4,-4345663252774903357,3,706016001,4,99,False,False,False,0,2,...,12.46875,0,0.0,0,0.0,0,0.0,0,0,0


In [101]:
del tmp_df; gc.collect()

0

In [102]:
# delta_price
joined_train['delta_mean_price'] = (joined_train['c_mean_price'] - joined_train['a_mean_price']).astype('float16')
joined_train.head()

Unnamed: 0,customer_id,match_len,article_id,match_rank,als_rank,label,FN,Active,club_member_status,fashion_news_frequency,...,ca_index_group_no_num,index_group_no_similarity,ca_index_code_num,index_code_similarity,ca_product_group_no_num,product_group_no_similarity,buy_same_before,buy_same_before_recent,buy_same_last_week,delta_mean_price
0,-4345663252774903357,3,695632002,0,99,False,False,False,0,2,...,10,1.0,7,0.7,4,0.4,2,0,1,0.011795
1,-4345663252774903357,3,695632001,1,99,False,False,False,0,2,...,10,1.0,7,0.7,4,0.4,1,0,0,0.010323
2,-4345663252774903357,3,706016001,2,99,False,False,False,0,2,...,0,0.0,0,0.0,0,0.0,0,0,0,0.009926
3,-4345663252774903357,3,915529003,3,99,False,False,False,0,2,...,10,1.0,7,0.7,4,0.4,0,0,0,0.009056
4,-4345663252774903357,3,706016001,4,99,False,False,False,0,2,...,0,0.0,0,0.0,0,0.0,0,0,0,0.009926


In [103]:
# delta_age
joined_train['delta_mean_age'] = (joined_train['age'] - joined_train['a_mean_age']).astype('int8')
joined_train.head()

Unnamed: 0,customer_id,match_len,article_id,match_rank,als_rank,label,FN,Active,club_member_status,fashion_news_frequency,...,index_group_no_similarity,ca_index_code_num,index_code_similarity,ca_product_group_no_num,product_group_no_similarity,buy_same_before,buy_same_before_recent,buy_same_last_week,delta_mean_price,delta_mean_age
0,-4345663252774903357,3,695632002,0,99,False,False,False,0,2,...,1.0,7,0.7,4,0.4,2,0,1,0.011795,7
1,-4345663252774903357,3,695632001,1,99,False,False,False,0,2,...,1.0,7,0.7,4,0.4,1,0,0,0.010323,6
2,-4345663252774903357,3,706016001,2,99,False,False,False,0,2,...,0.0,0,0.0,0,0.0,0,0,0,0.009926,11
3,-4345663252774903357,3,915529003,3,99,False,False,False,0,2,...,1.0,7,0.7,4,0.4,0,0,0,0.009056,8
4,-4345663252774903357,3,706016001,4,99,False,False,False,0,2,...,0.0,0,0.0,0,0.0,0,0,0,0.009926,11


In [104]:
# delta_mean_sales_channel_id
joined_train['delta_mean_sales_channel_id'] = (joined_train['c_mean_sales_channel_id'] - joined_train['a_mean_sales_channel_id']).astype('float16')
joined_train.head()

Unnamed: 0,customer_id,match_len,article_id,match_rank,als_rank,label,FN,Active,club_member_status,fashion_news_frequency,...,ca_index_code_num,index_code_similarity,ca_product_group_no_num,product_group_no_similarity,buy_same_before,buy_same_before_recent,buy_same_last_week,delta_mean_price,delta_mean_age,delta_mean_sales_channel_id
0,-4345663252774903357,3,695632002,0,99,False,False,False,0,2,...,7,0.7,4,0.4,2,0,1,0.011795,7,-0.179688
1,-4345663252774903357,3,695632001,1,99,False,False,False,0,2,...,7,0.7,4,0.4,1,0,0,0.010323,6,-0.083984
2,-4345663252774903357,3,706016001,2,99,False,False,False,0,2,...,0,0.0,0,0.0,0,0,0,0.009926,11,-0.345703
3,-4345663252774903357,3,915529003,3,99,False,False,False,0,2,...,7,0.7,4,0.4,0,0,0,0.009056,8,-0.30957
4,-4345663252774903357,3,706016001,4,99,False,False,False,0,2,...,0,0.0,0,0.0,0,0,0,0.009926,11,-0.345703


In [105]:
# delta_mean_is_for_male
joined_train['delta_mean_is_for_male'] = (joined_train['c_mean_is_for_male'] - joined_train['is_for_male']).astype('float16')
joined_train.head()

Unnamed: 0,customer_id,match_len,article_id,match_rank,als_rank,label,FN,Active,club_member_status,fashion_news_frequency,...,index_code_similarity,ca_product_group_no_num,product_group_no_similarity,buy_same_before,buy_same_before_recent,buy_same_last_week,delta_mean_price,delta_mean_age,delta_mean_sales_channel_id,delta_mean_is_for_male
0,-4345663252774903357,3,695632002,0,99,False,False,False,0,2,...,0.7,4,0.4,2,0,1,0.011795,7,-0.179688,0.0
1,-4345663252774903357,3,695632001,1,99,False,False,False,0,2,...,0.7,4,0.4,1,0,0,0.010323,6,-0.083984,0.0
2,-4345663252774903357,3,706016001,2,99,False,False,False,0,2,...,0.0,0,0.0,0,0,0,0.009926,11,-0.345703,0.0
3,-4345663252774903357,3,915529003,3,99,False,False,False,0,2,...,0.7,4,0.4,0,0,0,0.009056,8,-0.30957,0.0
4,-4345663252774903357,3,706016001,4,99,False,False,False,0,2,...,0.0,0,0.0,0,0,0,0.009926,11,-0.345703,0.0


In [106]:
# delta_mean_is_for_male
joined_train['delta_mean_is_for_female'] = (joined_train['c_mean_is_for_female'] - joined_train['is_for_female']).astype('float16')
joined_train.head()

Unnamed: 0,customer_id,match_len,article_id,match_rank,als_rank,label,FN,Active,club_member_status,fashion_news_frequency,...,ca_product_group_no_num,product_group_no_similarity,buy_same_before,buy_same_before_recent,buy_same_last_week,delta_mean_price,delta_mean_age,delta_mean_sales_channel_id,delta_mean_is_for_male,delta_mean_is_for_female
0,-4345663252774903357,3,695632002,0,99,False,False,False,0,2,...,4,0.4,2,0,1,0.011795,7,-0.179688,0.0,0.0
1,-4345663252774903357,3,695632001,1,99,False,False,False,0,2,...,4,0.4,1,0,0,0.010323,6,-0.083984,0.0,0.0
2,-4345663252774903357,3,706016001,2,99,False,False,False,0,2,...,0,0.0,0,0,0,0.009926,11,-0.345703,0.0,0.0
3,-4345663252774903357,3,915529003,3,99,False,False,False,0,2,...,4,0.4,0,0,0,0.009056,8,-0.30957,0.0,0.0
4,-4345663252774903357,3,706016001,4,99,False,False,False,0,2,...,0,0.0,0,0,0,0.009926,11,-0.345703,0.0,0.0


In [107]:
# delta_mean_is_for_mama
joined_train['delta_mean_is_for_mama'] = (joined_train['c_mean_is_for_mama'] - joined_train['is_for_mama']).astype('float16')
joined_train.head()

Unnamed: 0,customer_id,match_len,article_id,match_rank,als_rank,label,FN,Active,club_member_status,fashion_news_frequency,...,product_group_no_similarity,buy_same_before,buy_same_before_recent,buy_same_last_week,delta_mean_price,delta_mean_age,delta_mean_sales_channel_id,delta_mean_is_for_male,delta_mean_is_for_female,delta_mean_is_for_mama
0,-4345663252774903357,3,695632002,0,99,False,False,False,0,2,...,0.4,2,0,1,0.011795,7,-0.179688,0.0,0.0,0.0
1,-4345663252774903357,3,695632001,1,99,False,False,False,0,2,...,0.4,1,0,0,0.010323,6,-0.083984,0.0,0.0,0.0
2,-4345663252774903357,3,706016001,2,99,False,False,False,0,2,...,0.0,0,0,0,0.009926,11,-0.345703,0.0,0.0,0.0
3,-4345663252774903357,3,915529003,3,99,False,False,False,0,2,...,0.4,0,0,0,0.009056,8,-0.30957,0.0,0.0,0.0
4,-4345663252774903357,3,706016001,4,99,False,False,False,0,2,...,0.0,0,0,0,0.009926,11,-0.345703,0.0,0.0,0.0


In [108]:
#del joined_feature, joined_recent; gc.collect()
del joined_recent, joined_last_week; gc.collect()

0

### additional feature

In [109]:
import numpy as np
pairs = np.load(f'../input/pairs_np_{train_start_date}_2.pkl', allow_pickle=True)
#pairs = np.load(f'./pairs_np_2020-09-09_2.pkl', allow_pickle=True)

In [110]:
joined_feature = joined_feature[['t_dat', 'customer_id', 'article_id']]

In [111]:
pairs_df = pd.DataFrame(list(pairs.items()), columns=['article_id', 'article_id_list'])
pairs_df['article_id_rank1'] = pairs_df['article_id_list'].map(lambda x: x[0])
pairs_df['article_id_rank2'] = pairs_df['article_id_list'].map(lambda x: x[1])
pairs_df = pairs_df.drop('article_id_list', axis='columns')

In [112]:
pairs_df.head()

Unnamed: 0,article_id,article_id_rank1,article_id_rank2
0,706016001,706016002,706016003
1,706016002,706016001,706016003
2,372860001,372860002,706016001
3,610776002,610776001,706016001
4,759871002,759871001,706016001


In [113]:
joined_feature = pd.merge(joined_feature, pairs_df, on='article_id', how='left')

In [114]:
del pairs, pairs_df; gc.collect()

0

In [115]:
# 共起1位の商品を何回購入したことがあるか
tmp_df = joined_feature.groupby(['customer_id', 'article_id_rank1']).size().reset_index(name='buy_rank1_before')
tmp_df['buy_rank1_before'].fillna(0).astype('int16')
tmp_df = tmp_df.rename(columns={'article_id_rank1':'article_id'})

joined_train = pd.merge(joined_train, tmp_df, on=['customer_id', 'article_id'], how='left')
joined_train['buy_rank1_before'] = joined_train['buy_rank1_before'].fillna(0).astype('int16')                
joined_train.head()

Unnamed: 0,customer_id,match_len,article_id,match_rank,als_rank,label,FN,Active,club_member_status,fashion_news_frequency,...,buy_same_before,buy_same_before_recent,buy_same_last_week,delta_mean_price,delta_mean_age,delta_mean_sales_channel_id,delta_mean_is_for_male,delta_mean_is_for_female,delta_mean_is_for_mama,buy_rank1_before
0,-4345663252774903357,3,695632002,0,99,False,False,False,0,2,...,2,0,1,0.011795,7,-0.179688,0.0,0.0,0.0,1
1,-4345663252774903357,3,695632001,1,99,False,False,False,0,2,...,1,0,0,0.010323,6,-0.083984,0.0,0.0,0.0,3
2,-4345663252774903357,3,706016001,2,99,False,False,False,0,2,...,0,0,0,0.009926,11,-0.345703,0.0,0.0,0.0,2
3,-4345663252774903357,3,915529003,3,99,False,False,False,0,2,...,0,0,0,0.009056,8,-0.30957,0.0,0.0,0.0,0
4,-4345663252774903357,3,706016001,4,99,False,False,False,0,2,...,0,0,0,0.009926,11,-0.345703,0.0,0.0,0.0,2


In [116]:
# 共起2位の商品を何回購入したことがあるか
tmp_df = joined_feature.groupby(['customer_id', 'article_id_rank2']).size().reset_index(name='buy_rank2_before')
tmp_df['buy_rank2_before'].fillna(0).astype('int16')
tmp_df = tmp_df.rename(columns={'article_id_rank2':'article_id'})

joined_train = pd.merge(joined_train, tmp_df, on=['customer_id', 'article_id'], how='left')
joined_train['buy_rank2_before'] = joined_train['buy_rank2_before'].fillna(0).astype('int16')                
joined_train.head()

Unnamed: 0,customer_id,match_len,article_id,match_rank,als_rank,label,FN,Active,club_member_status,fashion_news_frequency,...,buy_same_before_recent,buy_same_last_week,delta_mean_price,delta_mean_age,delta_mean_sales_channel_id,delta_mean_is_for_male,delta_mean_is_for_female,delta_mean_is_for_mama,buy_rank1_before,buy_rank2_before
0,-4345663252774903357,3,695632002,0,99,False,False,False,0,2,...,0,1,0.011795,7,-0.179688,0.0,0.0,0.0,1,1
1,-4345663252774903357,3,695632001,1,99,False,False,False,0,2,...,0,0,0.010323,6,-0.083984,0.0,0.0,0.0,3,0
2,-4345663252774903357,3,706016001,2,99,False,False,False,0,2,...,0,0,0.009926,11,-0.345703,0.0,0.0,0.0,2,3
3,-4345663252774903357,3,915529003,3,99,False,False,False,0,2,...,0,0,0.009056,8,-0.30957,0.0,0.0,0.0,0,0
4,-4345663252774903357,3,706016001,4,99,False,False,False,0,2,...,0,0,0.009926,11,-0.345703,0.0,0.0,0.0,2,3


In [117]:
# 共起1位の商品を何回したのは最後の購入から何日前か
tmp_df = joined_feature.groupby(['customer_id', 'article_id_rank1']).t_dat.max().reset_index()
tmp_df = tmp_df.rename(columns={'t_dat':'max_dat'})
tmp_df = tmp_df.rename(columns={'article_id_rank1':'article_id'})
tmp_df['diff_dat_last_buy_rank1'] = (train_start_date_dt - tmp_df['max_dat']).dt.days
tmp_df = tmp_df.drop('max_dat', axis='columns')
joined_train = pd.merge(joined_train, tmp_df, on=['customer_id', 'article_id'], how='left')
joined_train['diff_dat_last_buy_rank1'] = joined_train['diff_dat_last_buy_rank1'].fillna(999).astype('int16')                
joined_train.head()

Unnamed: 0,customer_id,match_len,article_id,match_rank,als_rank,label,FN,Active,club_member_status,fashion_news_frequency,...,buy_same_last_week,delta_mean_price,delta_mean_age,delta_mean_sales_channel_id,delta_mean_is_for_male,delta_mean_is_for_female,delta_mean_is_for_mama,buy_rank1_before,buy_rank2_before,diff_dat_last_buy_rank1
0,-4345663252774903357,3,695632002,0,99,False,False,False,0,2,...,1,0.011795,7,-0.179688,0.0,0.0,0.0,1,1,321
1,-4345663252774903357,3,695632001,1,99,False,False,False,0,2,...,0,0.010323,6,-0.083984,0.0,0.0,0.0,3,0,47
2,-4345663252774903357,3,706016001,2,99,False,False,False,0,2,...,0,0.009926,11,-0.345703,0.0,0.0,0.0,2,3,316
3,-4345663252774903357,3,915529003,3,99,False,False,False,0,2,...,0,0.009056,8,-0.30957,0.0,0.0,0.0,0,0,999
4,-4345663252774903357,3,706016001,4,99,False,False,False,0,2,...,0,0.009926,11,-0.345703,0.0,0.0,0.0,2,3,316


In [None]:
# 共起2位の商品を何回したのは最後の購入から何日前か
tmp_df = joined_feature.groupby(['customer_id', 'article_id_rank2']).t_dat.max().reset_index()
tmp_df = tmp_df.rename(columns={'t_dat':'max_dat'})
tmp_df = tmp_df.rename(columns={'article_id_rank2':'article_id'})
tmp_df['diff_dat_last_buy_rank2'] = (train_start_date_dt - tmp_df['max_dat']).dt.days
tmp_df = tmp_df.drop('max_dat', axis='columns')
joined_train = pd.merge(joined_train, tmp_df, on=['customer_id', 'article_id'], how='left')
joined_train['diff_dat_last_buy_rank2'] = joined_train['diff_dat_last_buy_rank2'].fillna(999).astype('int16')                
joined_train.head()

In [None]:
del tmp_df, joined_feature; gc.collect()

## Save

In [None]:
if full_flag:
    joined_train.to_csv(f'train_{train_start_date}_full.csv', index=False)
else:
    joined_train.to_csv(f'train_{train_start_date}.csv', index=False)
    
del joined_train; gc.collect()