### Generate Training data
- 学習データ作成用notebook
- 素性も同時に生成する


In [1]:
import datetime
import gc
import os
import random
import re
import time

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

In [2]:
def visualize_df(df, n=5):
    """Print the shape of ``df`` and render its first ``n`` rows.

    Parameters
    ----------
    df : object exposing ``.shape`` and ``.head(n)`` (e.g. a pandas DataFrame)
    n : int, default 5
        Number of leading rows to show. The default reproduces the previous
        ``df.head()`` behaviour, so existing one-argument calls are unchanged.
    """
    print(df.shape)
    # `display` is not imported in this notebook; it is the rich-output helper
    # that the Jupyter/IPython session provides.
    display(df.head(n))

## Parameters

In [3]:
Nsub = 40
#train_start_date = '2020-09-16' # labelに使うデータ開始日
train_start_date = '2020-09-09' 
full_flag = True # 分類タスクの場合のみ

In [4]:
# 外部から回す場合はここでパラメータ上書き

In [5]:
from datetime import datetime, date, timedelta
train_start_date_dt = pd.to_datetime(train_start_date) 
train_end_date_dt = train_start_date_dt + timedelta(days=7)
feature_date_dt = train_start_date_dt + timedelta(days=-7)
feature_date2_dt = train_start_date_dt + timedelta(days=-14)

In [6]:
train_end_date = str(train_end_date_dt.strftime('%Y-%m-%d'))
feature_date = str(feature_date_dt.strftime('%Y-%m-%d'))
feature_date2 = str(feature_date2_dt.strftime('%Y-%m-%d'))

In [7]:
print(train_start_date, train_end_date, feature_date, feature_date2)

2020-09-09 2020-09-16 2020-09-02 2020-08-26


## Read Data

In [8]:
# Load the raw transaction log.
# The folder name is kept in `data_dir` (not `dir`) so that the builtin dir()
# stays usable for the rest of the session.
data_dir = 'h-and-m-personalized-fashion-recommendations/'
path = '../input/' + data_dir

transactions_train = pd.read_csv(path + 'transactions_train.csv')

In [9]:
# save memory https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
# customer_id is 64 characters long, but (per the discussion linked above) the last
# 16 characters are enough to identify a customer uniquely.
# Only the characters 0-9 and a-f are used, so the suffix can be parsed as hexadecimal and handled as an int.
# One hex digit = 4 bits, 4 bits * 16 / 8 = 8 bytes
# 64 bytes -> 8 bytes, representable as int64
transactions_train['customer_id'] = transactions_train['customer_id'].apply(lambda x: int(x[-16:],16) ).astype('int64')

In [10]:
# 提出の形式に合わせてarticle_idの最初に0を加える
#transactions_train['article_id'] = transactions_train['article_id'].map(lambda x: '0' + str(x))
# メモリ削減、復元するには上記の式
transactions_train['article_id'] = transactions_train['article_id'].astype('int32')

In [11]:
transactions_train['t_dat'] = pd.to_datetime(transactions_train['t_dat'])
transactions_train.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,-6846340800584936,663713001,0.050831,2
1,2018-09-20,-6846340800584936,541518023,0.030492,2
2,2018-09-20,-8334631767138808638,505221004,0.015237,2
3,2018-09-20,-8334631767138808638,685687003,0.016932,2
4,2018-09-20,-8334631767138808638,685687004,0.016932,2


In [12]:
train_data = transactions_train.query(f"'{train_start_date}' <= t_dat and t_dat < '{train_end_date}'").reset_index(drop=True)
visualize_df(train_data)

(255241, 5)


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2020-09-09,-7604547624187760215,399136061,0.08339,2
1,2020-09-09,-7604547624187760215,732842014,0.066712,2
2,2020-09-09,-7604547624187760215,556255001,0.01,2
3,2020-09-09,-7604547624187760215,852219003,0.008322,2
4,2020-09-09,-7604547624187760215,732842021,0.066712,2


In [13]:
feature_data = transactions_train.query(f" t_dat < '{train_start_date}'").reset_index(drop=True) # train feature

In [14]:
del transactions_train; gc.collect()

3

In [15]:
train_data

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2020-09-09,-7604547624187760215,399136061,0.083390,2
1,2020-09-09,-7604547624187760215,732842014,0.066712,2
2,2020-09-09,-7604547624187760215,556255001,0.010000,2
3,2020-09-09,-7604547624187760215,852219003,0.008322,2
4,2020-09-09,-7604547624187760215,732842021,0.066712,2
...,...,...,...,...,...
255236,2020-09-15,-977760742639762210,850917001,0.025407,1
255237,2020-09-15,38700952482392720,853316001,0.008458,1
255238,2020-09-15,38700952482392720,296366006,0.000847,1
255239,2020-09-15,38700952482392720,789769001,0.013542,1


In [16]:
feature_data

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,-6846340800584936,663713001,0.050831,2
1,2018-09-20,-6846340800584936,541518023,0.030492,2
2,2018-09-20,-8334631767138808638,505221004,0.015237,2
3,2018-09-20,-8334631767138808638,685687003,0.016932,2
4,2018-09-20,-8334631767138808638,685687004,0.016932,2
...,...,...,...,...,...
31292767,2020-09-08,4685485978980270934,919786002,0.042356,2
31292768,2020-09-08,795398326275572276,765308002,0.033881,2
31292769,2020-09-08,-8286316756823862684,689365050,0.010017,2
31292770,2020-09-08,-8286316756823862684,884081001,0.012898,2


## Read Matching Phase data

In [17]:
# データの読み込み
match = pd.read_csv(f'../input/submission_{train_start_date}_30.csv')
match['customer_id'] = match['customer_id'].apply(lambda x: int(x[-16:],16) ).astype('int64')
match.head()

Unnamed: 0,customer_id,prediction
0,-4345663252774903357,0695632002 0695632001 0706016001
1,-291594535764411411,0737221004 0737221005 0372860001 0736923006 05...
2,8267039440814068109,0734287002 0734287003 0734287001
3,-1181812018918977698,0865073005 0803324005 0875350002 0865073002 07...
4,-4999317557084312486,0599719024 0734215002 0759901001 0468480025 05...


In [18]:
match.size

2743960

In [19]:
# Whether to keep customers that made no purchase in the label week.
# Not needed for GBDT, but for the classification task dropping them would change the
# positive/negative ratio, so full_flag is set to True in that case.
if not full_flag:
    tmp = pd.Series(train_data['customer_id'].unique(), name='customer_id')
    match = pd.merge(match, tmp, on='customer_id', how='inner')

In [20]:
match['prediction'] = match['prediction'].fillna(' ')
match.head()

Unnamed: 0,customer_id,prediction
0,-4345663252774903357,0695632002 0695632001 0706016001
1,-291594535764411411,0737221004 0737221005 0372860001 0736923006 05...
2,8267039440814068109,0734287002 0734287003 0734287001
3,-1181812018918977698,0865073005 0803324005 0875350002 0865073002 07...
4,-4999317557084312486,0599719024 0734215002 0759901001 0468480025 05...


In [21]:
match['prediction'] = match['prediction'].str.split(' ')
match['prediction'] = match['prediction'].map(lambda x: [int(i) for i in x if i != ''])
match

Unnamed: 0,customer_id,prediction
0,-4345663252774903357,"[695632002, 695632001, 706016001]"
1,-291594535764411411,"[737221004, 737221005, 372860001, 736923006, 5..."
2,8267039440814068109,"[734287002, 734287003, 734287001]"
3,-1181812018918977698,"[865073005, 803324005, 875350002, 865073002, 7..."
4,-4999317557084312486,"[599719024, 734215002, 759901001, 468480025, 5..."
...,...,...
1371975,3104641584622543304,"[399256003, 446224024, 493995001, 613246004, 3..."
1371976,8327270052008857523,"[680263013, 680265005, 683848001, 702118004, 7..."
1371977,9087355039826620500,"[703199003, 783098001, 728162001, 464297007, 1..."
1371978,-133214323572869577,"[685811002, 578630022, 685811022, 816591010, 8..."


### ランキング csvの読み込み

In [22]:
# 事前に出力しておいたrankingデータの読み込み
#simple_ranking = pd.read_csv(f'../input/simple_ranking_{feature_date}_{Nsub}.csv')
simple_ranking = pd.read_csv(f'../input/ranking_{feature_date}_{Nsub}_ch.csv')

In [23]:
simple_ranking = simple_ranking.rename(columns={'prediction':'ranking'})

In [24]:
simple_ranking['ranking'].apply(len).describe()

count    1371980.0
mean         399.0
std            0.0
min          399.0
25%          399.0
50%          399.0
75%          399.0
max          399.0
Name: ranking, dtype: float64

In [25]:
# Split the space-separated ranking string into a list of article ids.
# NOTE: unlike match['prediction'], the elements are left as strings here
# (there is no int() conversion after the split).
simple_ranking['ranking'] = simple_ranking['ranking'].str.split(' ')
simple_ranking

Unnamed: 0,customer_id,ranking
0,6883939031699146327,"[915529003, 706016001, 909916001, 751471001, 8..."
1,-7200416642310594310,"[915526001, 919365008, 706016001, 448509014, 7..."
2,-6846340800584936,"[915526001, 919365008, 706016001, 448509014, 7..."
3,-94071612138601410,"[751471043, 896152002, 751471001, 893059005, 8..."
4,-283965518499174310,"[751471043, 896152002, 751471001, 893059005, 8..."
...,...,...
1371975,7551062398649767985,"[915526001, 919365008, 706016001, 448509014, 7..."
1371976,-9141402131989464905,"[915526001, 919365008, 706016001, 448509014, 7..."
1371977,-8286316756823862684,"[915526001, 919365008, 706016001, 448509014, 7..."
1371978,2551401172826382186,"[915526001, 919365008, 706016001, 448509014, 7..."


In [26]:
simple_ranking['ranking'].apply(len).describe()

count    1371980.0
mean          40.0
std            0.0
min           40.0
25%           40.0
50%           40.0
75%           40.0
max           40.0
Name: ranking, dtype: float64

### ランキングのデータをjoin

In [27]:
match = pd.merge(match, simple_ranking, on='customer_id', how='left')
match.head()

Unnamed: 0,customer_id,prediction,ranking
0,-4345663252774903357,"[695632002, 695632001, 706016001]","[915529003, 706016001, 909916001, 751471001, 8..."
1,-291594535764411411,"[737221004, 737221005, 372860001, 736923006, 5...","[751471043, 762846031, 884319003, 896152002, 8..."
2,8267039440814068109,"[734287002, 734287003, 734287001]","[916468001, 915526001, 372860001, 918292001, 7..."
3,-1181812018918977698,"[865073005, 803324005, 875350002, 865073002, 7...","[915526001, 919365008, 706016001, 448509014, 7..."
4,-4999317557084312486,"[599719024, 734215002, 759901001, 468480025, 5...","[915526001, 919365008, 706016001, 448509014, 7..."


In [28]:
Nmax = 25
match['match_len'] = match['prediction'].apply(len)
match['match_len'] = match['match_len'].astype('int8')

match.loc[match['match_len'] > Nmax, ['match_len']] = Nmax

In [None]:
# Keep only the ranking articles that are not already among the matched candidates.
#
# The ranking ids come straight from str.split() (strings) while `prediction` holds
# ints, so the ids are converted with int() before the membership test. Comparing
# the raw strings against ints never matches, which left every ranking article in
# place and produced duplicate (customer, article) candidates.
#
# Only the first Nmax matched articles are checked, because only those are kept
# when the candidate list is assembled from `prediction[:Nmax] + ranking`. At most
# one ranking article is removed per kept matched article, so the combined list
# never gets shorter than the original ranking list.
def _ranking_without_matched(row):
    """Return the row's ranking ids as ints, minus the matched ids that will be kept."""
    already_matched = set(row['prediction'][:Nmax])
    ranking_ids = (int(article) for article in row['ranking'])
    return [article for article in ranking_ids if article not in already_matched]

match['ranking'] = match.apply(_ranking_without_matched, axis=1)
match['ranking'].apply(len).describe()

In [None]:
# Candidate list per customer: up to Nmax matched articles first, then ranking
# articles, truncated to Nsub entries.
match['rec_list'] = match.apply(lambda x: (x['prediction'][:Nmax]+x['ranking'])[:Nsub], axis = 1)
match = match.drop(['prediction', 'ranking'], axis='columns')
# 0-based position of every candidate inside rec_list. It is derived from the actual
# list length instead of a fixed range(Nsub): the later multi-column explode() needs
# both list columns to have the same number of elements in every row.
match['match_rank'] = match['rec_list'].map(lambda recs: list(range(len(recs))))

In [None]:
match_customer = match['customer_id']
match.head()

In [None]:
match = match.explode(['rec_list','match_rank'])
match = match.rename(columns={'rec_list':'article_id'})
match['article_id'] = match['article_id'].astype('int32')
match['match_rank'] = match['match_rank'].astype('int8')
match['label'] = 0
match

## Add ALS as a feature

In [None]:
# Load the ALS recommendations as well, so their rank can be attached as a feature.
als = pd.read_csv(f'../input/als_{train_start_date}.csv')
als['customer_id'] = als['customer_id'].apply(lambda x: int(x[-16:],16) ).astype('int64')
# Restrict to the customers present in the matching data.
als = pd.merge(als, match_customer, on='customer_id', how='inner')
als['prediction'] = als['prediction'].str.split(' ')
als['prediction'] = als['prediction'].map(lambda x: [int(i) for i in x if i != ''])
# 0-based position of every article in the ALS list. Built from the real list length
# (previously a fixed range(30)) so that the multi-column explode() in the next cell
# also works for rows that do not contain exactly 30 articles.
als['als_rank'] = als['prediction'].map(lambda preds: list(range(len(preds))))
als

In [None]:
als = als.explode(['prediction','als_rank'])
als = als.rename(columns={'prediction':'article_id'})
als['article_id'] = als['article_id'].astype('int32')
als['als_rank'] = als['als_rank'].astype('int8')
als.head()

In [None]:
match = pd.merge(match, als, on=['customer_id', 'article_id'], how='left')
match['als_rank'] = match['als_rank'].fillna(99)
match['als_rank'] = match['als_rank'].astype('int8')

## Add labels

In [None]:
unique_train = train_data[['customer_id','article_id']].drop_duplicates()

In [None]:
unique_train['label'] = 1
unique_train['label'] = unique_train['label'].astype('int32')

In [None]:
match = pd.merge(match, unique_train, on=['customer_id', 'article_id'], how='left')

In [None]:
match['label_y'] = match['label_y'].fillna(0)
match['label'] = match['label_x'] + match['label_y']

In [None]:
match

In [None]:
match = match.drop(['label_x', 'label_y'], axis='columns')
match['label'] = match['label'].astype('bool')
match.head()

In [None]:
# check label dist.
match['label'].value_counts()

In [None]:
match.groupby('customer_id')['label'].agg('mean')

In [None]:
match.groupby('match_rank')['label'].agg('mean')

In [None]:
match.groupby('match_rank')['label'].agg('mean').sum() # matcingの精度の目安

In [None]:
match['match_len'].mean()

In [None]:
train_group = match
del match, unique_train, simple_ranking; gc.collect()

In [None]:
train_group

## Customer features

- customer_idごとに素性を作成

In [None]:
customers = pd.read_csv(path + 'customers.csv')
customers

In [None]:
print(customers.memory_usage())
print(customers.dtypes)

In [None]:
customers['customer_id'] = customers['customer_id'].apply(lambda x: int(x[-16:],16) ).astype('int64')
customers['age'] = customers['age'].fillna(0).astype('int8')
customers['FN'] = customers['FN'].fillna(0).astype('bool')
customers['Active'] = customers['Active'].fillna(0).astype('bool')
customers['club_member_status'] = customers['club_member_status'].fillna('NA').astype('category').cat.codes
customers['fashion_news_frequency'] = customers['fashion_news_frequency'].fillna('NA').astype('category').cat.codes
# 長いのでuniqueになるようcategory型にした後に番号をふる
customers['postal_code'] = customers['postal_code'].fillna('NA').astype('category').cat.codes
#customers['postal_code'] = customers['postal_code'].fillna('NA').astype('category')

In [None]:
print(customers.memory_usage())
print(customers.dtypes)

In [None]:
# Distribution of the encoded categorical customer columns.
# (club_member_status was printed twice before, and postal_code printed the bound
# method because value_counts was not called.)
print(customers['club_member_status'].value_counts())
print(customers['fashion_news_frequency'].value_counts())
print(customers['postal_code'].value_counts())

In [None]:
customers

## Article features

In [None]:
articles = pd.read_csv(path + 'articles.csv')
articles.head()

In [None]:
print(articles.memory_usage())
print(articles.dtypes)

In [None]:
articles['article_id'] = articles['article_id'].astype('int32')

In [None]:
# add gender features
# https://www.kaggle.com/code/lichtlab/h-m-data-deep-dive-chap-1-understand-article
def set_gender_flg(x):
    """Add the 0/1 flags ``is_for_male``, ``is_for_female`` and ``is_for_mama`` to one article row.

    Parameters
    ----------
    x : mapping-like row (e.g. a pandas Series passed by ``DataFrame.apply(axis=1)``)
        Must provide ``index_group_name``, ``department_name`` and ``section_name``.

    Returns
    -------
    The same object ``x`` with the three flag entries set (it is modified in place).

    Rules
    -----
    * 'Ladieswear' / 'Divided'  -> female
    * 'Menswear'                -> male
    * 'Baby/Children' / 'Sport' -> decided from the department name
    * section 'Mama'            -> mama (independent of the rules above)
    """
    x['is_for_male'] = 0
    x['is_for_female'] = 0
    x['is_for_mama'] = 0

    group = x['index_group_name']
    if group in ('Ladieswear', 'Divided'):
        x['is_for_female'] = 1
    elif group == 'Menswear':
        x['is_for_male'] = 1
    elif group in ('Baby/Children', 'Sport'):
        department = x['department_name']
        # A missing department name (None / NaN) simply yields no gender flag.
        department = department.lower() if isinstance(department, str) else ''
        # Keywords must start a word: a plain substring test for 'men' would also
        # fire on 'women' and flag such departments as male.
        if re.search(r'\b(?:boy|men)', department):
            x['is_for_male'] = 1
        if re.search(r'\b(?:girl|ladies)', department):
            x['is_for_female'] = 1

    if x['section_name'] == 'Mama':
        x['is_for_mama'] = 1
    return x

In [None]:
articles = articles.apply(set_gender_flg, axis=1)
articles.head()

In [None]:
# idがあるカラムのnameは消す
drop_list = ['product_type_name', 
             'graphical_appearance_name', 
             'colour_group_name', 
             'perceived_colour_value_name', 
             'perceived_colour_master_name',
             'department_name',
             'index_name',
             'index_group_name',
             'section_name',
             'garment_group_name',
             'prod_name',
             'detail_desc'
            ]

articles = articles.drop(drop_list, axis='columns')

In [None]:
articles['product_code']                = articles['product_code'].fillna(0).astype('int32')                
articles['product_type_no']             = articles['product_type_no'].astype('int32')               
articles['graphical_appearance_no']     = articles['graphical_appearance_no'].astype('int32') 
articles['colour_group_code']           = articles['colour_group_code'].astype('int32')   
articles['perceived_colour_value_id']   = articles['perceived_colour_value_id'].astype('int32')     
articles['perceived_colour_master_id']  = articles['perceived_colour_master_id'].astype('int32')    
articles['department_no']               = articles['department_no'].astype('category').cat.codes                 
articles['index_code']                  = articles['index_code'].astype('category').cat.codes                 
articles['index_group_no']              = articles['index_group_no'].astype('int8')                
articles['section_no']                  = articles['section_no'].astype('int8')                    
articles['garment_group_no']            = articles['garment_group_no'].astype('int16')              
articles['product_group_no']            = articles['product_group_name'].fillna('NA').astype('category').cat.codes.astype('int32') # nameからnoを生成
articles['is_for_male']                 = articles['is_for_male'].astype('bool')              
articles['is_for_female']               = articles['is_for_female'].astype('bool')              
articles['is_for_mama']                 = articles['is_for_mama'].astype('bool')              
articles = articles.drop('product_group_name', axis='columns')

In [None]:
articles

## dynamic features

### train

In [None]:
feature_data.head()

In [None]:
joined_feature = pd.merge(feature_data, customers, on='customer_id', how='left')
joined_feature = pd.merge(joined_feature, articles, on='article_id', how='left')
del feature_data; gc.collect()

In [None]:
joined_feature.head()

In [None]:
tmp = joined_feature.groupby('customer_id').t_dat.max().reset_index()
tmp = tmp.rename(columns={'t_dat':'max_dat'})
tmp['diff_dat_last_buy'] = (train_start_date_dt - tmp['max_dat']).dt.days
tmp.columns = ['customer_id','max_dat', 'diff_dat_last_buy']
joined_feature = joined_feature.merge(tmp,on=['customer_id'],how='left')
#del tmp; gc.collect()

In [None]:
joined_feature['diff_dat'] = (joined_feature['max_dat'] - joined_feature['t_dat']).dt.days
joined_feature = joined_feature.drop('max_dat', axis='columns')
joined_last_week = joined_feature.loc[joined_feature['diff_dat']<=6]

In [None]:
joined_feature.head()

In [None]:
joined_last_week.head()

In [None]:
joined_recent = joined_feature.query(f"'{feature_date}' <= t_dat").reset_index(drop=True) # 1 week ago
joined_recent2 = joined_feature.query(f"'{feature_date2}' <= t_dat and t_dat < '{feature_date}'").reset_index(drop=True) # 2 weeks ago

### dynamic customer features

In [None]:
# customerごとの平均購入価格, key: customer_id
c_mean_price = joined_feature.groupby('customer_id')['price'].agg('mean').reset_index(name='c_mean_price')
customers = pd.merge(customers, c_mean_price, on='customer_id', how='left')
customers['c_mean_price'] = customers['c_mean_price'].fillna(0).astype('float16')                
del c_mean_price; gc.collect()
customers['c_mean_price'].head()

In [None]:
# customerごとの購入価格の標準偏差, key: customer_id
c_std_price = joined_feature.groupby('customer_id')['price'].agg('std').reset_index(name='c_std_price')
customers = pd.merge(customers, c_std_price, on='customer_id', how='left')
customers['c_std_price'] = customers['c_std_price'].fillna(0).astype('float16')                
del c_std_price; gc.collect()
customers['c_std_price'].head()

In [None]:
# customerごとの合計cv数, key: customer_id
c_cv_total = joined_feature.groupby('customer_id').size().reset_index(name='c_cv_total')
customers = pd.merge(customers, c_cv_total, on='customer_id', how='left')
del c_cv_total; gc.collect()
customers['c_cv_total'] = customers['c_cv_total'].fillna(0).astype('int32')
customers['c_cv_total'].head()

In [None]:
# 直近1週間、customerごとの合計cv数, key: customer_id
c_cv_recent = joined_recent.groupby('customer_id').size().reset_index(name='c_cv_recent')
customers = pd.merge(customers, c_cv_recent, on='customer_id', how='left')
del c_cv_recent; gc.collect()
customers['c_cv_recent'] = customers['c_cv_recent'].fillna(0).astype('int32')
customers['c_cv_recent'].head()

In [None]:
# customerごとの平均sales_channel_id, key: customer_id
c_mean_sales_channel_id = joined_feature.groupby('customer_id')['sales_channel_id'].agg('mean').reset_index(name='c_mean_sales_channel_id')
customers = pd.merge(customers, c_mean_sales_channel_id, on='customer_id', how='left')
customers['c_mean_sales_channel_id'] = customers['c_mean_sales_channel_id'].fillna(0).astype('float16')                
del c_mean_sales_channel_id; gc.collect()
customers['c_mean_sales_channel_id'].head()

In [None]:
# customerごとの平均is_for_male, key: customer_id
c_mean_is_for_male = joined_feature.groupby('customer_id')['is_for_male'].agg('mean').reset_index(name='c_mean_is_for_male')
customers = pd.merge(customers, c_mean_is_for_male, on='customer_id', how='left')
customers['c_mean_is_for_male'] = customers['c_mean_is_for_male'].fillna(0).astype('float16')                
del c_mean_is_for_male; gc.collect()
customers['c_mean_is_for_male'].head()

In [None]:
# customerごとの平均is_for_female, key: customer_id
c_mean_is_for_female = joined_feature.groupby('customer_id')['is_for_female'].agg('mean').reset_index(name='c_mean_is_for_female')
customers = pd.merge(customers, c_mean_is_for_female, on='customer_id', how='left')
customers['c_mean_is_for_female'] = customers['c_mean_is_for_female'].fillna(0).astype('float16')                
del c_mean_is_for_female; gc.collect()
customers['c_mean_is_for_female'].head()

In [None]:
# customerごとの平均is_for_mama, key: customer_id
c_mean_is_for_mama = joined_feature.groupby('customer_id')['is_for_mama'].agg('mean').reset_index(name='c_mean_is_for_mama')
customers = pd.merge(customers, c_mean_is_for_mama, on='customer_id', how='left')
customers['c_mean_is_for_mama'] = customers['c_mean_is_for_mama'].fillna(0).astype('float16')                
del c_mean_is_for_mama; gc.collect()
customers['c_mean_is_for_mama'].head()

In [None]:
# Days since each customer's last purchase, key: customer_id
tmp = tmp[['customer_id','diff_dat_last_buy']] # reuse the per-customer frame computed earlier
customers = pd.merge(customers, tmp, on='customer_id', how='left')
# NOTE(review): customers without any purchase in feature_data get 0 here. Real values are
# >= 1 day (feature_data only holds t_dat < train_start_date), so 0 reads as "more recent
# than any actual buyer" — confirm this is intended (the co-occurrence recency features
# further down use 999 for the missing case).
customers['diff_dat_last_buy'] = customers['diff_dat_last_buy'].fillna(0).astype('int16')                
del tmp; gc.collect()
customers['diff_dat_last_buy'].head()

### dynamic article features

In [None]:
# articleごとの平均購入価格, key: article_id
a_mean_price = joined_feature.groupby('article_id')['price'].agg('mean').reset_index(name='a_mean_price')
articles = pd.merge(articles, a_mean_price, on='article_id', how='left')
articles['a_mean_price'] = articles['a_mean_price'].fillna(0).astype('float32')                
del a_mean_price; gc.collect()
articles['a_mean_price'].head()

In [None]:
# articleごとの合計cv数, key: article_id
a_cv_total = joined_feature.groupby('article_id').size().reset_index(name='a_cv_total')
articles = pd.merge(articles, a_cv_total, on='article_id', how='left')
del a_cv_total; gc.collect()
articles['a_cv_total'] = articles['a_cv_total'].fillna(0).astype('int32')
articles['a_cv_total'].head()

In [None]:
# 直近1週間、articleごとの合計cv数, key: article_id
a_cv_recent = joined_recent.groupby('article_id').size().reset_index(name='a_cv_recent')
articles = pd.merge(articles, a_cv_recent, on='article_id', how='left')
del a_cv_recent; gc.collect()
articles['a_cv_recent'] = articles['a_cv_recent'].fillna(0).astype('int32')                
articles['a_cv_recent'].head()

In [None]:
# 直近1週間、articleごとの合計cv数, online
a_cv_recent_on = joined_recent.query('sales_channel_id == 2').groupby('article_id').size().reset_index(name='a_cv_recent_on')
articles = pd.merge(articles, a_cv_recent_on, on='article_id', how='left')
del a_cv_recent_on; gc.collect()
articles['a_cv_recent_on'] = articles['a_cv_recent_on'].fillna(0).astype('int32')                
articles['a_cv_recent_on'].head()

In [None]:
# 直近1週間、articleごとの合計cv数, offline
a_cv_recent_off = joined_recent.query('sales_channel_id == 1').groupby('article_id').size().reset_index(name='a_cv_recent_off')
articles = pd.merge(articles, a_cv_recent_off, on='article_id', how='left')
del a_cv_recent_off; gc.collect()
articles['a_cv_recent_off'] = articles['a_cv_recent_off'].fillna(0).astype('int32')                
articles['a_cv_recent_off'].head()

In [None]:
# 2 weeks ago、articleごとの合計cv数, key: article_id
a_cv_recent2 = joined_recent2.groupby('article_id').size().reset_index(name='a_cv_recent2')
articles = pd.merge(articles, a_cv_recent2, on='article_id', how='left')
del a_cv_recent2; gc.collect()
articles['a_cv_recent2'] = articles['a_cv_recent2'].fillna(0).astype('int32')                
articles['a_cv_recent2'].head()

In [None]:
# Rank of each article by its purchase count over the last week (1 = most purchased), key: article_id
# Stored as int32: a rank can be as large as the number of rows in `articles`, which
# is not guaranteed to fit the int16 maximum (32767); an out-of-range cast would
# silently corrupt the rank.
articles['a_cv_recent_ranking'] = articles['a_cv_recent'].rank(method='min', ascending=False).astype('int32')
articles.head()

In [None]:
# Rank of each article by its purchase count in the week before last, i.e. a_cv_recent2
# (1 = most purchased), key: article_id
# Stored as int32: a rank can be as large as the number of rows in `articles`, which
# is not guaranteed to fit the int16 maximum (32767); an out-of-range cast would
# silently corrupt the rank.
articles['a_cv_recent_ranking2'] = articles['a_cv_recent2'].rank(method='min', ascending=False).astype('int32')
articles.head()

In [None]:
# 2週前と1週前を比べたcvの比
articles['a_cv_recent_ratio'] = (articles['a_cv_recent'] + 0.01) / (articles['a_cv_recent2'] + 0.01)
articles['a_cv_recent_ratio'] = articles['a_cv_recent_ratio'].astype('float16')                
articles[['a_cv_recent', 'a_cv_recent2','a_cv_recent_ratio']].head()

In [None]:
# articleごとの平均sales_channel_id, key: article_id
a_mean_sales_channel_id = joined_feature.groupby('article_id')['sales_channel_id'].agg('mean').reset_index(name='a_mean_sales_channel_id')
articles = pd.merge(articles, a_mean_sales_channel_id, on='article_id', how='left')
articles['a_mean_sales_channel_id'] = articles['a_mean_sales_channel_id'].fillna(0).astype('float16')                
del a_mean_sales_channel_id; gc.collect()
articles['a_mean_sales_channel_id'].head()

In [None]:
# articleごとの平均age, key: article_id
a_mean_age = joined_feature.groupby('article_id')['age'].agg('mean').reset_index(name='a_mean_age')
articles = pd.merge(articles, a_mean_age, on='article_id', how='left')
articles['a_mean_age'] = articles['a_mean_age'].fillna(0).astype('int8')                
del a_mean_age; gc.collect()
articles['a_mean_age'].head()

In [None]:
# articleごとのage標準偏差, key: article_id
a_std_age = joined_feature.groupby('article_id')['age'].agg('std').reset_index(name='a_std_age')
articles = pd.merge(articles, a_std_age, on='article_id', how='left')
articles['a_std_age'] = articles['a_std_age'].fillna(0).astype('float16')                
del a_std_age; gc.collect()
articles['a_std_age'].head()

### join train

In [None]:
joined_train = pd.merge(train_group, customers, on='customer_id', how='left')
del train_group, customers; gc.collect()

In [None]:
joined_train = pd.merge(joined_train, articles, on='article_id', how='left')
del articles; gc.collect()

In [None]:
joined_train.head()

### dynamic interactive features

In [None]:
# Number of purchases per (customer, index_group_no); index_group_no is an article attribute.
tmp_df = joined_feature.groupby(['customer_id', 'index_group_no']).size().reset_index(name='ca_index_group_no_num')
joined_train = pd.merge(joined_train, tmp_df, on=['customer_id', 'index_group_no'], how='left')
joined_train['ca_index_group_no_num'] = joined_train['ca_index_group_no_num'].fillna(0).astype('int32')

# Normalise by the customer's total purchase count — a similarity-like quantity.
# Customers without purchase history give 0/0 = NaN; it is filled with 0, the same way
# index_code_similarity is handled.
joined_train['index_group_no_similarity'] = (joined_train['ca_index_group_no_num']/joined_train['c_cv_total']).fillna(0).astype('float32')
joined_train.head()

In [None]:
# customer, index_code (article素性)ごとの購入数
tmp_df = joined_feature.groupby(['customer_id', 'index_code']).size().reset_index(name='ca_index_code_num')
joined_train = pd.merge(joined_train, tmp_df, on=['customer_id', 'index_code'], how='left')
joined_train['ca_index_code_num'] = joined_train['ca_index_code_num'].fillna(0).astype('int32')                

# customerの全購入数で規格化、similalityみたいな量
joined_train['index_code_similarity'] = (joined_train['ca_index_code_num']/joined_train['c_cv_total']).fillna(0).astype('float32')
joined_train.head()

In [None]:
# Number of purchases per (customer, product_group_no); product_group_no is an article attribute.
tmp_df = joined_feature.groupby(['customer_id', 'product_group_no']).size().reset_index(name='ca_product_group_no_num')
joined_train = pd.merge(joined_train, tmp_df, on=['customer_id', 'product_group_no'], how='left')
joined_train['ca_product_group_no_num'] = joined_train['ca_product_group_no_num'].fillna(0).astype('int32')

# Normalise by the customer's total purchase count — a similarity-like quantity.
# Customers without purchase history give 0/0 = NaN; it is filled with 0, the same way
# index_code_similarity is handled.
joined_train['product_group_no_similarity'] = (joined_train['ca_product_group_no_num']/joined_train['c_cv_total']).fillna(0).astype('float32')
joined_train.head()

In [None]:
# How many times the customer has bought this exact article before.
# (A stand-alone `tmp_df['buy_same_before'].fillna(0).astype('int16')` was removed here:
# its result was never assigned, and the cast is applied after the merge anyway.)
tmp_df = joined_feature.groupby(['customer_id', 'article_id']).size().reset_index(name='buy_same_before')
joined_train = pd.merge(joined_train, tmp_df, on=['customer_id', 'article_id'], how='left')
# Candidates the customer never bought have no match in tmp_df -> NaN -> 0.
joined_train['buy_same_before'] = joined_train['buy_same_before'].fillna(0).astype('int16')
joined_train.head()

In [None]:
# 直近1週間で同じ商品を直近何回購入したことがあるか
tmp_df = joined_recent.groupby(['customer_id', 'article_id']).size().reset_index(name='buy_same_before_recent')
joined_train = pd.merge(joined_train, tmp_df, on=['customer_id', 'article_id'], how='left')
joined_train['buy_same_before_recent'] = joined_train['buy_same_before_recent'].fillna(0).astype('int16')                
joined_train.head()

In [None]:
# 最後の購入から7日以内に同じ商品を直近何回購入したことがあるか
tmp_df = joined_last_week.groupby(['customer_id', 'article_id']).size().reset_index(name='buy_same_last_week')
joined_train = pd.merge(joined_train, tmp_df, on=['customer_id', 'article_id'], how='left')
joined_train['buy_same_last_week'] = joined_train['buy_same_last_week'].fillna(0).astype('int16')                
joined_train.head()

In [None]:
del tmp_df; gc.collect()

In [None]:
# delta_price
joined_train['delta_mean_price'] = (joined_train['c_mean_price'] - joined_train['a_mean_price']).astype('float16')
joined_train.head()

In [None]:
# delta_age
joined_train['delta_mean_age'] = (joined_train['age'] - joined_train['a_mean_age']).astype('int8')
joined_train.head()

In [None]:
# delta_mean_sales_channel_id
joined_train['delta_mean_sales_channel_id'] = (joined_train['c_mean_sales_channel_id'] - joined_train['a_mean_sales_channel_id']).astype('float16')
joined_train.head()

In [None]:
# delta_mean_is_for_male
joined_train['delta_mean_is_for_male'] = (joined_train['c_mean_is_for_male'] - joined_train['is_for_male']).astype('float16')
joined_train.head()

In [None]:
# delta_mean_is_for_female: the customer's mean is_for_female minus the candidate article's is_for_female flag
joined_train['delta_mean_is_for_female'] = (joined_train['c_mean_is_for_female'] - joined_train['is_for_female']).astype('float16')
joined_train.head()

In [None]:
# delta_mean_is_for_mama
joined_train['delta_mean_is_for_mama'] = (joined_train['c_mean_is_for_mama'] - joined_train['is_for_mama']).astype('float16')
joined_train.head()

In [None]:
#del joined_feature, joined_recent; gc.collect()
del joined_recent, joined_last_week; gc.collect()

### additional feature

In [None]:
import numpy as np
# Co-occurrence pairs for this training window. The following cells treat the loaded
# object as a mapping (its .items() are unpacked) whose values are indexed with [0] and [1].
# NOTE(security): allow_pickle=True unpickles the file, which can execute arbitrary code —
# only load files produced by this project's own pipeline.
pairs = np.load(f'../input/pairs_np_{train_start_date}_2.pkl', allow_pickle=True)
#pairs = np.load(f'./pairs_np_2020-09-09_2.pkl', allow_pickle=True)

In [None]:
joined_feature = joined_feature[['t_dat', 'customer_id', 'article_id']]

In [None]:
pairs_df = pd.DataFrame(list(pairs.items()), columns=['article_id', 'article_id_list'])
pairs_df['article_id_rank1'] = pairs_df['article_id_list'].map(lambda x: x[0])
pairs_df['article_id_rank2'] = pairs_df['article_id_list'].map(lambda x: x[1])
pairs_df = pairs_df.drop('article_id_list', axis='columns')

In [None]:
pairs_df.head()

In [None]:
joined_feature = pd.merge(joined_feature, pairs_df, on='article_id', how='left')

In [None]:
del pairs, pairs_df; gc.collect()

In [None]:
# How many past purchases of the customer have the candidate article as their
# rank-1 co-occurring article (article_id_rank1).
# (A stand-alone `tmp_df['buy_rank1_before'].fillna(0).astype('int16')` was removed here:
# its result was never assigned, and the cast is applied after the merge anyway.)
tmp_df = joined_feature.groupby(['customer_id', 'article_id_rank1']).size().reset_index(name='buy_rank1_before')
# Rename so the co-occurring article lines up with the candidate's article_id in the merge.
tmp_df = tmp_df.rename(columns={'article_id_rank1':'article_id'})

joined_train = pd.merge(joined_train, tmp_df, on=['customer_id', 'article_id'], how='left')
joined_train['buy_rank1_before'] = joined_train['buy_rank1_before'].fillna(0).astype('int16')
joined_train.head()

In [None]:
# How many past purchases of the customer have the candidate article as their
# rank-2 co-occurring article (article_id_rank2).
# (A stand-alone `tmp_df['buy_rank2_before'].fillna(0).astype('int16')` was removed here:
# its result was never assigned, and the cast is applied after the merge anyway.)
tmp_df = joined_feature.groupby(['customer_id', 'article_id_rank2']).size().reset_index(name='buy_rank2_before')
# Rename so the co-occurring article lines up with the candidate's article_id in the merge.
tmp_df = tmp_df.rename(columns={'article_id_rank2':'article_id'})

joined_train = pd.merge(joined_train, tmp_df, on=['customer_id', 'article_id'], how='left')
joined_train['buy_rank2_before'] = joined_train['buy_rank2_before'].fillna(0).astype('int16')
joined_train.head()

In [None]:
# Days between train_start_date and the customer's most recent purchase of an article whose
# rank-1 co-occurring article (article_id_rank1) is the candidate article.
tmp_df = joined_feature.groupby(['customer_id', 'article_id_rank1']).t_dat.max().reset_index()
tmp_df = tmp_df.rename(columns={'t_dat':'max_dat'})
# Rename so the co-occurring article lines up with the candidate's article_id in the merge below.
tmp_df = tmp_df.rename(columns={'article_id_rank1':'article_id'})
tmp_df['diff_dat_last_buy_rank1'] = (train_start_date_dt - tmp_df['max_dat']).dt.days
tmp_df = tmp_df.drop('max_dat', axis='columns')
joined_train = pd.merge(joined_train, tmp_df, on=['customer_id', 'article_id'], how='left')
# Rows without such a purchase get 999.
joined_train['diff_dat_last_buy_rank1'] = joined_train['diff_dat_last_buy_rank1'].fillna(999).astype('int16')                
joined_train.head()

In [None]:
# Days between train_start_date and the customer's most recent purchase of an article whose
# rank-2 co-occurring article (article_id_rank2) is the candidate article.
tmp_df = joined_feature.groupby(['customer_id', 'article_id_rank2']).t_dat.max().reset_index()
tmp_df = tmp_df.rename(columns={'t_dat':'max_dat'})
# Rename so the co-occurring article lines up with the candidate's article_id in the merge below.
tmp_df = tmp_df.rename(columns={'article_id_rank2':'article_id'})
tmp_df['diff_dat_last_buy_rank2'] = (train_start_date_dt - tmp_df['max_dat']).dt.days
tmp_df = tmp_df.drop('max_dat', axis='columns')
joined_train = pd.merge(joined_train, tmp_df, on=['customer_id', 'article_id'], how='left')
# Rows without such a purchase get 999.
joined_train['diff_dat_last_buy_rank2'] = joined_train['diff_dat_last_buy_rank2'].fillna(999).astype('int16')                
joined_train.head()

In [None]:
del tmp_df, joined_feature; gc.collect()

## Save

In [None]:
if full_flag:
    joined_train.to_csv(f'train_{train_start_date}_full.csv', index=False)
else:
    joined_train.to_csv(f'train_{train_start_date}.csv', index=False)
    
del joined_train; gc.collect()