### Generate Testing data

In [1]:
import numpy as np
import pandas as pd
import gc
import os
import time
import random
from tqdm.auto import tqdm
import datetime

In [2]:
def visualize_df(df):
    print(df.shape)
    display(df.head())

## Parameters

In [3]:
from datetime import datetime, date, timedelta
feature_date = '2020-09-16' 
test_date_dt = pd.to_datetime(feature_date) + timedelta(days=7)
Nval = 30

## Read Data

In [4]:
# データの読み込み
dir = 'h-and-m-personalized-fashion-recommendations/'
path = '../input/' + dir 

transactions_train = pd.read_csv(path + 'transactions_train.csv')

In [5]:
# save memory https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
# 64文字もあるが、末尾16文字だけでcustomerを一意にid可能
# 使われている文字は0-9, a-fなので16進数で変換してintで扱える
# 16進数 = 4bit, 4bit * 16 / 8 = 8 byte
# 64 byte -> 8 byteに削減, int64で表現可能
transactions_train['customer_id'] = transactions_train['customer_id'].apply(lambda x: int(x[-16:],16) ).astype('int64')

In [6]:
# 提出の形式に合わせてarticle_idの最初に0を加える
#transactions_train['article_id'] = transactions_train['article_id'].map(lambda x: '0' + str(x))
# メモリ削減、復元するには上記の式
transactions_train['article_id'] = transactions_train['article_id'].astype('int32')

In [7]:
transactions_train['t_dat'] = pd.to_datetime(transactions_train['t_dat'])
transactions_train.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,-6846340800584936,663713001,0.050831,2
1,2018-09-20,-6846340800584936,541518023,0.030492,2
2,2018-09-20,-8334631767138808638,505221004,0.015237,2
3,2018-09-20,-8334631767138808638,685687003,0.016932,2
4,2018-09-20,-8334631767138808638,685687004,0.016932,2


In [8]:
feature_data = transactions_train.reset_index(drop=True) # train feature
#feature_data = transactions_train.query(f" t_dat < '{train_start_date}'").reset_index(drop=True) # train feature

In [9]:
del transactions_train; gc.collect()

0

In [10]:
feature_data

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,-6846340800584936,663713001,0.050831,2
1,2018-09-20,-6846340800584936,541518023,0.030492,2
2,2018-09-20,-8334631767138808638,505221004,0.015237,2
3,2018-09-20,-8334631767138808638,685687003,0.016932,2
4,2018-09-20,-8334631767138808638,685687004,0.016932,2
...,...,...,...,...,...
31788319,2020-09-22,4685485978980270934,929511001,0.059305,2
31788320,2020-09-22,4685485978980270934,891322004,0.042356,2
31788321,2020-09-22,3959348689921271969,918325001,0.043203,1
31788322,2020-09-22,-8639340045377511665,833459002,0.006763,1


## Customer features

- customer_idごとに素性を作成

In [11]:
customers = pd.read_csv(path + 'customers.csv')
customers

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,,,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,,,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,,,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,,,ACTIVE,NONE,54.0,5d36574f52495e81f019b680c843c443bd343d5ca5b1c2...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,ACTIVE,Regularly,52.0,25fa5ddee9aac01b35208d01736e57942317d756b32ddd...
...,...,...,...,...,...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,,,ACTIVE,NONE,24.0,7aa399f7e669990daba2d92c577b52237380662f36480b...
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,,,ACTIVE,NONE,21.0,3f47f1279beb72215f4de557d950e0bfa73789d24acb5e...
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,1.0,1.0,ACTIVE,Regularly,21.0,4563fc79215672cd6a863f2b4bf56b8f898f2d96ed590e...
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,1.0,1.0,ACTIVE,Regularly,18.0,8892c18e9bc3dca6aa4000cb8094fc4b51ee8db2ed14d7...


In [12]:
print(customers.memory_usage())
print(customers.dtypes)

Index                          128
customer_id               10975840
FN                        10975840
Active                    10975840
club_member_status        10975840
fashion_news_frequency    10975840
age                       10975840
postal_code               10975840
dtype: int64
customer_id                object
FN                        float64
Active                    float64
club_member_status         object
fashion_news_frequency     object
age                       float64
postal_code                object
dtype: object


In [13]:
customers['customer_id'] = customers['customer_id'].apply(lambda x: int(x[-16:],16) ).astype('int64')
customers['age'] = customers['age'].fillna(0).astype('int8')
customers['FN'] = customers['FN'].fillna(0).astype('bool')
customers['Active'] = customers['Active'].fillna(0).astype('bool')
customers['club_member_status'] = customers['club_member_status'].fillna('NA').astype('category').cat.codes
customers['fashion_news_frequency'] = customers['fashion_news_frequency'].fillna('NA').astype('category').cat.codes
# 長いのでuniqueになるようcategory型にした後に番号をふる
customers['postal_code'] = customers['postal_code'].fillna('NA').astype('category').cat.codes
#customers['postal_code'] = customers['postal_code'].fillna('NA').astype('category')

In [14]:
print(customers.memory_usage())
print(customers.dtypes)

Index                          128
customer_id               10975840
FN                         1371980
Active                     1371980
club_member_status         1371980
fashion_news_frequency     1371980
age                        1371980
postal_code                5487920
dtype: int64
customer_id               int64
FN                         bool
Active                     bool
club_member_status         int8
fashion_news_frequency     int8
age                        int8
postal_code               int32
dtype: object


In [15]:
print(customers['club_member_status'].value_counts())
print(customers['club_member_status'].value_counts())
print(customers['fashion_news_frequency'].value_counts())
print(customers['postal_code'].value_counts)

0    1272491
3      92960
2       6062
1        467
Name: club_member_status, dtype: int64
0    1272491
3      92960
2       6062
1        467
Name: club_member_status, dtype: int64
2    877711
4    477416
1     16009
0       842
3         2
Name: fashion_news_frequency, dtype: int64
<bound method IndexOpsMixin.value_counts of 0          112978
1           57312
2          139156
3          128529
4           52371
            ...  
1371975    169171
1371976     87255
1371977     95707
1371978    188279
1371979     13927
Name: postal_code, Length: 1371980, dtype: int32>


In [16]:
customers

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,6883939031699146327,False,False,0,2,49,112978
1,-7200416642310594310,False,False,0,2,25,57312
2,-6846340800584936,False,False,0,2,24,139156
3,-94071612138601410,False,False,0,2,54,128529
4,-283965518499174310,True,True,0,4,52,52371
...,...,...,...,...,...,...,...
1371975,7551062398649767985,False,False,0,2,24,169171
1371976,-9141402131989464905,False,False,0,2,21,87255
1371977,-8286316756823862684,True,True,0,4,21,95707
1371978,2551401172826382186,True,True,0,4,18,188279


## Article features

In [17]:
articles = pd.read_csv(path + 'articles.csv')
articles.head()

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


In [18]:
print(articles.memory_usage())
print(articles.dtypes)

Index                              128
article_id                      844336
product_code                    844336
prod_name                       844336
product_type_no                 844336
product_type_name               844336
product_group_name              844336
graphical_appearance_no         844336
graphical_appearance_name       844336
colour_group_code               844336
colour_group_name               844336
perceived_colour_value_id       844336
perceived_colour_value_name     844336
perceived_colour_master_id      844336
perceived_colour_master_name    844336
department_no                   844336
department_name                 844336
index_code                      844336
index_name                      844336
index_group_no                  844336
index_group_name                844336
section_no                      844336
section_name                    844336
garment_group_no                844336
garment_group_name              844336
detail_desc              

In [19]:
articles['article_id'] = articles['article_id'].astype('int32')

In [20]:
# add gender features
# https://www.kaggle.com/code/lichtlab/h-m-data-deep-dive-chap-1-understand-article
def set_gender_flg(x):
    x['is_for_male'] = 0
    x['is_for_female'] = 0
    x['is_for_mama'] = 0
    if x['index_group_name'] in ['Ladieswear','Divided']:
        x['is_for_female'] = 1
    if x['index_group_name'] == 'Menswear':
        x['is_for_male'] = 1
    if x['index_group_name'] in ['Baby/Children','Sport']:
        if 'boy' in x['department_name'].lower() or 'men' in x['department_name'].lower():
            x['is_for_male'] = 1
        if 'girl' in x['department_name'].lower() or 'ladies' in x['department_name'].lower():
            x['is_for_female'] = 1
    if x['section_name'] == 'Mama':
        x['is_for_mama'] = 1
    return x

In [21]:
articles = articles.apply(set_gender_flg, axis=1)
articles.head()

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc,is_for_male,is_for_female,is_for_mama
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.,0,1,0
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.,0,1,0
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.,0,1,0
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde...",0,1,0
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde...",0,1,0


In [22]:
# idがあるカラムのnameは消す
drop_list = ['product_type_name', 
             'graphical_appearance_name', 
             'colour_group_name', 
             'perceived_colour_value_name', 
             'perceived_colour_master_name',
             'department_name',
             'index_name',
             'index_group_name',
             'section_name',
             'garment_group_name',
             'prod_name',
             'detail_desc'
            ]

articles = articles.drop(drop_list, axis='columns')

In [23]:
articles['product_code']                = articles['product_code'].fillna(0).astype('int32')                
articles['product_type_no']             = articles['product_type_no'].astype('int32')               
articles['graphical_appearance_no']     = articles['graphical_appearance_no'].astype('int32') 
articles['colour_group_code']           = articles['colour_group_code'].astype('int32')   
articles['perceived_colour_value_id']   = articles['perceived_colour_value_id'].astype('int32')     
articles['perceived_colour_master_id']  = articles['perceived_colour_master_id'].astype('int32')    
articles['department_no']               = articles['department_no'].astype('category').cat.codes                 
articles['index_code']                  = articles['index_code'].astype('category').cat.codes                 
articles['index_group_no']              = articles['index_group_no'].astype('int8')                
articles['section_no']                  = articles['section_no'].astype('int8')                    
articles['garment_group_no']            = articles['garment_group_no'].astype('int16')              
articles['product_group_no']            = articles['product_group_name'].fillna('NA').astype('category').cat.codes.astype('int32') # nameからnoを生成
articles['is_for_male']                 = articles['is_for_male'].astype('bool')              
articles['is_for_female']               = articles['is_for_female'].astype('bool')              
articles['is_for_mama']                 = articles['is_for_mama'].astype('bool')              
articles = articles.drop('product_group_name', axis='columns')

In [24]:
articles

Unnamed: 0,article_id,product_code,product_type_no,graphical_appearance_no,colour_group_code,perceived_colour_value_id,perceived_colour_master_id,department_no,index_code,index_group_no,section_no,garment_group_no,is_for_male,is_for_female,is_for_mama,product_group_no
0,108775015,108775,253,1010016,9,4,5,47,0,1,16,1002,False,True,False,7
1,108775044,108775,253,1010016,10,3,9,47,0,1,16,1002,False,True,False,7
2,108775051,108775,253,1010017,11,1,9,47,0,1,16,1002,False,True,False,7
3,110065001,110065,306,1010016,9,4,5,12,1,1,61,1017,False,True,False,16
4,110065002,110065,306,1010016,10,3,9,12,1,1,61,1017,False,True,False,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105537,953450001,953450,302,1010014,9,4,5,224,4,3,26,1021,True,False,False,13
105538,953763001,953763,253,1010016,9,4,5,66,0,1,2,1005,False,True,False,7
105539,956217002,956217,265,1010016,9,4,5,34,0,1,18,1005,False,True,False,5
105540,957375001,957375,72,1010016,9,4,5,120,3,2,52,1019,False,True,False,0


## dynamic features

In [25]:
feature_data.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,-6846340800584936,663713001,0.050831,2
1,2018-09-20,-6846340800584936,541518023,0.030492,2
2,2018-09-20,-8334631767138808638,505221004,0.015237,2
3,2018-09-20,-8334631767138808638,685687003,0.016932,2
4,2018-09-20,-8334631767138808638,685687004,0.016932,2


In [26]:
joined_feature = pd.merge(feature_data, customers, on='customer_id', how='left')
joined_feature = pd.merge(joined_feature, articles, on='article_id', how='left')
del feature_data; gc.collect()

0

In [27]:
tmp_dt = joined_feature.groupby('customer_id').t_dat.max().reset_index()
tmp_dt = tmp_dt.rename(columns={'t_dat':'max_dat'})
tmp_dt['diff_dat_last_buy'] = (test_date_dt - tmp_dt['max_dat']).dt.days
tmp_dt.columns = ['customer_id','max_dat', 'diff_dat_last_buy']
joined_feature = joined_feature.merge(tmp_dt,on=['customer_id'],how='left')
#del tmp; gc.collect()

In [28]:
joined_feature['diff_dat'] = (joined_feature['max_dat'] - joined_feature['t_dat']).dt.days
joined_feature = joined_feature.drop('max_dat', axis='columns')
joined_last_week = joined_feature.loc[joined_feature['diff_dat']<=6]

In [29]:
joined_feature.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,FN,Active,club_member_status,fashion_news_frequency,age,...,index_code,index_group_no,section_no,garment_group_no,is_for_male,is_for_female,is_for_mama,product_group_no,diff_dat_last_buy,diff_dat
0,2018-09-20,-6846340800584936,663713001,0.050831,2,False,False,0,2,24,...,1,1,61,1017,False,True,False,16,8,726
1,2018-09-20,-6846340800584936,541518023,0.030492,2,False,False,0,2,24,...,1,1,61,1017,False,True,False,16,8,726
2,2018-09-20,-8334631767138808638,505221004,0.015237,2,True,True,0,4,32,...,3,2,58,1003,False,True,False,7,133,601
3,2018-09-20,-8334631767138808638,685687003,0.016932,2,True,True,0,4,32,...,0,1,15,1023,False,True,False,7,133,601
4,2018-09-20,-8334631767138808638,685687004,0.016932,2,True,True,0,4,32,...,0,1,15,1023,False,True,False,7,133,601


In [30]:
joined_last_week.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,FN,Active,club_member_status,fashion_news_frequency,age,...,index_code,index_group_no,section_no,garment_group_no,is_for_male,is_for_female,is_for_mama,product_group_no,diff_dat_last_buy,diff_dat
49,2018-09-20,-5667465470176998279,649356002,0.027441,1,True,True,0,4,41,...,6,4,46,1003,True,False,False,7,734,0
50,2018-09-20,-5667465470176998279,579941002,0.019814,1,True,True,0,4,41,...,5,4,41,1006,True,False,False,6,734,0
51,2018-09-20,-5667465470176998279,629760002,0.015237,1,True,True,0,4,41,...,5,4,41,1006,True,False,False,6,734,0
52,2018-09-20,-5667465470176998279,625229004,0.019814,1,True,True,0,4,41,...,5,4,41,1005,True,False,False,6,734,0
61,2018-09-20,-232048505547517783,613456009,0.016932,2,True,True,0,4,28,...,0,1,15,1005,False,True,False,7,734,0


In [31]:
joined_recent = joined_feature.query(f"'{feature_date}' <= t_dat").reset_index(drop=True)

### dynamic customer features

In [32]:
# customerごとの平均購入価格, key: customer_id
c_mean_price = joined_feature.groupby('customer_id')['price'].agg('mean').reset_index(name='c_mean_price')
customers = pd.merge(customers, c_mean_price, on='customer_id', how='left')
customers['c_mean_price'] = customers['c_mean_price'].fillna(0).astype('float16')                
del c_mean_price; gc.collect()
customers['c_mean_price'].head()

0    0.030899
1    0.030258
2    0.039154
3    0.030487
4    0.036133
Name: c_mean_price, dtype: float16

In [33]:
# customerごとの購入価格の標準偏差, key: customer_id
c_std_price = joined_feature.groupby('customer_id')['price'].agg('std').reset_index(name='c_std_price')
customers = pd.merge(customers, c_std_price, on='customer_id', how='left')
customers['c_std_price'] = customers['c_std_price'].fillna(0).astype('float16')                
del c_std_price; gc.collect()
customers['c_std_price'].head()

0    0.015717
1    0.016953
2    0.016861
3    0.000000
4    0.012634
Name: c_std_price, dtype: float16

In [34]:
# customerごとの合計cv数, key: customer_id
c_cv_total = joined_feature.groupby('customer_id').size().reset_index(name='c_cv_total')
customers = pd.merge(customers, c_cv_total, on='customer_id', how='left')
del c_cv_total; gc.collect()
customers['c_cv_total'] = customers['c_cv_total'].fillna(0).astype('int32')
customers['c_cv_total'].head()

0    21
1    86
2    18
3     2
4    13
Name: c_cv_total, dtype: int32

In [35]:
# 直近1週間、customerごとの合計cv数, key: customer_id
c_cv_recent = joined_recent.groupby('customer_id').size().reset_index(name='c_cv_recent')
customers = pd.merge(customers, c_cv_recent, on='customer_id', how='left')
del c_cv_recent; gc.collect()
customers['c_cv_recent'] = customers['c_cv_recent'].fillna(0).astype('int32')
customers['c_cv_recent'].head()

0    0
1    0
2    0
3    0
4    0
Name: c_cv_recent, dtype: int32

In [36]:
# customerごとの平均sales_channel_id, key: customer_id
c_mean_sales_channel_id = joined_feature.groupby('customer_id')['sales_channel_id'].agg('mean').reset_index(name='c_mean_sales_channel_id')
customers = pd.merge(customers, c_mean_sales_channel_id, on='customer_id', how='left')
customers['c_mean_sales_channel_id'] = customers['c_mean_sales_channel_id'].fillna(0).astype('float16')                
del c_mean_sales_channel_id; gc.collect()
customers['c_mean_sales_channel_id'].head()

0    1.571289
1    1.941406
2    2.000000
3    2.000000
4    1.845703
Name: c_mean_sales_channel_id, dtype: float16

In [37]:
# customerごとの平均is_for_male, key: customer_id
c_mean_is_for_male = joined_feature.groupby('customer_id')['is_for_male'].agg('mean').reset_index(name='c_mean_is_for_male')
customers = pd.merge(customers, c_mean_is_for_male, on='customer_id', how='left')
customers['c_mean_is_for_male'] = customers['c_mean_is_for_male'].fillna(0).astype('float16')                
del c_mean_is_for_male; gc.collect()
customers['c_mean_is_for_male'].head()

0    0.142822
1    0.011627
2    0.222168
3    0.000000
4    0.000000
Name: c_mean_is_for_male, dtype: float16

In [38]:
# customerごとの平均is_for_female, key: customer_id
c_mean_is_for_female = joined_feature.groupby('customer_id')['is_for_female'].agg('mean').reset_index(name='c_mean_is_for_female')
customers = pd.merge(customers, c_mean_is_for_female, on='customer_id', how='left')
customers['c_mean_is_for_female'] = customers['c_mean_is_for_female'].fillna(0).astype('float16')                
del c_mean_is_for_female; gc.collect()
customers['c_mean_is_for_female'].head()

0    0.856934
1    0.976562
2    0.777832
3    1.000000
4    1.000000
Name: c_mean_is_for_female, dtype: float16

In [39]:
# customerごとの平均is_for_mama, key: customer_id
c_mean_is_for_mama = joined_feature.groupby('customer_id')['is_for_mama'].agg('mean').reset_index(name='c_mean_is_for_mama')
customers = pd.merge(customers, c_mean_is_for_mama, on='customer_id', how='left')
customers['c_mean_is_for_mama'] = customers['c_mean_is_for_mama'].fillna(0).astype('float16')                
del c_mean_is_for_mama; gc.collect()
customers['c_mean_is_for_mama'].head()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: c_mean_is_for_mama, dtype: float16

In [41]:
# customerごとの最後の購入が何日前か, key: customer_id
tmp_dt = tmp_dt[['customer_id','diff_dat_last_buy']] # 以前計算したものを再利用
customers = pd.merge(customers, tmp_dt, on='customer_id', how='left')
customers['diff_dat_last_buy'] = customers['diff_dat_last_buy'].fillna(0).astype('int16')                
del tmp_dt; gc.collect()
customers['diff_dat_last_buy'].head()

0     18
1     77
2      8
3    472
4     42
Name: diff_dat_last_buy, dtype: int16

### dynamic article features

In [43]:
# articleごとの平均購入価格, key: article_id
a_mean_price = joined_feature.groupby('article_id')['price'].agg('mean').reset_index(name='a_mean_price')
articles = pd.merge(articles, a_mean_price, on='article_id', how='left')
articles['a_mean_price'] = articles['a_mean_price'].fillna(0).astype('float32')                
del a_mean_price; gc.collect()
articles['a_mean_price'].head()

0    0.008142
1    0.008114
2    0.004980
3    0.020219
4    0.018205
Name: a_mean_price, dtype: float32

In [44]:
# articleごとの合計cv数, key: article_id
a_cv_total = joined_feature.groupby('article_id').size().reset_index(name='a_cv_total')
articles = pd.merge(articles, a_cv_total, on='article_id', how='left')
del a_cv_total; gc.collect()
articles['a_cv_total'] = articles['a_cv_total'].fillna(0).astype('int32')
articles['a_cv_total'].head()

0    10841
1     7250
2      215
3     1044
4      539
Name: a_cv_total, dtype: int32

In [45]:
# 直近1週間、articleごとの合計cv数, key: article_id
a_cv_recent = joined_recent.groupby('article_id').size().reset_index(name='a_cv_recent')
articles = pd.merge(articles, a_cv_recent, on='article_id', how='left')
del a_cv_recent; gc.collect()
articles['a_cv_recent'] = articles['a_cv_recent'].fillna(0).astype('int32')                
articles['a_cv_recent'].head()

0    0
1    3
2    0
3    0
4    0
Name: a_cv_recent, dtype: int32

In [46]:
# 直近1週間、articleごとの合計cv数ranking, key: article_id
articles['a_cv_recent_ranking'] = articles['a_cv_recent'].rank(method='min', ascending=False).astype('int16')
articles.head()

Unnamed: 0,article_id,product_code,product_type_no,graphical_appearance_no,colour_group_code,perceived_colour_value_id,perceived_colour_master_id,department_no,index_code,index_group_no,section_no,garment_group_no,is_for_male,is_for_female,is_for_mama,product_group_no,a_mean_price,a_cv_total,a_cv_recent,a_cv_recent_ranking
0,108775015,108775,253,1010016,9,4,5,47,0,1,16,1002,False,True,False,7,0.008142,10841,0,17987
1,108775044,108775,253,1010016,10,3,9,47,0,1,16,1002,False,True,False,7,0.008114,7250,3,8543
2,108775051,108775,253,1010017,11,1,9,47,0,1,16,1002,False,True,False,7,0.00498,215,0,17987
3,110065001,110065,306,1010016,9,4,5,12,1,1,61,1017,False,True,False,16,0.020219,1044,0,17987
4,110065002,110065,306,1010016,10,3,9,12,1,1,61,1017,False,True,False,16,0.018205,539,0,17987


In [47]:
# articleごとの平均sales_channel_id, key: article_id
a_mean_sales_channel_id = joined_feature.groupby('article_id')['sales_channel_id'].agg('mean').reset_index(name='a_mean_sales_channel_id')
articles = pd.merge(articles, a_mean_sales_channel_id, on='article_id', how='left')
articles['a_mean_sales_channel_id'] = articles['a_mean_sales_channel_id'].fillna(0).astype('float16')                
del a_mean_sales_channel_id; gc.collect()
articles['a_mean_sales_channel_id'].head()

0    1.770508
1    1.709961
2    1.995117
3    1.375000
4    1.655273
Name: a_mean_sales_channel_id, dtype: float16

In [48]:
# articleごとの平均age, key: article_id
a_mean_age = joined_feature.groupby('article_id')['age'].agg('mean').reset_index(name='a_mean_age')
articles = pd.merge(articles, a_mean_age, on='article_id', how='left')
articles['a_mean_age'] = articles['a_mean_age'].fillna(0).astype('int8')                
del a_mean_age; gc.collect()
articles['a_mean_age'].head()

0    34
1    35
2    35
3    37
4    39
Name: a_mean_age, dtype: int8

In [49]:
# articleごとのage標準偏差, key: article_id
a_std_age = joined_feature.groupby('article_id')['age'].agg('std').reset_index(name='a_std_age')
articles = pd.merge(articles, a_std_age, on='article_id', how='left')
articles['a_std_age'] = articles['a_std_age'].fillna(0).astype('float16')                
del a_std_age; gc.collect()
articles['a_std_age'].head()

0    11.968750
1    12.812500
2    12.351562
3    12.289062
4    12.796875
Name: a_std_age, dtype: float16

## Read Testing data

In [50]:
# testing dataの読み込み
test = pd.read_csv(f'./cf_{feature_date}_{Nval}.csv')

In [51]:
# tmp
test = pd.read_csv(f'../input/submission_30.csv')
test['customer_id'] = test['customer_id'].apply(lambda x: int(x[-16:],16) ).astype('int64')
#test['article_id'] = test['article_id'].astype('int32')
test.head()

Unnamed: 0,customer_id,prediction
0,-2324495172285268445,0736530007 0770703004 0736531006 0706016001 09...
1,7487674655891994853,0904422001 0399061026 0706016001 0399061008 09...
2,2122353054375354338,0673901005 0673901012 0706016002 0706016006 07...
3,-3888929115027471328,0632803014 0657852006 0657852008 0669091018 06...
4,3501355564022557122,0933706001 0896152002 0924243001 0924243002 09...


In [52]:
test['prediction'] = test['prediction'].map(lambda x: x.split(' '))
test = test.explode('prediction').reset_index(drop=True)

In [53]:
test = test.rename(columns={'prediction':'article_id'})
test['article_id'] = test['article_id'].astype('int32')
test

Unnamed: 0,customer_id,article_id
0,-2324495172285268445,736530007
1,-2324495172285268445,770703004
2,-2324495172285268445,736531006
3,-2324495172285268445,706016001
4,-2324495172285268445,924243001
...,...,...
41159395,-4411265289383494801,788575004
41159396,-4411265289383494801,928206001
41159397,-4411265289383494801,573085028
41159398,-4411265289383494801,751471043


In [54]:
joined_test = pd.merge(test, customers, on='customer_id', how='left')
del customers, test; gc.collect()

joined_test = pd.merge(joined_test, articles, on='article_id', how='left')
del articles; gc.collect()

0

In [55]:
joined_test.head()

Unnamed: 0,customer_id,article_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,c_mean_price,c_std_price,...,is_for_female,is_for_mama,product_group_no,a_mean_price,a_cv_total,a_cv_recent,a_cv_recent_ranking,a_mean_sales_channel_id,a_mean_age,a_std_age
0,-2324495172285268445,736530007,True,False,0,4,21,183703,0.023712,0.004795,...,True,False,16,0.024174,9222,5,6564,1.712891,30,11.929688
1,-2324495172285268445,770703004,True,False,0,4,21,183703,0.023712,0.004795,...,True,False,7,0.02955,643,0,17987,1.763672,33,14.148438
2,-2324495172285268445,736531006,True,False,0,4,21,183703,0.023712,0.004795,...,True,False,16,0.009669,6153,0,17987,1.696289,31,11.632812
3,-2324495172285268445,706016001,True,False,0,4,21,183703,0.023712,0.004795,...,True,False,6,0.032448,50287,329,33,1.845703,32,12.46875
4,-2324495172285268445,924243001,True,False,0,4,21,183703,0.023712,0.004795,...,True,False,7,0.041505,1705,852,1,1.652344,36,14.929688


### dymanic interactive features

In [56]:
# customer, index_group_no (article素性)ごとの購入数
tmp_df = joined_feature.groupby(['customer_id', 'index_group_no']).size().reset_index(name='ca_index_group_no_num')
joined_test = pd.merge(joined_test, tmp_df, on=['customer_id', 'index_group_no'], how='left')
joined_test['ca_index_group_no_num'] = joined_test['ca_index_group_no_num'].fillna(0).astype('int32')                

# customerの全購入数で規格化、similarityみたいな量
joined_test['index_group_no_similarity'] = (joined_test['ca_index_group_no_num']/joined_test['c_cv_total']).astype('float32')
joined_test.head()

Unnamed: 0,customer_id,article_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,c_mean_price,c_std_price,...,product_group_no,a_mean_price,a_cv_total,a_cv_recent,a_cv_recent_ranking,a_mean_sales_channel_id,a_mean_age,a_std_age,ca_index_group_no_num,index_group_no_similarity
0,-2324495172285268445,736530007,True,False,0,4,21,183703,0.023712,0.004795,...,16,0.024174,9222,5,6564,1.712891,30,11.929688,1,0.5
1,-2324495172285268445,770703004,True,False,0,4,21,183703,0.023712,0.004795,...,7,0.02955,643,0,17987,1.763672,33,14.148438,1,0.5
2,-2324495172285268445,736531006,True,False,0,4,21,183703,0.023712,0.004795,...,16,0.009669,6153,0,17987,1.696289,31,11.632812,1,0.5
3,-2324495172285268445,706016001,True,False,0,4,21,183703,0.023712,0.004795,...,6,0.032448,50287,329,33,1.845703,32,12.46875,1,0.5
4,-2324495172285268445,924243001,True,False,0,4,21,183703,0.023712,0.004795,...,7,0.041505,1705,852,1,1.652344,36,14.929688,1,0.5


In [57]:
# customer, index_code (article素性)ごとの購入数
tmp_df = joined_feature.groupby(['customer_id', 'index_code']).size().reset_index(name='ca_index_code_num')
joined_test = pd.merge(joined_test, tmp_df, on=['customer_id', 'index_code'], how='left')
joined_test['ca_index_code_num'] = joined_test['ca_index_code_num'].fillna(0).astype('int32')                

# customerの全購入数で規格化、similalityみたいな量
joined_test['index_code_similarity'] = (joined_test['ca_index_code_num']/joined_test['c_cv_total']).fillna(0).astype('float32')
joined_test.head()

Unnamed: 0,customer_id,article_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,c_mean_price,c_std_price,...,a_cv_total,a_cv_recent,a_cv_recent_ranking,a_mean_sales_channel_id,a_mean_age,a_std_age,ca_index_group_no_num,index_group_no_similarity,ca_index_code_num,index_code_similarity
0,-2324495172285268445,736530007,True,False,0,4,21,183703,0.023712,0.004795,...,9222,5,6564,1.712891,30,11.929688,1,0.5,1,0.5
1,-2324495172285268445,770703004,True,False,0,4,21,183703,0.023712,0.004795,...,643,0,17987,1.763672,33,14.148438,1,0.5,1,0.5
2,-2324495172285268445,736531006,True,False,0,4,21,183703,0.023712,0.004795,...,6153,0,17987,1.696289,31,11.632812,1,0.5,1,0.5
3,-2324495172285268445,706016001,True,False,0,4,21,183703,0.023712,0.004795,...,50287,329,33,1.845703,32,12.46875,1,0.5,1,0.5
4,-2324495172285268445,924243001,True,False,0,4,21,183703,0.023712,0.004795,...,1705,852,1,1.652344,36,14.929688,1,0.5,0,0.0


In [58]:
# customer, product_gropu_no (article素性)ごとの購入数
tmp_df = joined_feature.groupby(['customer_id', 'product_group_no']).size().reset_index(name='ca_product_group_no_num')
joined_test = pd.merge(joined_test, tmp_df, on=['customer_id', 'product_group_no'], how='left')
joined_test['ca_product_group_no_num'] = joined_test['ca_product_group_no_num'].fillna(0).astype('int32')                

# customerの全購入数で規格化、similalityみたいな量
joined_test['product_group_no_similarity'] = (joined_test['ca_product_group_no_num']/joined_test['c_cv_total']).astype('float32')
joined_test.head()

Unnamed: 0,customer_id,article_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,c_mean_price,c_std_price,...,a_cv_recent_ranking,a_mean_sales_channel_id,a_mean_age,a_std_age,ca_index_group_no_num,index_group_no_similarity,ca_index_code_num,index_code_similarity,ca_product_group_no_num,product_group_no_similarity
0,-2324495172285268445,736530007,True,False,0,4,21,183703,0.023712,0.004795,...,6564,1.712891,30,11.929688,1,0.5,1,0.5,1,0.5
1,-2324495172285268445,770703004,True,False,0,4,21,183703,0.023712,0.004795,...,17987,1.763672,33,14.148438,1,0.5,1,0.5,1,0.5
2,-2324495172285268445,736531006,True,False,0,4,21,183703,0.023712,0.004795,...,17987,1.696289,31,11.632812,1,0.5,1,0.5,1,0.5
3,-2324495172285268445,706016001,True,False,0,4,21,183703,0.023712,0.004795,...,33,1.845703,32,12.46875,1,0.5,1,0.5,0,0.0
4,-2324495172285268445,924243001,True,False,0,4,21,183703,0.023712,0.004795,...,1,1.652344,36,14.929688,1,0.5,0,0.0,1,0.5


In [59]:
# 同じ商品を何回購入したことがあるか
tmp_df = joined_feature.groupby(['customer_id', 'article_id']).size().reset_index(name='buy_same_before')
tmp_df['buy_same_before'].fillna(0).astype('int16')
joined_test = pd.merge(joined_test, tmp_df, on=['customer_id', 'article_id'], how='left')
joined_test['buy_same_before'] = joined_test['buy_same_before'].fillna(0).astype('int16')                
joined_test.head()

Unnamed: 0,customer_id,article_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,c_mean_price,c_std_price,...,a_mean_sales_channel_id,a_mean_age,a_std_age,ca_index_group_no_num,index_group_no_similarity,ca_index_code_num,index_code_similarity,ca_product_group_no_num,product_group_no_similarity,buy_same_before
0,-2324495172285268445,736530007,True,False,0,4,21,183703,0.023712,0.004795,...,1.712891,30,11.929688,1,0.5,1,0.5,1,0.5,1
1,-2324495172285268445,770703004,True,False,0,4,21,183703,0.023712,0.004795,...,1.763672,33,14.148438,1,0.5,1,0.5,1,0.5,1
2,-2324495172285268445,736531006,True,False,0,4,21,183703,0.023712,0.004795,...,1.696289,31,11.632812,1,0.5,1,0.5,1,0.5,0
3,-2324495172285268445,706016001,True,False,0,4,21,183703,0.023712,0.004795,...,1.845703,32,12.46875,1,0.5,1,0.5,0,0.0,0
4,-2324495172285268445,924243001,True,False,0,4,21,183703,0.023712,0.004795,...,1.652344,36,14.929688,1,0.5,0,0.0,1,0.5,0


In [60]:
# 直近1週間で同じ商品を直近何回購入したことがあるか
tmp_df = joined_recent.groupby(['customer_id', 'article_id']).size().reset_index(name='buy_same_before_recent')
joined_test = pd.merge(joined_test, tmp_df, on=['customer_id', 'article_id'], how='left')
joined_test['buy_same_before_recent'] = joined_test['buy_same_before_recent'].fillna(0).astype('int16')                
joined_test.head()

Unnamed: 0,customer_id,article_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,c_mean_price,c_std_price,...,a_mean_age,a_std_age,ca_index_group_no_num,index_group_no_similarity,ca_index_code_num,index_code_similarity,ca_product_group_no_num,product_group_no_similarity,buy_same_before,buy_same_before_recent
0,-2324495172285268445,736530007,True,False,0,4,21,183703,0.023712,0.004795,...,30,11.929688,1,0.5,1,0.5,1,0.5,1,0
1,-2324495172285268445,770703004,True,False,0,4,21,183703,0.023712,0.004795,...,33,14.148438,1,0.5,1,0.5,1,0.5,1,0
2,-2324495172285268445,736531006,True,False,0,4,21,183703,0.023712,0.004795,...,31,11.632812,1,0.5,1,0.5,1,0.5,0,0
3,-2324495172285268445,706016001,True,False,0,4,21,183703,0.023712,0.004795,...,32,12.46875,1,0.5,1,0.5,0,0.0,0,0
4,-2324495172285268445,924243001,True,False,0,4,21,183703,0.023712,0.004795,...,36,14.929688,1,0.5,0,0.0,1,0.5,0,0


In [61]:
# 最後の購入から7日以内に同じ商品を直近何回購入したことがあるか
tmp_df = joined_last_week.groupby(['customer_id', 'article_id']).size().reset_index(name='buy_same_last_week')
joined_test = pd.merge(joined_test, tmp_df, on=['customer_id', 'article_id'], how='left')
joined_test['buy_same_last_week'] = joined_test['buy_same_last_week'].fillna(0).astype('int16')                
joined_test.head()

Unnamed: 0,customer_id,article_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,c_mean_price,c_std_price,...,a_std_age,ca_index_group_no_num,index_group_no_similarity,ca_index_code_num,index_code_similarity,ca_product_group_no_num,product_group_no_similarity,buy_same_before,buy_same_before_recent,buy_same_last_week
0,-2324495172285268445,736530007,True,False,0,4,21,183703,0.023712,0.004795,...,11.929688,1,0.5,1,0.5,1,0.5,1,0,1
1,-2324495172285268445,770703004,True,False,0,4,21,183703,0.023712,0.004795,...,14.148438,1,0.5,1,0.5,1,0.5,1,0,1
2,-2324495172285268445,736531006,True,False,0,4,21,183703,0.023712,0.004795,...,11.632812,1,0.5,1,0.5,1,0.5,0,0,0
3,-2324495172285268445,706016001,True,False,0,4,21,183703,0.023712,0.004795,...,12.46875,1,0.5,1,0.5,0,0.0,0,0,0
4,-2324495172285268445,924243001,True,False,0,4,21,183703,0.023712,0.004795,...,14.929688,1,0.5,0,0.0,1,0.5,0,0,0


In [62]:
del tmp_df; gc.collect()

0

In [63]:
# delta_price
joined_test['delta_mean_price'] = (joined_test['c_mean_price'] - joined_test['a_mean_price']).astype('float16')
joined_test.head()

Unnamed: 0,customer_id,article_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,c_mean_price,c_std_price,...,ca_index_group_no_num,index_group_no_similarity,ca_index_code_num,index_code_similarity,ca_product_group_no_num,product_group_no_similarity,buy_same_before,buy_same_before_recent,buy_same_last_week,delta_mean_price
0,-2324495172285268445,736530007,True,False,0,4,21,183703,0.023712,0.004795,...,1,0.5,1,0.5,1,0.5,1,0,1,-0.000462
1,-2324495172285268445,770703004,True,False,0,4,21,183703,0.023712,0.004795,...,1,0.5,1,0.5,1,0.5,1,0,1,-0.005836
2,-2324495172285268445,736531006,True,False,0,4,21,183703,0.023712,0.004795,...,1,0.5,1,0.5,1,0.5,0,0,0,0.014046
3,-2324495172285268445,706016001,True,False,0,4,21,183703,0.023712,0.004795,...,1,0.5,1,0.5,0,0.0,0,0,0,-0.008736
4,-2324495172285268445,924243001,True,False,0,4,21,183703,0.023712,0.004795,...,1,0.5,0,0.0,1,0.5,0,0,0,-0.017792


In [64]:
# delta_age
joined_test['delta_mean_age'] = (joined_test['age'] - joined_test['a_mean_age']).astype('int8')
joined_test.head()

Unnamed: 0,customer_id,article_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,c_mean_price,c_std_price,...,index_group_no_similarity,ca_index_code_num,index_code_similarity,ca_product_group_no_num,product_group_no_similarity,buy_same_before,buy_same_before_recent,buy_same_last_week,delta_mean_price,delta_mean_age
0,-2324495172285268445,736530007,True,False,0,4,21,183703,0.023712,0.004795,...,0.5,1,0.5,1,0.5,1,0,1,-0.000462,-9
1,-2324495172285268445,770703004,True,False,0,4,21,183703,0.023712,0.004795,...,0.5,1,0.5,1,0.5,1,0,1,-0.005836,-12
2,-2324495172285268445,736531006,True,False,0,4,21,183703,0.023712,0.004795,...,0.5,1,0.5,1,0.5,0,0,0,0.014046,-10
3,-2324495172285268445,706016001,True,False,0,4,21,183703,0.023712,0.004795,...,0.5,1,0.5,0,0.0,0,0,0,-0.008736,-11
4,-2324495172285268445,924243001,True,False,0,4,21,183703,0.023712,0.004795,...,0.5,0,0.0,1,0.5,0,0,0,-0.017792,-15


In [65]:
# delta_mean_sales_channel_id
joined_test['delta_mean_sales_channel_id'] = (joined_test['c_mean_sales_channel_id'] - joined_test['a_mean_sales_channel_id']).astype('float16')
joined_test.head()

Unnamed: 0,customer_id,article_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,c_mean_price,c_std_price,...,ca_index_code_num,index_code_similarity,ca_product_group_no_num,product_group_no_similarity,buy_same_before,buy_same_before_recent,buy_same_last_week,delta_mean_price,delta_mean_age,delta_mean_sales_channel_id
0,-2324495172285268445,736530007,True,False,0,4,21,183703,0.023712,0.004795,...,1,0.5,1,0.5,1,0,1,-0.000462,-9,0.287109
1,-2324495172285268445,770703004,True,False,0,4,21,183703,0.023712,0.004795,...,1,0.5,1,0.5,1,0,1,-0.005836,-12,0.236328
2,-2324495172285268445,736531006,True,False,0,4,21,183703,0.023712,0.004795,...,1,0.5,1,0.5,0,0,0,0.014046,-10,0.303711
3,-2324495172285268445,706016001,True,False,0,4,21,183703,0.023712,0.004795,...,1,0.5,0,0.0,0,0,0,-0.008736,-11,0.154297
4,-2324495172285268445,924243001,True,False,0,4,21,183703,0.023712,0.004795,...,0,0.0,1,0.5,0,0,0,-0.017792,-15,0.347656


In [66]:
# delta_mean_is_for_male
joined_test['delta_mean_is_for_male'] = (joined_test['c_mean_is_for_male'] - joined_test['is_for_male']).astype('float16')
joined_test.head()

Unnamed: 0,customer_id,article_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,c_mean_price,c_std_price,...,index_code_similarity,ca_product_group_no_num,product_group_no_similarity,buy_same_before,buy_same_before_recent,buy_same_last_week,delta_mean_price,delta_mean_age,delta_mean_sales_channel_id,delta_mean_is_for_male
0,-2324495172285268445,736530007,True,False,0,4,21,183703,0.023712,0.004795,...,0.5,1,0.5,1,0,1,-0.000462,-9,0.287109,0.0
1,-2324495172285268445,770703004,True,False,0,4,21,183703,0.023712,0.004795,...,0.5,1,0.5,1,0,1,-0.005836,-12,0.236328,0.0
2,-2324495172285268445,736531006,True,False,0,4,21,183703,0.023712,0.004795,...,0.5,1,0.5,0,0,0,0.014046,-10,0.303711,0.0
3,-2324495172285268445,706016001,True,False,0,4,21,183703,0.023712,0.004795,...,0.5,0,0.0,0,0,0,-0.008736,-11,0.154297,0.0
4,-2324495172285268445,924243001,True,False,0,4,21,183703,0.023712,0.004795,...,0.0,1,0.5,0,0,0,-0.017792,-15,0.347656,0.0


In [67]:
# delta_mean_is_for_male
joined_test['delta_mean_is_for_female'] = (joined_test['c_mean_is_for_female'] - joined_test['is_for_female']).astype('float16')
joined_test.head()

Unnamed: 0,customer_id,article_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,c_mean_price,c_std_price,...,ca_product_group_no_num,product_group_no_similarity,buy_same_before,buy_same_before_recent,buy_same_last_week,delta_mean_price,delta_mean_age,delta_mean_sales_channel_id,delta_mean_is_for_male,delta_mean_is_for_female
0,-2324495172285268445,736530007,True,False,0,4,21,183703,0.023712,0.004795,...,1,0.5,1,0,1,-0.000462,-9,0.287109,0.0,0.0
1,-2324495172285268445,770703004,True,False,0,4,21,183703,0.023712,0.004795,...,1,0.5,1,0,1,-0.005836,-12,0.236328,0.0,0.0
2,-2324495172285268445,736531006,True,False,0,4,21,183703,0.023712,0.004795,...,1,0.5,0,0,0,0.014046,-10,0.303711,0.0,0.0
3,-2324495172285268445,706016001,True,False,0,4,21,183703,0.023712,0.004795,...,0,0.0,0,0,0,-0.008736,-11,0.154297,0.0,0.0
4,-2324495172285268445,924243001,True,False,0,4,21,183703,0.023712,0.004795,...,1,0.5,0,0,0,-0.017792,-15,0.347656,0.0,0.0


In [68]:
# delta_mean_is_for_mama
joined_test['delta_mean_is_for_mama'] = (joined_test['c_mean_is_for_mama'] - joined_test['is_for_mama']).astype('float16')
joined_test.head()

Unnamed: 0,customer_id,article_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,c_mean_price,c_std_price,...,product_group_no_similarity,buy_same_before,buy_same_before_recent,buy_same_last_week,delta_mean_price,delta_mean_age,delta_mean_sales_channel_id,delta_mean_is_for_male,delta_mean_is_for_female,delta_mean_is_for_mama
0,-2324495172285268445,736530007,True,False,0,4,21,183703,0.023712,0.004795,...,0.5,1,0,1,-0.000462,-9,0.287109,0.0,0.0,0.0
1,-2324495172285268445,770703004,True,False,0,4,21,183703,0.023712,0.004795,...,0.5,1,0,1,-0.005836,-12,0.236328,0.0,0.0,0.0
2,-2324495172285268445,736531006,True,False,0,4,21,183703,0.023712,0.004795,...,0.5,0,0,0,0.014046,-10,0.303711,0.0,0.0,0.0
3,-2324495172285268445,706016001,True,False,0,4,21,183703,0.023712,0.004795,...,0.0,0,0,0,-0.008736,-11,0.154297,0.0,0.0,0.0
4,-2324495172285268445,924243001,True,False,0,4,21,183703,0.023712,0.004795,...,0.5,0,0,0,-0.017792,-15,0.347656,0.0,0.0,0.0


In [69]:
del joined_feature, joined_recent; gc.collect()

0

## Save

In [70]:
joined_test.to_csv(f'test_{Nval}.csv', index=False)
del joined_test; gc.collect()

0