# candidates用の特徴量エンジニアリング

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm


from sklearn.model_selection import KFold
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import StratifiedKFold
from nyaggle.feature.category_encoder import TargetEncoder

from nyaggle.experiment import run_experiment

#最大表示列数の指定（ここでは50列を指定）
pd.set_option('display.max_columns', 50)
# 最大表示行数の指定（ここでは50行を指定）
pd.set_option('display.max_rows', 500)

In [2]:
train_log_df = pd.read_csv('../train_log.csv')
test_log_df = pd.read_csv('../test_log.csv')
train_label_df = pd.read_csv('../train_label.csv')
test_session_df = pd.read_csv('../test_session.csv')
yado_df = pd.read_csv('../yado.csv')
# image_embeddings_df = pd.read_parquet('../image_embeddings.parquet')
sample_submission_df = pd.read_csv('../sample_submission.csv')

candidate_train_df = pd.read_csv('../data/candidate_ver16_train.csv')
candidate_test_df = pd.read_csv('../data/candidate_ver16_test.csv')

#### Trainに正例と負例のフラグを付与

In [3]:
# 学習用データ
# 教師データに正例と負例のフラグを付与
train_target = []

# 各セッションに対して候補の宿をチェック
for index, row in tqdm(train_label_df.iterrows(), total=train_label_df.shape[0]):
    session_id = row['session_id']
    yad_no = row['yad_no']
    
    # candidate_ver7_train.csv の該当行を取得（同じインデックスを仮定）
    candidate_row = candidate_train_df.iloc[index]

    # 50個の購入候補の宿に対してチェック
    for col in candidate_train_df.columns:
        yado_no = candidate_row[col]
        is_correct = (yado_no == yad_no)  # 一致するかどうかのチェック
        train_target.append([session_id, yado_no, is_correct])

# 結果をDataFrameに変換
train_df = pd.DataFrame(train_target, columns=['session_id', 'yado_no', 'target']) #区別するためにあえてyad_noではなくyado_noとしている
train_df['target'] = train_df['target'].astype(int)  # target列をint型に変換

# 最初の数行を表示して確認
print(train_df.head())

print('正例と負例の数を確認')
print(train_df['target'].value_counts())

100%|██████████| 288698/288698 [00:53<00:00, 5381.85it/s]


                         session_id  yado_no  target
0  000007603d533d30453cc45d0f3d119f    11882       0
1  000007603d533d30453cc45d0f3d119f     2808       0
2  000007603d533d30453cc45d0f3d119f     5289       0
3  000007603d533d30453cc45d0f3d119f     4101       1
4  000007603d533d30453cc45d0f3d119f     3324       0
正例と負例の数を確認
0    5541315
1     232645
Name: target, dtype: int64


#### 推論用データの作成

In [4]:
# 推論用データにダミーのyado_noを付与
test_session_df['yad_no'] = -1

test_target = []

# 各セッションに対して候補の宿をチェック
for index, row in tqdm(test_session_df.iterrows(), total=test_session_df.shape[0]):
    session_id = row['session_id']
    yad_no = row['yad_no']
    
    # candidate_ver7_test.csv の該当行を取得（同じインデックスを仮定）
    candidate_row = candidate_test_df.iloc[index]

    # 50個の購入候補の宿に対してチェック
    for col in candidate_test_df.columns:
        yado_no = candidate_row[col]
        is_correct = (yado_no == yad_no)  # 一致するかどうかのチェック
        test_target.append([session_id, yado_no, is_correct])

# 結果をDataFrameに変換
test_df = pd.DataFrame(test_target, columns=['session_id', 'yado_no', 'target']) #区別するためにあえてyad_noではなくyado_noとしている
test_df['target'] = test_df['target'].astype(int)  # target列をint型に変換

# 最初の数行を表示して確認
print(test_df.head())

print('正例と負例の数を確認')
print(test_df['target'].value_counts()) # すべて0になるはず

100%|██████████| 174700/174700 [00:31<00:00, 5496.66it/s]


                         session_id  yado_no  target
0  00001149e9c73985425197104712478c     3560       0
1  00001149e9c73985425197104712478c    11561       0
2  00001149e9c73985425197104712478c     4714       0
3  00001149e9c73985425197104712478c     2680       0
4  00001149e9c73985425197104712478c     4420       0
正例と負例の数を確認
0    3494000
Name: target, dtype: int64


#### seq_noを追加

In [5]:
# seq_noをマージする

# train_log の session_id と yad_no の組み合わせに基づいて辞書を作成
seq_no_dict_train = dict(zip(zip(train_log_df['session_id'], train_log_df['yad_no']), train_log_df['seq_no']))
# train_df に seq_no 列を追加（tqdm で進捗表示）
train_df['seq_no'] = [seq_no_dict_train.get((row['session_id'], row['yado_no']), -1) for row in tqdm(train_df.to_dict('records'))]
# 結果の確認
print(train_df.head())


# test_log の session_id と yad_no の組み合わせに基づいて辞書を作成
seq_no_dict_test = dict(zip(zip(test_log_df['session_id'], test_log_df['yad_no']), test_log_df['seq_no']))
# test_df に seq_no 列を追加（tqdm で進捗表示）
test_df['seq_no'] = [seq_no_dict_test.get((row['session_id'], row['yado_no']), -1) for row in tqdm(test_df.to_dict('records'))]
# 結果の確認
print(test_df.head())

100%|██████████| 5773960/5773960 [00:02<00:00, 2452432.70it/s]


                         session_id  yado_no  target  seq_no
0  000007603d533d30453cc45d0f3d119f    11882       0      -1
1  000007603d533d30453cc45d0f3d119f     2808       0      -1
2  000007603d533d30453cc45d0f3d119f     5289       0      -1
3  000007603d533d30453cc45d0f3d119f     4101       1      -1
4  000007603d533d30453cc45d0f3d119f     3324       0      -1


100%|██████████| 3494000/3494000 [00:01<00:00, 2659640.46it/s]


                         session_id  yado_no  target  seq_no
0  00001149e9c73985425197104712478c     3560       0       0
1  00001149e9c73985425197104712478c    11561       0      -1
2  00001149e9c73985425197104712478c     4714       0      -1
3  00001149e9c73985425197104712478c     2680       0      -1
4  00001149e9c73985425197104712478c     4420       0      -1


In [6]:
train_df.head()

Unnamed: 0,session_id,yado_no,target,seq_no
0,000007603d533d30453cc45d0f3d119f,11882,0,-1
1,000007603d533d30453cc45d0f3d119f,2808,0,-1
2,000007603d533d30453cc45d0f3d119f,5289,0,-1
3,000007603d533d30453cc45d0f3d119f,4101,1,-1
4,000007603d533d30453cc45d0f3d119f,3324,0,-1


In [7]:
train_df[train_df['session_id'] == '0007dd71a9a78c567084374a66e38139']

Unnamed: 0,session_id,yado_no,target,seq_no
720,0007dd71a9a78c567084374a66e38139,2927,1,4
721,0007dd71a9a78c567084374a66e38139,6199,0,-1
722,0007dd71a9a78c567084374a66e38139,12089,0,-1
723,0007dd71a9a78c567084374a66e38139,12425,0,-1
724,0007dd71a9a78c567084374a66e38139,13386,0,-1
725,0007dd71a9a78c567084374a66e38139,11850,0,-1
726,0007dd71a9a78c567084374a66e38139,9137,0,-1
727,0007dd71a9a78c567084374a66e38139,12986,0,-1
728,0007dd71a9a78c567084374a66e38139,2452,0,-1
729,0007dd71a9a78c567084374a66e38139,2318,0,-1


In [8]:
train_label_df[train_label_df['session_id'] == '0007dd71a9a78c567084374a66e38139']

Unnamed: 0,session_id,yad_no
36,0007dd71a9a78c567084374a66e38139,2927


In [9]:
# 複数あると一番番号が高い4の判定になる
train_log_df[train_log_df['session_id'] == '0007dd71a9a78c567084374a66e38139']

Unnamed: 0,session_id,seq_no,yad_no
45,0007dd71a9a78c567084374a66e38139,0,2927
46,0007dd71a9a78c567084374a66e38139,1,11037
47,0007dd71a9a78c567084374a66e38139,2,2927
48,0007dd71a9a78c567084374a66e38139,3,11037
49,0007dd71a9a78c567084374a66e38139,4,2927
50,0007dd71a9a78c567084374a66e38139,5,11037


In [10]:
#　※複数のseq_noがある場合は、0も混入する
print(train_df['seq_no'].value_counts())
print(test_df['seq_no'].value_counts())

-1    5664507
 0      88128
 1      16052
 2       4125
 3        837
 4        223
 5         65
 6         18
 7          4
 8          1
Name: seq_no, dtype: int64
-1    3430604
 0      51884
 1       8692
 2       2239
 3        428
 4        124
 5         22
 6          7
Name: seq_no, dtype: int64


#### 候補の宿がsession中に閲覧されていたかどうか

In [11]:
# 'seq_no'が−1でなければ1のフラグを立てる
train_df['seen_yad'] = train_df['seq_no'].apply(lambda x: 1 if x != -1 else 0)
test_df['seen_yad'] = test_df['seq_no'].apply(lambda x: 1 if x != -1 else 0)

In [12]:
print(train_df['seen_yad'].value_counts())  
print(test_df['seen_yad'].value_counts())

0    5664507
1     109453
Name: seen_yad, dtype: int64
0    3430604
1      63396
Name: seen_yad, dtype: int64


In [13]:
train_df[train_df['seen_yad']==1].target.value_counts()

1    87693
0    21760
Name: target, dtype: int64

#### 各sessionにおける最大seq_no

In [14]:
# 各セッションの seq_no の最大値を計算
max_seq_no_train = train_log_df.groupby('session_id')['seq_no'].max().reset_index()
max_seq_no_train.rename(columns={'seq_no': 'max_seq_no'}, inplace=True)
# teacher_df と max_seq_no のマージ
train_df = train_df.merge(max_seq_no_train, on='session_id', how='left')
# 結果の確認
print(train_df.head())


# 各セッションの seq_no の最大値を計算
max_seq_no_test = test_log_df.groupby('session_id')['seq_no'].max().reset_index()
max_seq_no_test.rename(columns={'seq_no': 'max_seq_no'}, inplace=True)
# teacher_df と max_seq_no のマージ
test_df = test_df.merge(max_seq_no_test, on='session_id', how='left')
# 結果の確認
print(test_df.head())

                         session_id  yado_no  target  seq_no  seen_yad  \
0  000007603d533d30453cc45d0f3d119f    11882       0      -1         0   
1  000007603d533d30453cc45d0f3d119f     2808       0      -1         0   
2  000007603d533d30453cc45d0f3d119f     5289       0      -1         0   
3  000007603d533d30453cc45d0f3d119f     4101       1      -1         0   
4  000007603d533d30453cc45d0f3d119f     3324       0      -1         0   

   max_seq_no  
0           0  
1           0  
2           0  
3           0  
4           0  
                         session_id  yado_no  target  seq_no  seen_yad  \
0  00001149e9c73985425197104712478c     3560       0       0         1   
1  00001149e9c73985425197104712478c    11561       0      -1         0   
2  00001149e9c73985425197104712478c     4714       0      -1         0   
3  00001149e9c73985425197104712478c     2680       0      -1         0   
4  00001149e9c73985425197104712478c     4420       0      -1         0   

   max_seq_no 

In [15]:
print(train_df['max_seq_no'].value_counts())
print(test_df['max_seq_no'].value_counts())

0    3707720
1    1655860
2     307000
3      80500
4      16660
5       4460
6       1300
7        360
8         80
9         20
Name: max_seq_no, dtype: int64
0    2278800
1     989860
2     169180
3      44540
4       8560
5       2480
6        440
7        140
Name: max_seq_no, dtype: int64


#### 差分の考慮：max_seq_noから(-1以外の要素)でseq_noを引く

In [16]:
# 最初に全ての diff_seq_no を -1 に設定
train_df['diff_seq_no'] = -1
test_df['diff_seq_no'] = -1

# seq_no が -1 以外の行にのみ max_seq_no - seq_no の計算を適用
train_df.loc[train_df['seq_no'] != -1, 'diff_seq_no'] = train_df['max_seq_no'] - train_df['seq_no']
test_df.loc[test_df['seq_no'] != -1, 'diff_seq_no'] = test_df['max_seq_no'] - test_df['seq_no']

# 結果の確認
print(train_df.head())
print(test_df.head())

                         session_id  yado_no  target  seq_no  seen_yad  \
0  000007603d533d30453cc45d0f3d119f    11882       0      -1         0   
1  000007603d533d30453cc45d0f3d119f     2808       0      -1         0   
2  000007603d533d30453cc45d0f3d119f     5289       0      -1         0   
3  000007603d533d30453cc45d0f3d119f     4101       1      -1         0   
4  000007603d533d30453cc45d0f3d119f     3324       0      -1         0   

   max_seq_no  diff_seq_no  
0           0           -1  
1           0           -1  
2           0           -1  
3           0           -1  
4           0           -1  
                         session_id  yado_no  target  seq_no  seen_yad  \
0  00001149e9c73985425197104712478c     3560       0       0         1   
1  00001149e9c73985425197104712478c    11561       0      -1         0   
2  00001149e9c73985425197104712478c     4714       0      -1         0   
3  00001149e9c73985425197104712478c     2680       0      -1         0   
4  00001149

In [17]:
print(train_df['diff_seq_no'].value_counts())
print(test_df['diff_seq_no'].value_counts())

-1    5664507
 1     103312
 2       5684
 3        407
 4         49
 5          1
Name: diff_seq_no, dtype: int64
-1    3430604
 1      60760
 2       2528
 3        106
 4          2
Name: diff_seq_no, dtype: int64


#### diff_seq_no が奇数かどうかの判定 

In [18]:
# diff_seq_no が奇数かどうかの判定（seq_no が -1 の場合は除外）
train_df['is_odd'] = np.where(train_df['seq_no'] != -1, train_df['diff_seq_no'] % 2 == 1, -1)
test_df['is_odd'] = np.where(test_df['seq_no'] != -1, test_df['diff_seq_no'] % 2 == 1, -1)

# seq_no が -1 以外の場合、Trueを0、Falseを1に変換（すでに -1 の場合は変更しない）
train_df['is_odd'] = np.where(train_df['is_odd'] != -1, np.where(train_df['is_odd'], 0, 1), -1)
test_df['is_odd'] = np.where(test_df['is_odd'] != -1, np.where(test_df['is_odd'], 0, 1), -1)

# 結果の確認
print(train_df.head())
print(test_df.head())

                         session_id  yado_no  target  seq_no  seen_yad  \
0  000007603d533d30453cc45d0f3d119f    11882       0      -1         0   
1  000007603d533d30453cc45d0f3d119f     2808       0      -1         0   
2  000007603d533d30453cc45d0f3d119f     5289       0      -1         0   
3  000007603d533d30453cc45d0f3d119f     4101       1      -1         0   
4  000007603d533d30453cc45d0f3d119f     3324       0      -1         0   

   max_seq_no  diff_seq_no  is_odd  
0           0           -1      -1  
1           0           -1      -1  
2           0           -1      -1  
3           0           -1      -1  
4           0           -1      -1  
                         session_id  yado_no  target  seq_no  seen_yad  \
0  00001149e9c73985425197104712478c     3560       0       0         1   
1  00001149e9c73985425197104712478c    11561       0      -1         0   
2  00001149e9c73985425197104712478c     4714       0      -1         0   
3  00001149e9c73985425197104712478c  

In [19]:
print(train_df['is_odd'].value_counts())
print(test_df['is_odd'].value_counts())

-1    5664507
 0     103720
 1       5733
Name: is_odd, dtype: int64
-1    3430604
 0      60866
 1       2530
Name: is_odd, dtype: int64


#### 各sessionにおいて2回以上出現したyad_noがあれば1のフラグを立てる

In [20]:
# 各セッションにおいて2回以上登場する yad_no を特定
yad_no_counts = train_log_df.groupby(['session_id', 'yad_no']).size().reset_index(name='count')
multiple_visits = yad_no_counts[yad_no_counts['count'] > 1]

# (session_id, yad_no) のタプルをキーとして辞書を作成
multiple_visits_dict = {(row['session_id'], row['yad_no']): 1 for _, row in multiple_visits.iterrows()}

# teacher_df に multiple_visits_flag 列を追加（tqdm で進捗表示）
train_df['multiple_visits_flag'] = [multiple_visits_dict.get((row['session_id'], row['yado_no']), 0) 
                                      for row in tqdm(train_df.to_dict('records'))]

# 結果の確認
print(train_df.head())



# 各セッションにおいて2回以上登場する yad_no を特定
yad_no_counts = test_log_df.groupby(['session_id', 'yad_no']).size().reset_index(name='count')
multiple_visits = yad_no_counts[yad_no_counts['count'] > 1]

# (session_id, yad_no) のタプルをキーとして辞書を作成
multiple_visits_dict = {(row['session_id'], row['yad_no']): 1 for _, row in multiple_visits.iterrows()}

# teacher_df に multiple_visits_flag 列を追加（tqdm で進捗表示）
test_df['multiple_visits_flag'] = [multiple_visits_dict.get((row['session_id'], row['yado_no']), 0) 
                                      for row in tqdm(test_df.to_dict('records'))]

# 結果の確認
print(test_df.head())

100%|██████████| 5773960/5773960 [00:01<00:00, 2951919.64it/s]


                         session_id  yado_no  target  seq_no  seen_yad  \
0  000007603d533d30453cc45d0f3d119f    11882       0      -1         0   
1  000007603d533d30453cc45d0f3d119f     2808       0      -1         0   
2  000007603d533d30453cc45d0f3d119f     5289       0      -1         0   
3  000007603d533d30453cc45d0f3d119f     4101       1      -1         0   
4  000007603d533d30453cc45d0f3d119f     3324       0      -1         0   

   max_seq_no  diff_seq_no  is_odd  multiple_visits_flag  
0           0           -1      -1                     0  
1           0           -1      -1                     0  
2           0           -1      -1                     0  
3           0           -1      -1                     0  
4           0           -1      -1                     0  


100%|██████████| 3494000/3494000 [00:01<00:00, 3068097.19it/s]


                         session_id  yado_no  target  seq_no  seen_yad  \
0  00001149e9c73985425197104712478c     3560       0       0         1   
1  00001149e9c73985425197104712478c    11561       0      -1         0   
2  00001149e9c73985425197104712478c     4714       0      -1         0   
3  00001149e9c73985425197104712478c     2680       0      -1         0   
4  00001149e9c73985425197104712478c     4420       0      -1         0   

   max_seq_no  diff_seq_no  is_odd  multiple_visits_flag  
0           1            1       0                     0  
1           1           -1      -1                     0  
2           1           -1      -1                     0  
3           1           -1      -1                     0  
4           1           -1      -1                     0  


In [22]:
print(train_df['multiple_visits_flag'].value_counts())
print(test_df['multiple_visits_flag'].value_counts())

0    5769312
1       4648
Name: multiple_visits_flag, dtype: int64
0    3491359
1       2641
Name: multiple_visits_flag, dtype: int64


### yado_dfに関する特徴量エンジニアリング

In [23]:
yado_df

Unnamed: 0,yad_no,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,wid_cd,ken_cd,lrg_cd,sml_cd
0,1,0,129.0,1.0,0,1.0,,,1.0,f0112abf369fb03cdc5f5309300913da,072c85e1653e10c9c7dd065ad007125a,449c52ef581d5f9ef311189469a0520e,677a32689cd1ad74e867f1fbe43a3e1c
1,2,0,23.0,1.0,0,,,,,d86102dd9c232bade9a97dccad40df48,b4d2fb4e51ea7bca80eb1270aa474a54,5c9a8f48e9df0234da012747a02d4b29,4ee16ee838dd2703cc9a1d5a535f0ced
2,3,0,167.0,1.0,1,1.0,,,1.0,46e33861f921c3e38b81998fbf283f01,572d60f0f5212aacda515ebf81fb0a3a,8a623b960557e87bd1f4edf71b6255be,ab9480fd72a44d51690ab16c4ad4d49c
3,4,0,144.0,1.0,0,1.0,,,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,52c9ea83f2cfe92be54cb6bc961edf21,1cc3e1838bb0fd0fde0396130b1f82b9
4,5,0,41.0,1.0,1,,,,,43875109d1dab93592812c50d18270a7,75617bb07a2785a948ab1958909211f1,9ea5a911019b66ccd42f556c42a2fe2f,be1b876af18afc4deeb3081591d2a910
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13801,13802,0,10.0,1.0,1,,,,,c312e07b7a5d456d53a5b00910a336e1,558ac1909f0318b82c621ab250329d6d,80fb3c5ad0c89931d0923e9f80885218,5eb30820716082c720836733d73c605e
13802,13803,0,,,0,1.0,,,1.0,dc414a17890cfc17d011d5038b88ca93,d78f53d0856617bc782f02c3280dfef2,e5cfcc0a43c82072aca11628ff0add53,20ad8785a30f125bee5a8a325782ab06
13803,13804,0,80.0,1.0,1,,1.0,,1.0,d86102dd9c232bade9a97dccad40df48,7d76599bd27ff9e7823b2b1323ca763e,c5fe8848b6ab39b040cdb3668aea9433,b3eab50ccf6ffb51c37d36ee384abfbf
13804,13805,0,8.0,1.0,1,,,,1.0,3300cf6f774b7c6a5807110f244cbc21,689cf8289e7ea0b2eef1b017dcdfe8de,8b712435430a6875839a6c3b5a40b008,2b4165444a777465576b25f65697d739


In [24]:
# train_log_df と test_log_df を結合
log_df = pd.concat([train_log_df, test_log_df], axis=0, ignore_index=True)
# yado_df と結合
merged_df = log_df.merge(yado_df, on='yad_no', how='left')


# 各宿の人気度を計算（例：訪問回数で計算）
yad_popularity = merged_df['yad_no'].value_counts().reset_index()
yad_popularity.columns = ['yad_no', 'popularity']

# 全体での人気度ランキング
yad_popularity['overall_rank'] = yad_popularity['popularity'].rank(ascending=False)
# yado_df に人気度をマージ
yado_df = yado_df.merge(yad_popularity, on='yad_no', how='left')
# エリアごとの人気度ランキングを計算
for area in ['wid_cd', 'ken_cd', 'lrg_cd', 'sml_cd']:
    yado_df[f'{area}_rank'] = yado_df.groupby(area)['popularity'].rank(ascending=False, method='min')

# 結果の確認
print(yado_df.head())

   yad_no  yad_type  total_room_cnt  wireless_lan_flg  onsen_flg  kd_stn_5min  \
0       1         0           129.0               1.0          0          1.0   
1       2         0            23.0               1.0          0          NaN   
2       3         0           167.0               1.0          1          1.0   
3       4         0           144.0               1.0          0          1.0   
4       5         0            41.0               1.0          1          NaN   

   kd_bch_5min  kd_slp_5min  kd_conv_walk_5min  \
0          NaN          NaN                1.0   
1          NaN          NaN                NaN   
2          NaN          NaN                1.0   
3          NaN          NaN                1.0   
4          NaN          NaN                NaN   

                             wid_cd                            ken_cd  \
0  f0112abf369fb03cdc5f5309300913da  072c85e1653e10c9c7dd065ad007125a   
1  d86102dd9c232bade9a97dccad40df48  b4d2fb4e51ea7bca80eb1270aa474

### seen_yado == 0のみに絞る

In [25]:
# train_dfとtest_dfからseen_yadが1の行のみ抽出
train_df = train_df[train_df['seen_yad'] == 0].reset_index(drop=True)
test_df = test_df[test_df['seen_yad'] == 0].reset_index(drop=True) 

In [26]:
train_df

Unnamed: 0,session_id,yado_no,target,seq_no,seen_yad,max_seq_no,diff_seq_no,is_odd,multiple_visits_flag
0,000007603d533d30453cc45d0f3d119f,11882,0,-1,0,0,-1,-1,0
1,000007603d533d30453cc45d0f3d119f,2808,0,-1,0,0,-1,-1,0
2,000007603d533d30453cc45d0f3d119f,5289,0,-1,0,0,-1,-1,0
3,000007603d533d30453cc45d0f3d119f,4101,1,-1,0,0,-1,-1,0
4,000007603d533d30453cc45d0f3d119f,3324,0,-1,0,0,-1,-1,0
...,...,...,...,...,...,...,...,...,...
5664502,fffffa7baf370083ebcdd98f26a7e31a,11919,0,-1,0,1,-1,-1,0
5664503,fffffa7baf370083ebcdd98f26a7e31a,11984,0,-1,0,1,-1,-1,0
5664504,fffffa7baf370083ebcdd98f26a7e31a,2981,0,-1,0,1,-1,-1,0
5664505,fffffa7baf370083ebcdd98f26a7e31a,8609,0,-1,0,1,-1,-1,0


In [27]:
test_df

Unnamed: 0,session_id,yado_no,target,seq_no,seen_yad,max_seq_no,diff_seq_no,is_odd,multiple_visits_flag
0,00001149e9c73985425197104712478c,11561,0,-1,0,1,-1,-1,0
1,00001149e9c73985425197104712478c,4714,0,-1,0,1,-1,-1,0
2,00001149e9c73985425197104712478c,2680,0,-1,0,1,-1,-1,0
3,00001149e9c73985425197104712478c,4420,0,-1,0,1,-1,-1,0
4,00001149e9c73985425197104712478c,5466,0,-1,0,1,-1,-1,0
...,...,...,...,...,...,...,...,...,...
3430599,ffffe984aafd6127ce8e43e3ca40c79d,385,0,-1,0,0,-1,-1,0
3430600,ffffe984aafd6127ce8e43e3ca40c79d,7690,0,-1,0,0,-1,-1,0
3430601,ffffe984aafd6127ce8e43e3ca40c79d,6091,0,-1,0,0,-1,-1,0
3430602,ffffe984aafd6127ce8e43e3ca40c79d,9631,0,-1,0,0,-1,-1,0


In [28]:
#　train_dfとtest_dfにyado_dfをマージ
train_df = pd.merge(train_df, yado_df.rename(columns={'yad_no': 'yado_no'}), on='yado_no', how='left')
test_df = pd.merge(test_df, yado_df.rename(columns={'yad_no': 'yado_no'}), on='yado_no', how='left')

In [29]:
train_df

Unnamed: 0,session_id,yado_no,target,seq_no,seen_yad,max_seq_no,diff_seq_no,is_odd,multiple_visits_flag,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,wid_cd,ken_cd,lrg_cd,sml_cd,popularity,overall_rank,wid_cd_rank,ken_cd_rank,lrg_cd_rank,sml_cd_rank
0,000007603d533d30453cc45d0f3d119f,11882,0,-1,0,0,-1,-1,0,0,113.0,1.0,0,1.0,,,1.0,dc414a17890cfc17d011d5038b88ca93,d78f53d0856617bc782f02c3280dfef2,4fd631b15116098340cdb099c86a5a40,4044dac1931ddaa5a967e09506d76343,23.0,7189.5,891.0,286.0,25.0,21.0
1,000007603d533d30453cc45d0f3d119f,2808,0,-1,0,0,-1,-1,0,0,128.0,1.0,0,1.0,,,,dc414a17890cfc17d011d5038b88ca93,d78f53d0856617bc782f02c3280dfef2,4fd631b15116098340cdb099c86a5a40,4044dac1931ddaa5a967e09506d76343,36.0,5594.5,687.0,229.0,17.0,17.0
2,000007603d533d30453cc45d0f3d119f,5289,0,-1,0,0,-1,-1,0,0,66.0,1.0,0,1.0,,,1.0,dc414a17890cfc17d011d5038b88ca93,d78f53d0856617bc782f02c3280dfef2,4fd631b15116098340cdb099c86a5a40,4044dac1931ddaa5a967e09506d76343,14.0,8690.5,1073.0,334.0,37.0,33.0
3,000007603d533d30453cc45d0f3d119f,4101,1,-1,0,0,-1,-1,0,0,39.0,,0,,,,1.0,dc414a17890cfc17d011d5038b88ca93,d78f53d0856617bc782f02c3280dfef2,4fd631b15116098340cdb099c86a5a40,4044dac1931ddaa5a967e09506d76343,24.0,7045.0,874.0,280.0,23.0,19.0
4,000007603d533d30453cc45d0f3d119f,3324,0,-1,0,0,-1,-1,0,0,53.0,1.0,0,,,,1.0,dc414a17890cfc17d011d5038b88ca93,d78f53d0856617bc782f02c3280dfef2,4fd631b15116098340cdb099c86a5a40,4044dac1931ddaa5a967e09506d76343,15.0,8495.0,1053.0,330.0,35.0,31.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5664502,fffffa7baf370083ebcdd98f26a7e31a,11919,0,-1,0,1,-1,-1,0,0,71.0,1.0,0,1.0,,,1.0,46e33861f921c3e38b81998fbf283f01,572d60f0f5212aacda515ebf81fb0a3a,8a623b960557e87bd1f4edf71b6255be,1d9f09b9e2bd43cebc9885a46388739a,214.0,353.5,116.0,31.0,30.0,24.0
5664503,fffffa7baf370083ebcdd98f26a7e31a,11984,0,-1,0,1,-1,-1,0,0,224.0,1.0,0,1.0,,,1.0,46e33861f921c3e38b81998fbf283f01,572d60f0f5212aacda515ebf81fb0a3a,2e63024b11908f3729510051a6fc7d9e,d075eb4a9669452b8f07cfc0d13a03ab,161.0,714.5,196.0,44.0,4.0,4.0
5664504,fffffa7baf370083ebcdd98f26a7e31a,2981,0,-1,0,1,-1,-1,0,0,180.0,,0,1.0,,,1.0,46e33861f921c3e38b81998fbf283f01,572d60f0f5212aacda515ebf81fb0a3a,8a623b960557e87bd1f4edf71b6255be,ab9480fd72a44d51690ab16c4ad4d49c,41.0,5121.5,1094.0,166.0,60.0,13.0
5664505,fffffa7baf370083ebcdd98f26a7e31a,8609,0,-1,0,1,-1,-1,0,0,179.0,1.0,0,1.0,,,,46e33861f921c3e38b81998fbf283f01,572d60f0f5212aacda515ebf81fb0a3a,8a623b960557e87bd1f4edf71b6255be,ab9480fd72a44d51690ab16c4ad4d49c,28.0,6481.5,1316.0,195.0,67.0,17.0


#### 各種Encoding

In [30]:
# カテゴリ変数と数値変数の明確化
cat_cols = ['wid_cd', 'ken_cd', 'lrg_cd', 'sml_cd']

# 欠損値を-1で埋める
train_df.fillna(-1, inplace=True)
test_df.fillna(-1, inplace=True)

In [31]:
# Label Encoding

# testにしか存在しないものはNullにするので、trainのみをencoderにする
for col in cat_cols:
    encoder = {c: i for i, c in enumerate(train_df[col].unique())}
    train_df[f'label_{col}'] = train_df[col].map(encoder)
    test_df[f'label_{col}'] = test_df[col].map(encoder)

In [32]:
# Count Encoding（trainとtestをマージした方が良い？）←　した
# 増やした方が良い？

cat_cols = ['wid_cd', 'ken_cd', 'lrg_cd', 'sml_cd'] + ['yado_no']

# testにしか存在しないものはNullにするので、trainのみをencoderにする
for col in cat_cols:
    encoder = train_df[col].value_counts()
    train_df[f'count_{col}'] = train_df[col].map(encoder)
    test_df[f'count_{col}'] = test_df[col].map(encoder)

In [33]:
# Target encoding
# 元の列順が保持されないから

cat_cols = ['wid_cd', 'ken_cd', 'lrg_cd', 'sml_cd'] + ['yado_no']

# GroupKFold の設定  StratifiedGrouoKFoldでもいいかもしれない
gkf = GroupKFold(n_splits=5)

# fold 列を初期化
train_df['fold'] = -1

# 各 Fold に対してインデックスを割り当て
for fold, (train_idx, val_idx) in enumerate(gkf.split(train_df, groups=train_df['session_id'])):
    # バリデーションセットのインデックスに Fold 番号を割り当て
    train_df.loc[val_idx, 'fold'] = fold
# 結果の確認
print(train_df.head())



# TargetEncodingが完了した検証用データセットを格納するリスト
encoded_dfs = []

# すべての分割についてのループ
for fold in range(5):

  # 学習用と検証用データセットに分割する
  df_train = train_df[train_df.fold != fold].reset_index(drop=True)
  df_valid = train_df[train_df.fold == fold].reset_index(drop=True)
  
  # すべてのカテゴリについてのループ
  for column in cat_cols:
    # カテゴリごとの目的変数の統計量についての辞書を作成
    stats = df_train.groupby(column)['target'].agg(['mean', 'var'])
    
    # 平均、分散、最大値、最小値に対する列を作成
    for stat in ['mean', 'var']:
      mapping_dict = dict(stats[stat])
      df_valid.loc[:, f'TE_{column}_{stat}'] = df_valid[column].map(mapping_dict)

  # リストに格納
  encoded_dfs.append(df_valid)

# 結合したデータセットを返す
encoded_df = pd.concat(encoded_dfs, axis=0)


                         session_id  yado_no  target  seq_no  seen_yad  \
0  000007603d533d30453cc45d0f3d119f    11882       0      -1         0   
1  000007603d533d30453cc45d0f3d119f     2808       0      -1         0   
2  000007603d533d30453cc45d0f3d119f     5289       0      -1         0   
3  000007603d533d30453cc45d0f3d119f     4101       1      -1         0   
4  000007603d533d30453cc45d0f3d119f     3324       0      -1         0   

   max_seq_no  diff_seq_no  is_odd  multiple_visits_flag  yad_type  \
0           0           -1      -1                     0         0   
1           0           -1      -1                     0         0   
2           0           -1      -1                     0         0   
3           0           -1      -1                     0         0   
4           0           -1      -1                     0         0   

   total_room_cnt  wireless_lan_flg  onsen_flg  kd_stn_5min  kd_bch_5min  \
0           113.0               1.0          0          1.

In [34]:
train_df['fold'].value_counts()

0    1132908
1    1132908
4    1132907
3    1132892
2    1132892
Name: fold, dtype: int64

In [35]:
encoded_df['fold'].value_counts()

0    1132908
1    1132908
4    1132907
2    1132892
3    1132892
Name: fold, dtype: int64

In [36]:
train_df = encoded_df

In [37]:
# Testデータに対しても同様にTargetEncodingを行う
cat_cols = ['wid_cd', 'ken_cd', 'lrg_cd', 'sml_cd'] + ['yado_no']

# すべてのカテゴリについてのループ
# train_df全体で計算する
for column in cat_cols:
    # カテゴリごとの目的変数の統計量についての辞書を作成
    stats = train_df.groupby(column)['target'].agg(['mean', 'var'])
    
    # 平均、分散、最大値、最小値に対する列を作成
    for stat in ['mean', 'var']:
        mapping_dict = dict(stats[stat])
        test_df.loc[:, f'TE_{column}_{stat}'] = test_df[column].map(mapping_dict)


In [38]:
test_df

Unnamed: 0,session_id,yado_no,target,seq_no,seen_yad,max_seq_no,diff_seq_no,is_odd,multiple_visits_flag,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,wid_cd,ken_cd,lrg_cd,sml_cd,popularity,overall_rank,wid_cd_rank,ken_cd_rank,lrg_cd_rank,sml_cd_rank,label_wid_cd,label_ken_cd,label_lrg_cd,label_sml_cd,count_wid_cd,count_ken_cd,count_lrg_cd,count_sml_cd,count_yado_no,TE_wid_cd_mean,TE_wid_cd_var,TE_ken_cd_mean,TE_ken_cd_var,TE_lrg_cd_mean,TE_lrg_cd_var,TE_sml_cd_mean,TE_sml_cd_var,TE_yado_no_mean,TE_yado_no_var
0,00001149e9c73985425197104712478c,11561,0,-1,0,1,-1,-1,0,0,195.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,52ca3d2824fc3cc90bd4274423badeed,87d9490219b3778f73c41b8176cf30d0,195.0,446.0,138.0,59.0,5.0,1.0,5,13,109,167,1347188,739858,43373,19575,1517.0,0.023444,0.022895,0.023417,0.022868,0.025131,0.024500,0.023755,0.023192,0.024390,0.023811
1,00001149e9c73985425197104712478c,4714,0,-1,0,1,-1,-1,0,0,58.0,1.0,0,-1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,52ca3d2824fc3cc90bd4274423badeed,87d9490219b3778f73c41b8176cf30d0,83.0,2535.5,612.0,355.0,24.0,11.0,5,13,109,167,1347188,739858,43373,19575,812.0,0.023444,0.022895,0.023417,0.022868,0.025131,0.024500,0.023755,0.023192,0.023399,0.022880
2,00001149e9c73985425197104712478c,2680,0,-1,0,1,-1,-1,0,0,150.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,52ca3d2824fc3cc90bd4274423badeed,87d9490219b3778f73c41b8176cf30d0,137.0,1056.0,282.0,140.0,11.0,4.0,5,13,109,167,1347188,739858,43373,19575,1320.0,0.023444,0.022895,0.023417,0.022868,0.025131,0.024500,0.023755,0.023192,0.032576,0.031538
3,00001149e9c73985425197104712478c,4420,0,-1,0,1,-1,-1,0,0,124.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,52ca3d2824fc3cc90bd4274423badeed,87d9490219b3778f73c41b8176cf30d0,67.0,3274.0,760.0,444.0,29.0,16.0,5,13,109,167,1347188,739858,43373,19575,610.0,0.023444,0.022895,0.023417,0.022868,0.025131,0.024500,0.023755,0.023192,0.027869,0.027137
4,00001149e9c73985425197104712478c,5466,0,-1,0,1,-1,-1,0,0,130.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,52ca3d2824fc3cc90bd4274423badeed,87d9490219b3778f73c41b8176cf30d0,126.0,1266.5,334.0,174.0,14.0,5.0,5,13,109,167,1347188,739858,43373,19575,1524.0,0.023444,0.022895,0.023417,0.022868,0.025131,0.024500,0.023755,0.023192,0.025591,0.024952
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3430599,ffffe984aafd6127ce8e43e3ca40c79d,385,0,-1,0,0,-1,-1,0,0,2384.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,3a6cd37aa9e38fd96d9dafc2615643d0,2ffd60eb648dbbaa1d1ba33644813c44,826.0,14.0,10.0,1.0,1.0,1.0,5,13,16,22,1347188,739858,116820,62449,6404.0,0.023444,0.022895,0.023417,0.022868,0.019072,0.018708,0.018911,0.018554,0.021861,0.021387
3430600,ffffe984aafd6127ce8e43e3ca40c79d,7690,0,-1,0,0,-1,-1,0,0,465.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,c9d5e891463e5389c42d16f987ed30bd,7cf2b4f31fb20747f89e58981e6d9fc1,107.0,1723.5,455.0,245.0,18.0,5.0,5,13,61,151,1347188,739858,68231,25883,1247.0,0.023444,0.022895,0.023417,0.022868,0.030411,0.029487,0.030638,0.029700,0.028067,0.027301
3430601,ffffe984aafd6127ce8e43e3ca40c79d,6091,0,-1,0,0,-1,-1,0,0,603.0,1.0,1,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,3a6cd37aa9e38fd96d9dafc2615643d0,4683b842facadc1ad7161f72220f6a3e,162.0,703.0,194.0,92.0,17.0,4.0,5,13,16,21,1347188,739858,116820,30110,1773.0,0.023444,0.022895,0.023417,0.022868,0.019072,0.018708,0.020359,0.019945,0.015228,0.015005
3430602,ffffe984aafd6127ce8e43e3ca40c79d,9631,0,-1,0,0,-1,-1,0,0,394.0,1.0,0,-1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,52c9ea83f2cfe92be54cb6bc961edf21,1cc3e1838bb0fd0fde0396130b1f82b9,176.0,576.0,165.0,74.0,2.0,2.0,5,13,15,19,1347188,739858,21332,13168,1438.0,0.023444,0.022895,0.023417,0.022868,0.029346,0.028486,0.025440,0.024795,0.011822,0.011690


In [39]:
all_features = train_df.columns.to_list()
features= [x for x in all_features if x not in ('session_id', 'fold', 'target', 'wid_cd', 'ken_cd', 'lrg_cd', 'sml_cd')]

In [40]:
train_df[features]

Unnamed: 0,yado_no,seq_no,seen_yad,max_seq_no,diff_seq_no,is_odd,multiple_visits_flag,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,popularity,overall_rank,wid_cd_rank,ken_cd_rank,lrg_cd_rank,sml_cd_rank,label_wid_cd,label_ken_cd,label_lrg_cd,label_sml_cd,count_wid_cd,count_ken_cd,count_lrg_cd,count_sml_cd,count_yado_no,TE_wid_cd_mean,TE_wid_cd_var,TE_ken_cd_mean,TE_ken_cd_var,TE_lrg_cd_mean,TE_lrg_cd_var,TE_sml_cd_mean,TE_sml_cd_var,TE_yado_no_mean,TE_yado_no_var
0,3894,-1,0,1,-1,-1,0,0,205.0,1.0,0,-1.0,-1.0,-1.0,1.0,212.0,364.0,40.0,3.0,1.0,1.0,9,14,18,24,652167,69867,27167,26722,1089,0.025613,0.024957,0.033144,0.032046,0.031089,0.030124,0.030213,0.029302,0.034130,0.033002
1,7749,-1,0,1,-1,-1,0,0,76.0,1.0,0,-1.0,-1.0,-1.0,1.0,150.0,845.5,113.0,10.0,6.0,6.0,9,14,18,24,652167,69867,27167,26722,1291,0.025613,0.024957,0.033144,0.032046,0.031089,0.030124,0.030213,0.029302,0.033365,0.032283
2,902,-1,0,1,-1,-1,0,0,240.0,1.0,0,-1.0,-1.0,-1.0,1.0,154.0,785.5,103.0,9.0,5.0,5.0,9,14,18,24,652167,69867,27167,26722,899,0.025613,0.024957,0.033144,0.032046,0.031089,0.030124,0.030213,0.029302,0.037344,0.036000
3,11380,-1,0,1,-1,-1,0,0,111.0,1.0,0,1.0,-1.0,-1.0,1.0,134.0,1114.0,143.0,16.0,12.0,12.0,9,14,18,24,652167,69867,27167,26722,1174,0.025613,0.024957,0.033144,0.032046,0.031089,0.030124,0.030213,0.029302,0.029536,0.028694
4,5490,-1,0,1,-1,-1,0,0,116.0,1.0,0,-1.0,-1.0,-1.0,1.0,143.0,945.0,126.0,13.0,9.0,9.0,9,14,18,24,652167,69867,27167,26722,1184,0.025613,0.024957,0.033144,0.032046,0.031089,0.030124,0.030213,0.029302,0.034519,0.033362
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1132902,13194,-1,0,2,-1,-1,0,0,44.0,1.0,0,-1.0,-1.0,-1.0,-1.0,49.0,4433.0,517.0,65.0,26.0,19.0,9,29,172,281,652167,75416,22039,18112,765,0.025665,0.025006,0.028735,0.027909,0.028974,0.028136,0.027932,0.027153,0.019449,0.019102
1132903,10557,-1,0,2,-1,-1,0,0,98.0,1.0,0,1.0,-1.0,-1.0,1.0,59.0,3745.0,857.0,500.0,97.0,25.0,5,13,17,23,1347188,739858,147673,38168,212,0.023402,0.022854,0.023384,0.022838,0.020215,0.019807,0.021140,0.020694,0.023529,0.023112
1132904,7326,-1,0,2,-1,-1,0,0,98.0,1.0,1,-1.0,-1.0,-1.0,-1.0,53.0,4141.0,561.0,251.0,13.0,8.0,4,12,174,284,867127,410661,16265,8258,305,0.023467,0.022916,0.021832,0.021356,0.037252,0.035867,0.037175,0.035798,0.021097,0.020739
1132905,8033,-1,0,2,-1,-1,0,0,100.0,1.0,0,1.0,-1.0,-1.0,1.0,54.0,4074.0,907.0,531.0,24.0,13.0,5,13,15,153,1347188,739858,21332,8164,497,0.023402,0.022854,0.023384,0.022838,0.029605,0.028731,0.035905,0.034621,0.028947,0.028184


In [41]:
test_df[features]

Unnamed: 0,yado_no,seq_no,seen_yad,max_seq_no,diff_seq_no,is_odd,multiple_visits_flag,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,popularity,overall_rank,wid_cd_rank,ken_cd_rank,lrg_cd_rank,sml_cd_rank,label_wid_cd,label_ken_cd,label_lrg_cd,label_sml_cd,count_wid_cd,count_ken_cd,count_lrg_cd,count_sml_cd,count_yado_no,TE_wid_cd_mean,TE_wid_cd_var,TE_ken_cd_mean,TE_ken_cd_var,TE_lrg_cd_mean,TE_lrg_cd_var,TE_sml_cd_mean,TE_sml_cd_var,TE_yado_no_mean,TE_yado_no_var
0,11561,-1,0,1,-1,-1,0,0,195.0,1.0,0,1.0,-1.0,-1.0,1.0,195.0,446.0,138.0,59.0,5.0,1.0,5,13,109,167,1347188,739858,43373,19575,1517.0,0.023444,0.022895,0.023417,0.022868,0.025131,0.024500,0.023755,0.023192,0.024390,0.023811
1,4714,-1,0,1,-1,-1,0,0,58.0,1.0,0,-1.0,-1.0,-1.0,1.0,83.0,2535.5,612.0,355.0,24.0,11.0,5,13,109,167,1347188,739858,43373,19575,812.0,0.023444,0.022895,0.023417,0.022868,0.025131,0.024500,0.023755,0.023192,0.023399,0.022880
2,2680,-1,0,1,-1,-1,0,0,150.0,1.0,0,1.0,-1.0,-1.0,1.0,137.0,1056.0,282.0,140.0,11.0,4.0,5,13,109,167,1347188,739858,43373,19575,1320.0,0.023444,0.022895,0.023417,0.022868,0.025131,0.024500,0.023755,0.023192,0.032576,0.031538
3,4420,-1,0,1,-1,-1,0,0,124.0,1.0,0,1.0,-1.0,-1.0,1.0,67.0,3274.0,760.0,444.0,29.0,16.0,5,13,109,167,1347188,739858,43373,19575,610.0,0.023444,0.022895,0.023417,0.022868,0.025131,0.024500,0.023755,0.023192,0.027869,0.027137
4,5466,-1,0,1,-1,-1,0,0,130.0,1.0,0,1.0,-1.0,-1.0,1.0,126.0,1266.5,334.0,174.0,14.0,5.0,5,13,109,167,1347188,739858,43373,19575,1524.0,0.023444,0.022895,0.023417,0.022868,0.025131,0.024500,0.023755,0.023192,0.025591,0.024952
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3430599,385,-1,0,0,-1,-1,0,0,2384.0,1.0,0,1.0,-1.0,-1.0,1.0,826.0,14.0,10.0,1.0,1.0,1.0,5,13,16,22,1347188,739858,116820,62449,6404.0,0.023444,0.022895,0.023417,0.022868,0.019072,0.018708,0.018911,0.018554,0.021861,0.021387
3430600,7690,-1,0,0,-1,-1,0,0,465.0,1.0,0,1.0,-1.0,-1.0,1.0,107.0,1723.5,455.0,245.0,18.0,5.0,5,13,61,151,1347188,739858,68231,25883,1247.0,0.023444,0.022895,0.023417,0.022868,0.030411,0.029487,0.030638,0.029700,0.028067,0.027301
3430601,6091,-1,0,0,-1,-1,0,0,603.0,1.0,1,1.0,-1.0,-1.0,1.0,162.0,703.0,194.0,92.0,17.0,4.0,5,13,16,21,1347188,739858,116820,30110,1773.0,0.023444,0.022895,0.023417,0.022868,0.019072,0.018708,0.020359,0.019945,0.015228,0.015005
3430602,9631,-1,0,0,-1,-1,0,0,394.0,1.0,0,-1.0,-1.0,-1.0,1.0,176.0,576.0,165.0,74.0,2.0,2.0,5,13,15,19,1347188,739858,21332,13168,1438.0,0.023444,0.022895,0.023417,0.022868,0.029346,0.028486,0.025440,0.024795,0.011822,0.011690


In [42]:
train_df.columns.to_list()

['session_id',
 'yado_no',
 'target',
 'seq_no',
 'seen_yad',
 'max_seq_no',
 'diff_seq_no',
 'is_odd',
 'multiple_visits_flag',
 'yad_type',
 'total_room_cnt',
 'wireless_lan_flg',
 'onsen_flg',
 'kd_stn_5min',
 'kd_bch_5min',
 'kd_slp_5min',
 'kd_conv_walk_5min',
 'wid_cd',
 'ken_cd',
 'lrg_cd',
 'sml_cd',
 'popularity',
 'overall_rank',
 'wid_cd_rank',
 'ken_cd_rank',
 'lrg_cd_rank',
 'sml_cd_rank',
 'label_wid_cd',
 'label_ken_cd',
 'label_lrg_cd',
 'label_sml_cd',
 'count_wid_cd',
 'count_ken_cd',
 'count_lrg_cd',
 'count_sml_cd',
 'count_yado_no',
 'fold',
 'TE_wid_cd_mean',
 'TE_wid_cd_var',
 'TE_ken_cd_mean',
 'TE_ken_cd_var',
 'TE_lrg_cd_mean',
 'TE_lrg_cd_var',
 'TE_sml_cd_mean',
 'TE_sml_cd_var',
 'TE_yado_no_mean',
 'TE_yado_no_var']

In [43]:
import gc
gc.collect()

105

In [44]:
test_df

Unnamed: 0,session_id,yado_no,target,seq_no,seen_yad,max_seq_no,diff_seq_no,is_odd,multiple_visits_flag,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,wid_cd,ken_cd,lrg_cd,sml_cd,popularity,overall_rank,wid_cd_rank,ken_cd_rank,lrg_cd_rank,sml_cd_rank,label_wid_cd,label_ken_cd,label_lrg_cd,label_sml_cd,count_wid_cd,count_ken_cd,count_lrg_cd,count_sml_cd,count_yado_no,TE_wid_cd_mean,TE_wid_cd_var,TE_ken_cd_mean,TE_ken_cd_var,TE_lrg_cd_mean,TE_lrg_cd_var,TE_sml_cd_mean,TE_sml_cd_var,TE_yado_no_mean,TE_yado_no_var
0,00001149e9c73985425197104712478c,11561,0,-1,0,1,-1,-1,0,0,195.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,52ca3d2824fc3cc90bd4274423badeed,87d9490219b3778f73c41b8176cf30d0,195.0,446.0,138.0,59.0,5.0,1.0,5,13,109,167,1347188,739858,43373,19575,1517.0,0.023444,0.022895,0.023417,0.022868,0.025131,0.024500,0.023755,0.023192,0.024390,0.023811
1,00001149e9c73985425197104712478c,4714,0,-1,0,1,-1,-1,0,0,58.0,1.0,0,-1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,52ca3d2824fc3cc90bd4274423badeed,87d9490219b3778f73c41b8176cf30d0,83.0,2535.5,612.0,355.0,24.0,11.0,5,13,109,167,1347188,739858,43373,19575,812.0,0.023444,0.022895,0.023417,0.022868,0.025131,0.024500,0.023755,0.023192,0.023399,0.022880
2,00001149e9c73985425197104712478c,2680,0,-1,0,1,-1,-1,0,0,150.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,52ca3d2824fc3cc90bd4274423badeed,87d9490219b3778f73c41b8176cf30d0,137.0,1056.0,282.0,140.0,11.0,4.0,5,13,109,167,1347188,739858,43373,19575,1320.0,0.023444,0.022895,0.023417,0.022868,0.025131,0.024500,0.023755,0.023192,0.032576,0.031538
3,00001149e9c73985425197104712478c,4420,0,-1,0,1,-1,-1,0,0,124.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,52ca3d2824fc3cc90bd4274423badeed,87d9490219b3778f73c41b8176cf30d0,67.0,3274.0,760.0,444.0,29.0,16.0,5,13,109,167,1347188,739858,43373,19575,610.0,0.023444,0.022895,0.023417,0.022868,0.025131,0.024500,0.023755,0.023192,0.027869,0.027137
4,00001149e9c73985425197104712478c,5466,0,-1,0,1,-1,-1,0,0,130.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,52ca3d2824fc3cc90bd4274423badeed,87d9490219b3778f73c41b8176cf30d0,126.0,1266.5,334.0,174.0,14.0,5.0,5,13,109,167,1347188,739858,43373,19575,1524.0,0.023444,0.022895,0.023417,0.022868,0.025131,0.024500,0.023755,0.023192,0.025591,0.024952
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3430599,ffffe984aafd6127ce8e43e3ca40c79d,385,0,-1,0,0,-1,-1,0,0,2384.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,3a6cd37aa9e38fd96d9dafc2615643d0,2ffd60eb648dbbaa1d1ba33644813c44,826.0,14.0,10.0,1.0,1.0,1.0,5,13,16,22,1347188,739858,116820,62449,6404.0,0.023444,0.022895,0.023417,0.022868,0.019072,0.018708,0.018911,0.018554,0.021861,0.021387
3430600,ffffe984aafd6127ce8e43e3ca40c79d,7690,0,-1,0,0,-1,-1,0,0,465.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,c9d5e891463e5389c42d16f987ed30bd,7cf2b4f31fb20747f89e58981e6d9fc1,107.0,1723.5,455.0,245.0,18.0,5.0,5,13,61,151,1347188,739858,68231,25883,1247.0,0.023444,0.022895,0.023417,0.022868,0.030411,0.029487,0.030638,0.029700,0.028067,0.027301
3430601,ffffe984aafd6127ce8e43e3ca40c79d,6091,0,-1,0,0,-1,-1,0,0,603.0,1.0,1,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,3a6cd37aa9e38fd96d9dafc2615643d0,4683b842facadc1ad7161f72220f6a3e,162.0,703.0,194.0,92.0,17.0,4.0,5,13,16,21,1347188,739858,116820,30110,1773.0,0.023444,0.022895,0.023417,0.022868,0.019072,0.018708,0.020359,0.019945,0.015228,0.015005
3430602,ffffe984aafd6127ce8e43e3ca40c79d,9631,0,-1,0,0,-1,-1,0,0,394.0,1.0,0,-1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,52c9ea83f2cfe92be54cb6bc961edf21,1cc3e1838bb0fd0fde0396130b1f82b9,176.0,576.0,165.0,74.0,2.0,2.0,5,13,15,19,1347188,739858,21332,13168,1438.0,0.023444,0.022895,0.023417,0.022868,0.029346,0.028486,0.025440,0.024795,0.011822,0.011690


In [45]:
train_df

Unnamed: 0,session_id,yado_no,target,seq_no,seen_yad,max_seq_no,diff_seq_no,is_odd,multiple_visits_flag,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,wid_cd,ken_cd,lrg_cd,sml_cd,popularity,overall_rank,wid_cd_rank,ken_cd_rank,lrg_cd_rank,sml_cd_rank,label_wid_cd,label_ken_cd,label_lrg_cd,label_sml_cd,count_wid_cd,count_ken_cd,count_lrg_cd,count_sml_cd,count_yado_no,fold,TE_wid_cd_mean,TE_wid_cd_var,TE_ken_cd_mean,TE_ken_cd_var,TE_lrg_cd_mean,TE_lrg_cd_var,TE_sml_cd_mean,TE_sml_cd_var,TE_yado_no_mean,TE_yado_no_var
0,000104bdffaaad1a1e0a9ebacf585f33,3894,0,-1,0,1,-1,-1,0,0,205.0,1.0,0,-1.0,-1.0,-1.0,1.0,e9316013ee1b03f4525fe361c46ce9c5,84efa50e52f9b471c95bfc3b21b854ad,a1370d90ed3b80ee41311bbbab46aec9,d72674f02c5340d90f245e3177727650,212.0,364.0,40.0,3.0,1.0,1.0,9,14,18,24,652167,69867,27167,26722,1089,0,0.025613,0.024957,0.033144,0.032046,0.031089,0.030124,0.030213,0.029302,0.034130,0.033002
1,000104bdffaaad1a1e0a9ebacf585f33,7749,0,-1,0,1,-1,-1,0,0,76.0,1.0,0,-1.0,-1.0,-1.0,1.0,e9316013ee1b03f4525fe361c46ce9c5,84efa50e52f9b471c95bfc3b21b854ad,a1370d90ed3b80ee41311bbbab46aec9,d72674f02c5340d90f245e3177727650,150.0,845.5,113.0,10.0,6.0,6.0,9,14,18,24,652167,69867,27167,26722,1291,0,0.025613,0.024957,0.033144,0.032046,0.031089,0.030124,0.030213,0.029302,0.033365,0.032283
2,000104bdffaaad1a1e0a9ebacf585f33,902,0,-1,0,1,-1,-1,0,0,240.0,1.0,0,-1.0,-1.0,-1.0,1.0,e9316013ee1b03f4525fe361c46ce9c5,84efa50e52f9b471c95bfc3b21b854ad,a1370d90ed3b80ee41311bbbab46aec9,d72674f02c5340d90f245e3177727650,154.0,785.5,103.0,9.0,5.0,5.0,9,14,18,24,652167,69867,27167,26722,899,0,0.025613,0.024957,0.033144,0.032046,0.031089,0.030124,0.030213,0.029302,0.037344,0.036000
3,000104bdffaaad1a1e0a9ebacf585f33,11380,0,-1,0,1,-1,-1,0,0,111.0,1.0,0,1.0,-1.0,-1.0,1.0,e9316013ee1b03f4525fe361c46ce9c5,84efa50e52f9b471c95bfc3b21b854ad,a1370d90ed3b80ee41311bbbab46aec9,d72674f02c5340d90f245e3177727650,134.0,1114.0,143.0,16.0,12.0,12.0,9,14,18,24,652167,69867,27167,26722,1174,0,0.025613,0.024957,0.033144,0.032046,0.031089,0.030124,0.030213,0.029302,0.029536,0.028694
4,000104bdffaaad1a1e0a9ebacf585f33,5490,0,-1,0,1,-1,-1,0,0,116.0,1.0,0,-1.0,-1.0,-1.0,1.0,e9316013ee1b03f4525fe361c46ce9c5,84efa50e52f9b471c95bfc3b21b854ad,a1370d90ed3b80ee41311bbbab46aec9,d72674f02c5340d90f245e3177727650,143.0,945.0,126.0,13.0,9.0,9.0,9,14,18,24,652167,69867,27167,26722,1184,0,0.025613,0.024957,0.033144,0.032046,0.031089,0.030124,0.030213,0.029302,0.034519,0.033362
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1132902,fffe8a472ae6a96c9da05a30ac3ed6c5,13194,0,-1,0,2,-1,-1,0,0,44.0,1.0,0,-1.0,-1.0,-1.0,-1.0,e9316013ee1b03f4525fe361c46ce9c5,517061b8165aa6370d9025951a64aa52,7e5ebb4d5e3cdfd3ad3798c9864d87d6,840f2157ec5bb4f5501a3ace2f4ef8d1,49.0,4433.0,517.0,65.0,26.0,19.0,9,29,172,281,652167,75416,22039,18112,765,4,0.025665,0.025006,0.028735,0.027909,0.028974,0.028136,0.027932,0.027153,0.019449,0.019102
1132903,fffe8a472ae6a96c9da05a30ac3ed6c5,10557,0,-1,0,2,-1,-1,0,0,98.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,e2034d4f2fbe0874f8ac58f6f111f182,c9807ee4ac6e452876bfc81fcb309b7c,59.0,3745.0,857.0,500.0,97.0,25.0,5,13,17,23,1347188,739858,147673,38168,212,4,0.023402,0.022854,0.023384,0.022838,0.020215,0.019807,0.021140,0.020694,0.023529,0.023112
1132904,fffe8a472ae6a96c9da05a30ac3ed6c5,7326,0,-1,0,2,-1,-1,0,0,98.0,1.0,1,-1.0,-1.0,-1.0,-1.0,f0112abf369fb03cdc5f5309300913da,072c85e1653e10c9c7dd065ad007125a,74dbf7abd84e4b61db680d83cc6d338f,055193099f80ad30c78cd614d7dcea5a,53.0,4141.0,561.0,251.0,13.0,8.0,4,12,174,284,867127,410661,16265,8258,305,4,0.023467,0.022916,0.021832,0.021356,0.037252,0.035867,0.037175,0.035798,0.021097,0.020739
1132905,fffe8a472ae6a96c9da05a30ac3ed6c5,8033,0,-1,0,2,-1,-1,0,0,100.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,52c9ea83f2cfe92be54cb6bc961edf21,8bcb19738ed91018e399e139aca4d54b,54.0,4074.0,907.0,531.0,24.0,13.0,5,13,15,153,1347188,739858,21332,8164,497,4,0.023402,0.022854,0.023384,0.022838,0.029605,0.028731,0.035905,0.034621,0.028947,0.028184


In [48]:
print(train_df['seen_yad'].value_counts())
print(test_df['seen_yad'].value_counts())

0    5664507
Name: seen_yad, dtype: int64
0    3430604
Name: seen_yad, dtype: int64


In [46]:
train_df.to_parquet('../data/feature_engineering_v7_train_df.parquet', index=False)
test_df.to_parquet('../data/feature_engineering_v7_test_df.parquet', index=False)