# seen_yado用の特徴量エンジニアリング

In [20]:
import numpy as np
import pandas as pd
from tqdm import tqdm


from sklearn.model_selection import KFold
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import StratifiedKFold
from nyaggle.feature.category_encoder import TargetEncoder

from nyaggle.experiment import run_experiment

#最大表示列数の指定（ここでは50列を指定）
pd.set_option('display.max_columns', 50)
# 最大表示行数の指定（ここでは50行を指定）
pd.set_option('display.max_rows', 500)

In [2]:
train_log_df = pd.read_csv('../train_log.csv')
test_log_df = pd.read_csv('../test_log.csv')
train_label_df = pd.read_csv('../train_label.csv')
test_session_df = pd.read_csv('../test_session.csv')
yado_df = pd.read_csv('../yado.csv')
# image_embeddings_df = pd.read_parquet('../image_embeddings.parquet')
sample_submission_df = pd.read_csv('../sample_submission.csv')

candidate_train_df = pd.read_csv('../data/candidate_ver15_train.csv')
candidate_test_df = pd.read_csv('../data/candidate_ver15_test.csv')

#### Trainに正例と負例のフラグを付与

In [3]:
# 学習用データ
# 教師データに正例と負例のフラグを付与
train_target = []

# 各セッションに対して候補の宿をチェック
for index, row in tqdm(train_label_df.iterrows(), total=train_label_df.shape[0]):
    session_id = row['session_id']
    yad_no = row['yad_no']
    
    # candidate_ver7_train.csv の該当行を取得（同じインデックスを仮定）
    candidate_row = candidate_train_df.iloc[index]

    # 50個の購入候補の宿に対してチェック
    for col in candidate_train_df.columns:
        yado_no = candidate_row[col]
        is_correct = (yado_no == yad_no)  # 一致するかどうかのチェック
        train_target.append([session_id, yado_no, is_correct])

# 結果をDataFrameに変換
train_df = pd.DataFrame(train_target, columns=['session_id', 'yado_no', 'target']) #区別するためにあえてyad_noではなくyado_noとしている
train_df['target'] = train_df['target'].astype(int)  # target列をint型に変換

# 最初の数行を表示して確認
print(train_df.head())

print('正例と負例の数を確認')
print(train_df['target'].value_counts())

100%|██████████| 288698/288698 [00:42<00:00, 6849.21it/s]


                         session_id  yado_no  target
0  000007603d533d30453cc45d0f3d119f    11882       0
1  000007603d533d30453cc45d0f3d119f     2808       0
2  000007603d533d30453cc45d0f3d119f     5289       0
3  000007603d533d30453cc45d0f3d119f     4101       1
4  000007603d533d30453cc45d0f3d119f     3324       0
正例と負例の数を確認
0    2697380
1     189600
Name: target, dtype: int64


#### 推論用データの作成

In [4]:
# 推論用データにダミーのyado_noを付与
test_session_df['yad_no'] = -1

test_target = []

# 各セッションに対して候補の宿をチェック
for index, row in tqdm(test_session_df.iterrows(), total=test_session_df.shape[0]):
    session_id = row['session_id']
    yad_no = row['yad_no']
    
    # candidate_ver7_test.csv の該当行を取得（同じインデックスを仮定）
    candidate_row = candidate_test_df.iloc[index]

    # 50個の購入候補の宿に対してチェック
    for col in candidate_test_df.columns:
        yado_no = candidate_row[col]
        is_correct = (yado_no == yad_no)  # 一致するかどうかのチェック
        test_target.append([session_id, yado_no, is_correct])

# 結果をDataFrameに変換
test_df = pd.DataFrame(test_target, columns=['session_id', 'yado_no', 'target']) #区別するためにあえてyad_noではなくyado_noとしている
test_df['target'] = test_df['target'].astype(int)  # target列をint型に変換

# 最初の数行を表示して確認
print(test_df.head())

print('正例と負例の数を確認')
print(test_df['target'].value_counts()) # すべて0になるはず

100%|██████████| 174700/174700 [00:25<00:00, 6851.03it/s]


                         session_id  yado_no  target
0  00001149e9c73985425197104712478c     3560       0
1  00001149e9c73985425197104712478c    11561       0
2  00001149e9c73985425197104712478c     4714       0
3  00001149e9c73985425197104712478c     2680       0
4  00001149e9c73985425197104712478c     4420       0
正例と負例の数を確認
0    1747000
Name: target, dtype: int64


#### seq_noを追加

In [5]:
# seq_noをマージする

# train_log の session_id と yad_no の組み合わせに基づいて辞書を作成
seq_no_dict_train = dict(zip(zip(train_log_df['session_id'], train_log_df['yad_no']), train_log_df['seq_no']))
# train_df に seq_no 列を追加（tqdm で進捗表示）
train_df['seq_no'] = [seq_no_dict_train.get((row['session_id'], row['yado_no']), -1) for row in tqdm(train_df.to_dict('records'))]
# 結果の確認
print(train_df.head())


# test_log の session_id と yad_no の組み合わせに基づいて辞書を作成
seq_no_dict_test = dict(zip(zip(test_log_df['session_id'], test_log_df['yad_no']), test_log_df['seq_no']))
# test_df に seq_no 列を追加（tqdm で進捗表示）
test_df['seq_no'] = [seq_no_dict_test.get((row['session_id'], row['yado_no']), -1) for row in tqdm(test_df.to_dict('records'))]
# 結果の確認
print(test_df.head())

100%|██████████| 2886980/2886980 [00:01<00:00, 2446915.43it/s]


                         session_id  yado_no  target  seq_no
0  000007603d533d30453cc45d0f3d119f    11882       0      -1
1  000007603d533d30453cc45d0f3d119f     2808       0      -1
2  000007603d533d30453cc45d0f3d119f     5289       0      -1
3  000007603d533d30453cc45d0f3d119f     4101       1      -1
4  000007603d533d30453cc45d0f3d119f     3324       0      -1


100%|██████████| 1747000/1747000 [00:00<00:00, 2559585.79it/s]


                         session_id  yado_no  target  seq_no
0  00001149e9c73985425197104712478c     3560       0       0
1  00001149e9c73985425197104712478c    11561       0      -1
2  00001149e9c73985425197104712478c     4714       0      -1
3  00001149e9c73985425197104712478c     2680       0      -1
4  00001149e9c73985425197104712478c     4420       0      -1


In [21]:
train_df.head(500)

Unnamed: 0,session_id,yado_no,target,seq_no,seen_yad
0,000007603d533d30453cc45d0f3d119f,11882,0,-1,0
1,000007603d533d30453cc45d0f3d119f,2808,0,-1,0
2,000007603d533d30453cc45d0f3d119f,5289,0,-1,0
3,000007603d533d30453cc45d0f3d119f,4101,1,-1,0
4,000007603d533d30453cc45d0f3d119f,3324,0,-1,0
5,000007603d533d30453cc45d0f3d119f,12846,0,-1,0
6,000007603d533d30453cc45d0f3d119f,997,0,-1,0
7,000007603d533d30453cc45d0f3d119f,9207,0,-1,0
8,000007603d533d30453cc45d0f3d119f,9209,0,-1,0
9,000007603d533d30453cc45d0f3d119f,9208,0,-1,0


In [22]:
train_df[train_df['session_id'] == '0007dd71a9a78c567084374a66e38139']

Unnamed: 0,session_id,yado_no,target,seq_no,seen_yad
360,0007dd71a9a78c567084374a66e38139,2927,1,4,1
361,0007dd71a9a78c567084374a66e38139,6199,0,-1,0
362,0007dd71a9a78c567084374a66e38139,12089,0,-1,0
363,0007dd71a9a78c567084374a66e38139,12425,0,-1,0
364,0007dd71a9a78c567084374a66e38139,13386,0,-1,0
365,0007dd71a9a78c567084374a66e38139,11850,0,-1,0
366,0007dd71a9a78c567084374a66e38139,9137,0,-1,0
367,0007dd71a9a78c567084374a66e38139,12986,0,-1,0
368,0007dd71a9a78c567084374a66e38139,2452,0,-1,0
369,0007dd71a9a78c567084374a66e38139,2318,0,-1,0


In [23]:
train_label_df[train_label_df['session_id'] == '0007dd71a9a78c567084374a66e38139']

Unnamed: 0,session_id,yad_no
36,0007dd71a9a78c567084374a66e38139,2927


In [24]:
# 複数あると一番番号が高い4の判定になる
train_log_df[train_log_df['session_id'] == '0007dd71a9a78c567084374a66e38139']

Unnamed: 0,session_id,seq_no,yad_no
45,0007dd71a9a78c567084374a66e38139,0,2927
46,0007dd71a9a78c567084374a66e38139,1,11037
47,0007dd71a9a78c567084374a66e38139,2,2927
48,0007dd71a9a78c567084374a66e38139,3,11037
49,0007dd71a9a78c567084374a66e38139,4,2927
50,0007dd71a9a78c567084374a66e38139,5,11037


In [25]:
#　※複数のseq_noがある場合は、0も混入する
print(train_df['seq_no'].value_counts())
print(test_df['seq_no'].value_counts())

-1    2777527
 0      88128
 1      16052
 2       4125
 3        837
 4        223
 5         65
 6         18
 7          4
 8          1
Name: seq_no, dtype: int64
-1    1683604
 0      51884
 1       8692
 2       2239
 3        428
 4        124
 5         22
 6          7
Name: seq_no, dtype: int64


#### 候補の宿がsession中に閲覧されていたかどうか

In [26]:
# 'seq_no'が−1でなければ1のフラグを立てる
train_df['seen_yad'] = train_df['seq_no'].apply(lambda x: 1 if x != -1 else 0)
test_df['seen_yad'] = test_df['seq_no'].apply(lambda x: 1 if x != -1 else 0)

In [27]:
print(train_df['seen_yad'].value_counts())  
print(test_df['seen_yad'].value_counts())

0    2777527
1     109453
Name: seen_yad, dtype: int64
0    1683604
1      63396
Name: seen_yad, dtype: int64


In [28]:
train_df[train_df['seen_yad']==1].target.value_counts()

1    87693
0    21760
Name: target, dtype: int64

#### 各sessionにおける最大seq_no

In [29]:
# 各セッションの seq_no の最大値を計算
max_seq_no_train = train_log_df.groupby('session_id')['seq_no'].max().reset_index()
max_seq_no_train.rename(columns={'seq_no': 'max_seq_no'}, inplace=True)
# teacher_df と max_seq_no のマージ
train_df = train_df.merge(max_seq_no_train, on='session_id', how='left')
# 結果の確認
print(train_df.head())


# 各セッションの seq_no の最大値を計算
max_seq_no_test = test_log_df.groupby('session_id')['seq_no'].max().reset_index()
max_seq_no_test.rename(columns={'seq_no': 'max_seq_no'}, inplace=True)
# teacher_df と max_seq_no のマージ
test_df = test_df.merge(max_seq_no_test, on='session_id', how='left')
# 結果の確認
print(test_df.head())

                         session_id  yado_no  target  seq_no  seen_yad  \
0  000007603d533d30453cc45d0f3d119f    11882       0      -1         0   
1  000007603d533d30453cc45d0f3d119f     2808       0      -1         0   
2  000007603d533d30453cc45d0f3d119f     5289       0      -1         0   
3  000007603d533d30453cc45d0f3d119f     4101       1      -1         0   
4  000007603d533d30453cc45d0f3d119f     3324       0      -1         0   

   max_seq_no  
0           0  
1           0  
2           0  
3           0  
4           0  
                         session_id  yado_no  target  seq_no  seen_yad  \
0  00001149e9c73985425197104712478c     3560       0       0         1   
1  00001149e9c73985425197104712478c    11561       0      -1         0   
2  00001149e9c73985425197104712478c     4714       0      -1         0   
3  00001149e9c73985425197104712478c     2680       0      -1         0   
4  00001149e9c73985425197104712478c     4420       0      -1         0   

   max_seq_no 

In [30]:
print(train_df['max_seq_no'].value_counts())
print(test_df['max_seq_no'].value_counts())

0    1853860
1     827930
2     153500
3      40250
4       8330
5       2230
6        650
7        180
8         40
9         10
Name: max_seq_no, dtype: int64
0    1139400
1     494930
2      84590
3      22270
4       4280
5       1240
6        220
7         70
Name: max_seq_no, dtype: int64


#### 差分の考慮：max_seq_noから(-1以外の要素)でseq_noを引く

In [31]:
# 最初に全ての diff_seq_no を -1 に設定
train_df['diff_seq_no'] = -1
test_df['diff_seq_no'] = -1

# seq_no が -1 以外の行にのみ max_seq_no - seq_no の計算を適用
train_df.loc[train_df['seq_no'] != -1, 'diff_seq_no'] = train_df['max_seq_no'] - train_df['seq_no']
test_df.loc[test_df['seq_no'] != -1, 'diff_seq_no'] = test_df['max_seq_no'] - test_df['seq_no']

# 結果の確認
print(train_df.head())
print(test_df.head())

                         session_id  yado_no  target  seq_no  seen_yad  \
0  000007603d533d30453cc45d0f3d119f    11882       0      -1         0   
1  000007603d533d30453cc45d0f3d119f     2808       0      -1         0   
2  000007603d533d30453cc45d0f3d119f     5289       0      -1         0   
3  000007603d533d30453cc45d0f3d119f     4101       1      -1         0   
4  000007603d533d30453cc45d0f3d119f     3324       0      -1         0   

   max_seq_no  diff_seq_no  
0           0           -1  
1           0           -1  
2           0           -1  
3           0           -1  
4           0           -1  
                         session_id  yado_no  target  seq_no  seen_yad  \
0  00001149e9c73985425197104712478c     3560       0       0         1   
1  00001149e9c73985425197104712478c    11561       0      -1         0   
2  00001149e9c73985425197104712478c     4714       0      -1         0   
3  00001149e9c73985425197104712478c     2680       0      -1         0   
4  00001149

In [32]:
print(train_df['diff_seq_no'].value_counts())
print(test_df['diff_seq_no'].value_counts())

-1    2777527
 1     103312
 2       5684
 3        407
 4         49
 5          1
Name: diff_seq_no, dtype: int64
-1    1683604
 1      60760
 2       2528
 3        106
 4          2
Name: diff_seq_no, dtype: int64


#### diff_seq_no が奇数かどうかの判定 

In [33]:
# diff_seq_no が奇数かどうかの判定（seq_no が -1 の場合は除外）
train_df['is_odd'] = np.where(train_df['seq_no'] != -1, train_df['diff_seq_no'] % 2 == 1, -1)
test_df['is_odd'] = np.where(test_df['seq_no'] != -1, test_df['diff_seq_no'] % 2 == 1, -1)

# seq_no が -1 以外の場合、Trueを0、Falseを1に変換（すでに -1 の場合は変更しない）
train_df['is_odd'] = np.where(train_df['is_odd'] != -1, np.where(train_df['is_odd'], 0, 1), -1)
test_df['is_odd'] = np.where(test_df['is_odd'] != -1, np.where(test_df['is_odd'], 0, 1), -1)

# 結果の確認
print(train_df.head())
print(test_df.head())

                         session_id  yado_no  target  seq_no  seen_yad  \
0  000007603d533d30453cc45d0f3d119f    11882       0      -1         0   
1  000007603d533d30453cc45d0f3d119f     2808       0      -1         0   
2  000007603d533d30453cc45d0f3d119f     5289       0      -1         0   
3  000007603d533d30453cc45d0f3d119f     4101       1      -1         0   
4  000007603d533d30453cc45d0f3d119f     3324       0      -1         0   

   max_seq_no  diff_seq_no  is_odd  
0           0           -1      -1  
1           0           -1      -1  
2           0           -1      -1  
3           0           -1      -1  
4           0           -1      -1  
                         session_id  yado_no  target  seq_no  seen_yad  \
0  00001149e9c73985425197104712478c     3560       0       0         1   
1  00001149e9c73985425197104712478c    11561       0      -1         0   
2  00001149e9c73985425197104712478c     4714       0      -1         0   
3  00001149e9c73985425197104712478c  

In [34]:
print(train_df['is_odd'].value_counts())
print(test_df['is_odd'].value_counts())

-1    2777527
 0     103720
 1       5733
Name: is_odd, dtype: int64
-1    1683604
 0      60866
 1       2530
Name: is_odd, dtype: int64


#### 各sessionにおいて2回以上出現したyad_noがあれば1のフラグを立てる

In [35]:
# 各セッションにおいて2回以上登場する yad_no を特定
yad_no_counts = train_log_df.groupby(['session_id', 'yad_no']).size().reset_index(name='count')
multiple_visits = yad_no_counts[yad_no_counts['count'] > 1]

# (session_id, yad_no) のタプルをキーとして辞書を作成
multiple_visits_dict = {(row['session_id'], row['yad_no']): 1 for _, row in multiple_visits.iterrows()}

# teacher_df に multiple_visits_flag 列を追加（tqdm で進捗表示）
train_df['multiple_visits_flag'] = [multiple_visits_dict.get((row['session_id'], row['yado_no']), 0) 
                                      for row in tqdm(train_df.to_dict('records'))]

# 結果の確認
print(train_df.head())



# 各セッションにおいて2回以上登場する yad_no を特定
yad_no_counts = test_log_df.groupby(['session_id', 'yad_no']).size().reset_index(name='count')
multiple_visits = yad_no_counts[yad_no_counts['count'] > 1]

# (session_id, yad_no) のタプルをキーとして辞書を作成
multiple_visits_dict = {(row['session_id'], row['yad_no']): 1 for _, row in multiple_visits.iterrows()}

# teacher_df に multiple_visits_flag 列を追加（tqdm で進捗表示）
test_df['multiple_visits_flag'] = [multiple_visits_dict.get((row['session_id'], row['yado_no']), 0) 
                                      for row in tqdm(test_df.to_dict('records'))]

# 結果の確認
print(test_df.head())

100%|██████████| 2886980/2886980 [00:00<00:00, 3117450.20it/s]


                         session_id  yado_no  target  seq_no  seen_yad  \
0  000007603d533d30453cc45d0f3d119f    11882       0      -1         0   
1  000007603d533d30453cc45d0f3d119f     2808       0      -1         0   
2  000007603d533d30453cc45d0f3d119f     5289       0      -1         0   
3  000007603d533d30453cc45d0f3d119f     4101       1      -1         0   
4  000007603d533d30453cc45d0f3d119f     3324       0      -1         0   

   max_seq_no  diff_seq_no  is_odd  multiple_visits_flag  
0           0           -1      -1                     0  
1           0           -1      -1                     0  
2           0           -1      -1                     0  
3           0           -1      -1                     0  
4           0           -1      -1                     0  


100%|██████████| 1747000/1747000 [00:00<00:00, 3147295.79it/s]


                         session_id  yado_no  target  seq_no  seen_yad  \
0  00001149e9c73985425197104712478c     3560       0       0         1   
1  00001149e9c73985425197104712478c    11561       0      -1         0   
2  00001149e9c73985425197104712478c     4714       0      -1         0   
3  00001149e9c73985425197104712478c     2680       0      -1         0   
4  00001149e9c73985425197104712478c     4420       0      -1         0   

   max_seq_no  diff_seq_no  is_odd  multiple_visits_flag  
0           1            1       0                     0  
1           1           -1      -1                     0  
2           1           -1      -1                     0  
3           1           -1      -1                     0  
4           1           -1      -1                     0  


In [36]:
print(train_df['multiple_visits_flag'].value_counts())
print(test_df['multiple_visits_flag'].value_counts())

0    2882332
1       4648
Name: multiple_visits_flag, dtype: int64
0    1744359
1       2641
Name: multiple_visits_flag, dtype: int64


### yado_dfに関する特徴量エンジニアリング

In [37]:
yado_df

Unnamed: 0,yad_no,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,wid_cd,ken_cd,lrg_cd,sml_cd
0,1,0,129.0,1.0,0,1.0,,,1.0,f0112abf369fb03cdc5f5309300913da,072c85e1653e10c9c7dd065ad007125a,449c52ef581d5f9ef311189469a0520e,677a32689cd1ad74e867f1fbe43a3e1c
1,2,0,23.0,1.0,0,,,,,d86102dd9c232bade9a97dccad40df48,b4d2fb4e51ea7bca80eb1270aa474a54,5c9a8f48e9df0234da012747a02d4b29,4ee16ee838dd2703cc9a1d5a535f0ced
2,3,0,167.0,1.0,1,1.0,,,1.0,46e33861f921c3e38b81998fbf283f01,572d60f0f5212aacda515ebf81fb0a3a,8a623b960557e87bd1f4edf71b6255be,ab9480fd72a44d51690ab16c4ad4d49c
3,4,0,144.0,1.0,0,1.0,,,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,52c9ea83f2cfe92be54cb6bc961edf21,1cc3e1838bb0fd0fde0396130b1f82b9
4,5,0,41.0,1.0,1,,,,,43875109d1dab93592812c50d18270a7,75617bb07a2785a948ab1958909211f1,9ea5a911019b66ccd42f556c42a2fe2f,be1b876af18afc4deeb3081591d2a910
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13801,13802,0,10.0,1.0,1,,,,,c312e07b7a5d456d53a5b00910a336e1,558ac1909f0318b82c621ab250329d6d,80fb3c5ad0c89931d0923e9f80885218,5eb30820716082c720836733d73c605e
13802,13803,0,,,0,1.0,,,1.0,dc414a17890cfc17d011d5038b88ca93,d78f53d0856617bc782f02c3280dfef2,e5cfcc0a43c82072aca11628ff0add53,20ad8785a30f125bee5a8a325782ab06
13803,13804,0,80.0,1.0,1,,1.0,,1.0,d86102dd9c232bade9a97dccad40df48,7d76599bd27ff9e7823b2b1323ca763e,c5fe8848b6ab39b040cdb3668aea9433,b3eab50ccf6ffb51c37d36ee384abfbf
13804,13805,0,8.0,1.0,1,,,,1.0,3300cf6f774b7c6a5807110f244cbc21,689cf8289e7ea0b2eef1b017dcdfe8de,8b712435430a6875839a6c3b5a40b008,2b4165444a777465576b25f65697d739


In [38]:
# train_log_df と test_log_df を結合
log_df = pd.concat([train_log_df, test_log_df], axis=0, ignore_index=True)
# yado_df と結合
merged_df = log_df.merge(yado_df, on='yad_no', how='left')


# 各宿の人気度を計算（例：訪問回数で計算）
yad_popularity = merged_df['yad_no'].value_counts().reset_index()
yad_popularity.columns = ['yad_no', 'popularity']

# 全体での人気度ランキング
yad_popularity['overall_rank'] = yad_popularity['popularity'].rank(ascending=False)
# yado_df に人気度をマージ
yado_df = yado_df.merge(yad_popularity, on='yad_no', how='left')
# エリアごとの人気度ランキングを計算
for area in ['wid_cd', 'ken_cd', 'lrg_cd', 'sml_cd']:
    yado_df[f'{area}_rank'] = yado_df.groupby(area)['popularity'].rank(ascending=False, method='min')

# 結果の確認
print(yado_df.head())

   yad_no  yad_type  total_room_cnt  wireless_lan_flg  onsen_flg  kd_stn_5min  \
0       1         0           129.0               1.0          0          1.0   
1       2         0            23.0               1.0          0          NaN   
2       3         0           167.0               1.0          1          1.0   
3       4         0           144.0               1.0          0          1.0   
4       5         0            41.0               1.0          1          NaN   

   kd_bch_5min  kd_slp_5min  kd_conv_walk_5min  \
0          NaN          NaN                1.0   
1          NaN          NaN                NaN   
2          NaN          NaN                1.0   
3          NaN          NaN                1.0   
4          NaN          NaN                NaN   

                             wid_cd                            ken_cd  \
0  f0112abf369fb03cdc5f5309300913da  072c85e1653e10c9c7dd065ad007125a   
1  d86102dd9c232bade9a97dccad40df48  b4d2fb4e51ea7bca80eb1270aa474

### seen_yado == 1のみに絞る

In [39]:
# train_dfとtest_dfからseen_yadが1の行のみ抽出
train_df = train_df[train_df['seen_yad'] == 1].reset_index(drop=True)
test_df = test_df[test_df['seen_yad'] == 1].reset_index(drop=True) 

In [40]:
train_df

Unnamed: 0,session_id,yado_no,target,seq_no,seen_yad,max_seq_no,diff_seq_no,is_odd,multiple_visits_flag
0,000104bdffaaad1a1e0a9ebacf585f33,96,1,0,1,1,1,0,0
1,00026fd325b5d65d18e6de78ea2a3751,756,1,0,1,1,1,0,0
2,0003439cbd15fa2463d0e97d56dadf8e,143,1,0,1,1,1,0,0
3,0003948318658b2072bc29e99415743e,569,1,0,1,1,1,0,0
4,00044db9da5da40b1e0056ba487cdc28,1383,1,0,1,1,1,0,0
...,...,...,...,...,...,...,...,...,...
109448,fffe8a472ae6a96c9da05a30ac3ed6c5,4353,1,0,1,2,2,1,0
109449,fffe8a472ae6a96c9da05a30ac3ed6c5,11321,0,1,1,2,1,0,0
109450,fffe8c99c5b332190c3d4a2d6e7c5073,6170,1,0,1,1,1,0,0
109451,ffffcd5bc19d62cad5a3815c87818d83,10619,1,1,1,2,1,0,0


In [41]:
test_df

Unnamed: 0,session_id,yado_no,target,seq_no,seen_yad,max_seq_no,diff_seq_no,is_odd,multiple_visits_flag
0,00001149e9c73985425197104712478c,3560,0,0,1,1,1,0,0
1,0000f17ae2628237d78d3a38b009d3be,757,0,0,1,1,1,0,0
2,000174a6f7a569b84c5575760d2e9664,12341,0,1,1,2,1,0,0
3,0002f6aa27bcf984eeb3cf07297a96a9,10904,0,0,1,1,1,0,0
4,0003f18c0c221438a9f90a5f6a4e9330,12986,0,1,1,2,1,0,0
...,...,...,...,...,...,...,...,...,...
63391,fff922e03a34e95734f19373a820a9e4,8827,0,0,1,1,1,0,0
63392,fffa7fb9ab0d006e3cc51b3b37d11e0b,3720,0,0,1,1,1,0,0
63393,fffa9f78f4fea69da07d47dd810913a1,4834,0,0,1,1,1,0,0
63394,fffd40e7248796057c7f660c10abe336,11561,0,0,1,1,1,0,0


In [42]:
#　train_dfとtest_dfにyado_dfをマージ
train_df = pd.merge(train_df, yado_df.rename(columns={'yad_no': 'yado_no'}), on='yado_no', how='left')
test_df = pd.merge(test_df, yado_df.rename(columns={'yad_no': 'yado_no'}), on='yado_no', how='left')

In [43]:
train_df

Unnamed: 0,session_id,yado_no,target,seq_no,seen_yad,max_seq_no,diff_seq_no,is_odd,multiple_visits_flag,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,wid_cd,ken_cd,lrg_cd,sml_cd,popularity,overall_rank,wid_cd_rank,ken_cd_rank,lrg_cd_rank,sml_cd_rank
0,000104bdffaaad1a1e0a9ebacf585f33,96,1,0,1,1,1,0,0,0,228.0,1.0,0,,,,1.0,e9316013ee1b03f4525fe361c46ce9c5,84efa50e52f9b471c95bfc3b21b854ad,a1370d90ed3b80ee41311bbbab46aec9,d72674f02c5340d90f245e3177727650,150.0,845.5,113.0,10.0,6.0,6.0
1,00026fd325b5d65d18e6de78ea2a3751,756,1,0,1,1,1,0,0,0,,,1,,,,,321b69d5eec98fe6253e26b86058e6a9,39c3eb151762dd35cf50e8bde404ae74,ef4accaed5974fd09e429a1901457578,9fbbf71e784c6b49e4244cde0a944c22,17.0,8132.5,270.0,72.0,19.0,2.0
2,0003439cbd15fa2463d0e97d56dadf8e,143,1,0,1,1,1,0,0,0,163.0,1.0,0,1.0,,,1.0,46e33861f921c3e38b81998fbf283f01,572d60f0f5212aacda515ebf81fb0a3a,2e63024b11908f3729510051a6fc7d9e,d075eb4a9669452b8f07cfc0d13a03ab,208.0,378.0,123.0,33.0,1.0,1.0
3,0003948318658b2072bc29e99415743e,569,1,0,1,1,1,0,0,0,103.0,1.0,0,,,,1.0,e9316013ee1b03f4525fe361c46ce9c5,66c4d01ad8e30155582cf80b1d655986,7763c74e2efa67a522125d9d3d7dde25,084c46af580a4871ae2a4921addf6e8e,146.0,902.0,119.0,59.0,49.0,30.0
4,00044db9da5da40b1e0056ba487cdc28,1383,1,0,1,1,1,0,0,0,124.0,1.0,0,1.0,,,1.0,8a1c0d3243bba111cbcd1ec6c692dc6d,ce83563814cff3080c8ae076f44b3020,e4fe3adf995513598a9272929559bc35,2fb1f3eefbe276774a8a3911bcc7e767,118.0,1458.5,59.0,33.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109448,fffe8a472ae6a96c9da05a30ac3ed6c5,4353,1,0,1,2,2,1,0,0,228.0,1.0,0,,,,,e9316013ee1b03f4525fe361c46ce9c5,517061b8165aa6370d9025951a64aa52,7e5ebb4d5e3cdfd3ad3798c9864d87d6,e5680545edc53d20bb05168e67c9f9f0,310.0,125.5,13.0,4.0,3.0,3.0
109449,fffe8a472ae6a96c9da05a30ac3ed6c5,11321,0,1,1,2,1,0,0,0,377.0,1.0,0,,,,,e9316013ee1b03f4525fe361c46ce9c5,517061b8165aa6370d9025951a64aa52,7e5ebb4d5e3cdfd3ad3798c9864d87d6,e5680545edc53d20bb05168e67c9f9f0,336.0,100.5,7.0,2.0,2.0,2.0
109450,fffe8c99c5b332190c3d4a2d6e7c5073,6170,1,0,1,1,1,0,0,0,58.0,1.0,0,,,,1.0,43875109d1dab93592812c50d18270a7,7d3db9a7acad537c322f85f7cef0beda,34f448c0371e84f8fe5a079d8d04211e,97de2b919a8ec19e656e5913804b0c84,77.0,2790.0,88.0,8.0,8.0,7.0
109451,ffffcd5bc19d62cad5a3815c87818d83,10619,1,1,1,2,1,0,0,0,,1.0,0,,,,1.0,321b69d5eec98fe6253e26b86058e6a9,a2b54b288d51bb19085ed1d99c428397,0c92ce61d0bf83edefee7eea279a15c8,de9c306d6999d60160eaf17cdb20fe47,36.0,5594.5,168.0,43.0,23.0,21.0


#### 各種Encoding

In [44]:
# カテゴリ変数と数値変数の明確化
cat_cols = ['wid_cd', 'ken_cd', 'lrg_cd', 'sml_cd']

# 欠損値を-1で埋める
train_df.fillna(-1, inplace=True)
test_df.fillna(-1, inplace=True)

In [45]:
# Label Encoding

# testにしか存在しないものはNullにするので、trainのみをencoderにする
for col in cat_cols:
    encoder = {c: i for i, c in enumerate(train_df[col].unique())}
    train_df[f'label_{col}'] = train_df[col].map(encoder)
    test_df[f'label_{col}'] = test_df[col].map(encoder)

In [46]:
# Count Encoding（trainとtestをマージした方が良い？）←　した
# 増やした方が良い？

cat_cols = ['wid_cd', 'ken_cd', 'lrg_cd', 'sml_cd'] + ['yado_no']

# testにしか存在しないものはNullにするので、trainのみをencoderにする
for col in cat_cols:
    encoder = train_df[col].value_counts()
    train_df[f'count_{col}'] = train_df[col].map(encoder)
    test_df[f'count_{col}'] = test_df[col].map(encoder)

In [47]:
# Target encoding
# 元の列順が保持されないから

cat_cols = ['wid_cd', 'ken_cd', 'lrg_cd', 'sml_cd'] + ['yado_no']

# GroupKFold の設定  StratifiedGrouoKFoldでもいいかもしれない
gkf = GroupKFold(n_splits=5)

# fold 列を初期化
train_df['fold'] = -1

# 各 Fold に対してインデックスを割り当て
for fold, (train_idx, val_idx) in enumerate(gkf.split(train_df, groups=train_df['session_id'])):
    # バリデーションセットのインデックスに Fold 番号を割り当て
    train_df.loc[val_idx, 'fold'] = fold
# 結果の確認
print(train_df.head())



# TargetEncodingが完了した検証用データセットを格納するリスト
encoded_dfs = []

# すべての分割についてのループ
for fold in range(5):

  # 学習用と検証用データセットに分割する
  df_train = train_df[train_df.fold != fold].reset_index(drop=True)
  df_valid = train_df[train_df.fold == fold].reset_index(drop=True)
  
  # すべてのカテゴリについてのループ
  for column in cat_cols:
    # カテゴリごとの目的変数の統計量についての辞書を作成
    stats = df_train.groupby(column)['target'].agg(['mean', 'var'])
    
    # 平均、分散、最大値、最小値に対する列を作成
    for stat in ['mean', 'var']:
      mapping_dict = dict(stats[stat])
      df_valid.loc[:, f'TE_{column}_{stat}'] = df_valid[column].map(mapping_dict)

  # リストに格納
  encoded_dfs.append(df_valid)

# 結合したデータセットを返す
encoded_df = pd.concat(encoded_dfs, axis=0)


                         session_id  yado_no  target  seq_no  seen_yad  \
0  000104bdffaaad1a1e0a9ebacf585f33       96       1       0         1   
1  00026fd325b5d65d18e6de78ea2a3751      756       1       0         1   
2  0003439cbd15fa2463d0e97d56dadf8e      143       1       0         1   
3  0003948318658b2072bc29e99415743e      569       1       0         1   
4  00044db9da5da40b1e0056ba487cdc28     1383       1       0         1   

   max_seq_no  diff_seq_no  is_odd  multiple_visits_flag  yad_type  \
0           1            1       0                     0         0   
1           1            1       0                     0         0   
2           1            1       0                     0         0   
3           1            1       0                     0         0   
4           1            1       0                     0         0   

   total_room_cnt  wireless_lan_flg  onsen_flg  kd_stn_5min  kd_bch_5min  \
0           228.0               1.0          0         -1.

In [48]:
train_df['fold'].value_counts()

2    21891
1    21891
0    21891
3    21890
4    21890
Name: fold, dtype: int64

In [49]:
encoded_df['fold'].value_counts()

0    21891
1    21891
2    21891
3    21890
4    21890
Name: fold, dtype: int64

In [50]:
train_df = encoded_df

In [51]:
# Testデータに対しても同様にTargetEncodingを行う
cat_cols = ['wid_cd', 'ken_cd', 'lrg_cd', 'sml_cd'] + ['yado_no']

# すべてのカテゴリについてのループ
# train_df全体で計算する
for column in cat_cols:
    # カテゴリごとの目的変数の統計量についての辞書を作成
    stats = train_df.groupby(column)['target'].agg(['mean', 'var'])
    
    # 平均、分散、最大値、最小値に対する列を作成
    for stat in ['mean', 'var']:
        mapping_dict = dict(stats[stat])
        test_df.loc[:, f'TE_{column}_{stat}'] = test_df[column].map(mapping_dict)


In [52]:
test_df

Unnamed: 0,session_id,yado_no,target,seq_no,seen_yad,max_seq_no,diff_seq_no,is_odd,multiple_visits_flag,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,wid_cd,ken_cd,lrg_cd,sml_cd,popularity,overall_rank,wid_cd_rank,ken_cd_rank,lrg_cd_rank,sml_cd_rank,label_wid_cd,label_ken_cd,label_lrg_cd,label_sml_cd,count_wid_cd,count_ken_cd,count_lrg_cd,count_sml_cd,count_yado_no,TE_wid_cd_mean,TE_wid_cd_var,TE_ken_cd_mean,TE_ken_cd_var,TE_lrg_cd_mean,TE_lrg_cd_var,TE_sml_cd_mean,TE_sml_cd_var,TE_yado_no_mean,TE_yado_no_var
0,00001149e9c73985425197104712478c,3560,0,0,1,1,1,0,0,0,205.0,1.0,0,-1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,52ca3d2824fc3cc90bd4274423badeed,87d9490219b3778f73c41b8176cf30d0,86.0,2418.5,588.0,339.0,22.0,10.0,2,16,33,36,29070,12836,931,321,14.0,0.728277,0.197897,0.817077,0.149474,0.738990,0.193091,0.887850,0.099883,0.928571,0.071429
1,0000f17ae2628237d78d3a38b009d3be,757,0,0,1,1,1,0,0,0,174.0,1.0,0,1.0,-1.0,-1.0,1.0,f0112abf369fb03cdc5f5309300913da,bd054cc265d68a400ccb976ac69c6463,dca13b5f308a0ae88ab8875a9ab56919,3267093e6bcad4a46af9d3e46350b22f,135.0,1096.5,163.0,47.0,3.0,3.0,8,11,121,141,16080,3505,258,209,20.0,0.789179,0.166386,0.842796,0.132529,0.798450,0.161554,0.822967,0.146393,0.800000,0.168421
2,000174a6f7a569b84c5575760d2e9664,12341,0,1,1,2,1,0,0,0,237.0,1.0,1,-1.0,-1.0,-1.0,1.0,d86102dd9c232bade9a97dccad40df48,acb04522bdbc0582bf16a6a97567cc27,57b6663bea1ed3527b11e80be82d5235,a5e1136fd4ebaa1dd055df0a44841538,202.0,409.0,16.0,2.0,1.0,1.0,9,24,36,39,6415,1126,376,376,33.0,0.840062,0.134379,0.812611,0.152410,0.816489,0.150234,0.816489,0.150234,0.757576,0.189394
3,0002f6aa27bcf984eeb3cf07297a96a9,10904,0,0,1,1,1,0,0,0,56.0,1.0,1,-1.0,-1.0,-1.0,-1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,d944a4adb81c65f0392a8a273606ba72,0af2cfba7130b2b6ab8c227d3d948156,23.0,7189.5,1434.0,819.0,5.0,5.0,2,16,181,238,29070,12836,28,28,1.0,0.728277,0.197897,0.817077,0.149474,0.964286,0.035714,0.964286,0.035714,1.000000,
4,0003f18c0c221438a9f90a5f6a4e9330,12986,0,1,1,2,1,0,0,0,201.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,c86352f5b57e80fe545cfec1fd8505a1,9d6a46da05976cab8ac2b8583215c665,568887ea1e1d8c3cf3c60b5be585aa6d,345.0,96.0,49.0,14.0,10.0,7.0,2,6,7,7,29070,6716,3470,2561,83.0,0.728277,0.197897,0.727367,0.198334,0.693084,0.212780,0.732917,0.195826,0.891566,0.097855
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63391,fff922e03a34e95734f19373a820a9e4,8827,0,0,1,1,1,0,0,0,203.0,1.0,0,1.0,-1.0,-1.0,1.0,f0112abf369fb03cdc5f5309300913da,072c85e1653e10c9c7dd065ad007125a,c918c473df6f5a01b3f8eb068530b020,efbb33bc0a3ce2c98a9d4ea7e8ff278e,110.0,1648.0,229.0,118.0,4.0,1.0,8,13,54,184,16080,8185,337,128,18.0,0.789179,0.166386,0.725718,0.199076,0.679525,0.218419,0.710938,0.207124,0.888889,0.104575
63392,fffa7fb9ab0d006e3cc51b3b37d11e0b,3720,0,0,1,1,1,0,0,0,105.0,1.0,0,-1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,3a6cd37aa9e38fd96d9dafc2615643d0,4683b842facadc1ad7161f72220f6a3e,111.0,1621.5,425.0,229.0,35.0,9.0,2,16,42,224,29070,12836,2169,406,13.0,0.728277,0.197897,0.817077,0.149474,0.753804,0.185669,0.854680,0.124509,0.846154,0.141026
63393,fffa9f78f4fea69da07d47dd810913a1,4834,0,0,1,1,1,0,0,0,159.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,aabf8b3cf6414702adfda5532705759a,da32f7dee35dbf732876e93f9cff9829,156.0,766.0,207.0,99.0,9.0,5.0,2,16,21,187,29070,12836,1090,485,18.0,0.728277,0.197897,0.817077,0.149474,0.822936,0.145846,0.890722,0.097538,0.777778,0.183007
63394,fffd40e7248796057c7f660c10abe336,11561,0,0,1,1,1,0,0,0,195.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,52ca3d2824fc3cc90bd4274423badeed,87d9490219b3778f73c41b8176cf30d0,195.0,446.0,138.0,59.0,5.0,1.0,2,16,33,36,29070,12836,931,321,38.0,0.728277,0.197897,0.817077,0.149474,0.738990,0.193091,0.887850,0.099883,0.684211,0.221906


In [53]:
all_features = train_df.columns.to_list()
features= [x for x in all_features if x not in ('session_id', 'fold', 'target', 'wid_cd', 'ken_cd', 'lrg_cd', 'sml_cd')]

In [54]:
train_df[features]

Unnamed: 0,yado_no,seq_no,seen_yad,max_seq_no,diff_seq_no,is_odd,multiple_visits_flag,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,popularity,overall_rank,wid_cd_rank,ken_cd_rank,lrg_cd_rank,sml_cd_rank,label_wid_cd,label_ken_cd,label_lrg_cd,label_sml_cd,count_wid_cd,count_ken_cd,count_lrg_cd,count_sml_cd,count_yado_no,TE_wid_cd_mean,TE_wid_cd_var,TE_ken_cd_mean,TE_ken_cd_var,TE_lrg_cd_mean,TE_lrg_cd_var,TE_sml_cd_mean,TE_sml_cd_var,TE_yado_no_mean,TE_yado_no_var
0,5583,0,1,1,1,0,0,0,55.0,-1.0,1,-1.0,-1.0,-1.0,-1.0,48.0,4508.0,555.0,94.0,45.0,35.0,4,5,5,5,11663,1870,659,534,8,0.840099,0.134347,0.795591,0.162735,0.827715,0.142870,0.804598,0.157582,1.000000,0.000000
1,7178,0,1,1,1,0,0,0,206.0,1.0,0,1.0,-1.0,-1.0,1.0,235.0,268.0,6.0,5.0,4.0,4.0,1,7,8,8,3075,1578,1052,1023,60,0.837816,0.135936,0.838247,0.135697,0.815348,0.150737,0.815043,0.150934,0.680851,0.222017
2,12333,0,1,1,1,0,0,0,197.0,1.0,1,-1.0,-1.0,-1.0,1.0,33.0,5904.0,183.0,92.0,20.0,5.0,1,7,9,9,3075,1578,332,79,7,0.837816,0.135936,0.838247,0.135697,0.911111,0.081289,0.950000,0.048305,1.000000,0.000000
3,13079,0,1,1,1,0,0,0,261.0,1.0,0,1.0,-1.0,-1.0,1.0,248.0,232.0,21.0,21.0,5.0,4.0,5,8,12,12,7932,7932,2151,1843,42,0.826593,0.143359,0.826593,0.143359,0.932370,0.063093,0.930801,0.064455,0.896552,0.096059
4,12223,0,1,1,1,0,0,0,42.0,1.0,0,-1.0,-1.0,-1.0,1.0,61.0,3635.0,292.0,35.0,2.0,2.0,6,12,16,16,8853,924,87,78,16,0.818156,0.148798,0.840456,0.134281,0.909091,0.084175,0.897959,0.093537,0.700000,0.233333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21885,7681,1,1,2,1,0,0,0,96.0,1.0,0,1.0,-1.0,-1.0,1.0,202.0,409.0,44.0,28.0,28.0,13.0,4,27,47,92,11663,4587,3247,1375,47,0.837864,0.135863,0.817434,0.149276,0.824913,0.144487,0.726201,0.199013,0.736842,0.199147
21886,11634,0,1,1,1,0,0,0,215.0,1.0,0,1.0,-1.0,-1.0,1.0,151.0,832.5,18.0,8.0,8.0,5.0,10,21,32,35,3132,1046,761,501,27,0.866100,0.116017,0.879268,0.106285,0.889267,0.098639,0.908136,0.083644,0.727273,0.207792
21887,3473,0,1,1,1,0,0,0,126.0,1.0,0,1.0,-1.0,-1.0,1.0,122.0,1361.0,143.0,65.0,1.0,1.0,4,27,105,118,11663,4587,201,201,26,0.837864,0.135863,0.817434,0.149276,0.790850,0.166495,0.790850,0.166495,0.681818,0.227273
21888,3634,0,1,1,1,0,0,0,158.0,1.0,0,-1.0,-1.0,-1.0,1.0,65.0,3387.5,477.0,74.0,39.0,32.0,8,14,89,99,16080,2438,1337,1012,13,0.788918,0.166539,0.871677,0.111914,0.839038,0.135178,0.812346,0.152629,1.000000,0.000000


In [55]:
test_df[features]

Unnamed: 0,yado_no,seq_no,seen_yad,max_seq_no,diff_seq_no,is_odd,multiple_visits_flag,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,popularity,overall_rank,wid_cd_rank,ken_cd_rank,lrg_cd_rank,sml_cd_rank,label_wid_cd,label_ken_cd,label_lrg_cd,label_sml_cd,count_wid_cd,count_ken_cd,count_lrg_cd,count_sml_cd,count_yado_no,TE_wid_cd_mean,TE_wid_cd_var,TE_ken_cd_mean,TE_ken_cd_var,TE_lrg_cd_mean,TE_lrg_cd_var,TE_sml_cd_mean,TE_sml_cd_var,TE_yado_no_mean,TE_yado_no_var
0,3560,0,1,1,1,0,0,0,205.0,1.0,0,-1.0,-1.0,-1.0,1.0,86.0,2418.5,588.0,339.0,22.0,10.0,2,16,33,36,29070,12836,931,321,14.0,0.728277,0.197897,0.817077,0.149474,0.738990,0.193091,0.887850,0.099883,0.928571,0.071429
1,757,0,1,1,1,0,0,0,174.0,1.0,0,1.0,-1.0,-1.0,1.0,135.0,1096.5,163.0,47.0,3.0,3.0,8,11,121,141,16080,3505,258,209,20.0,0.789179,0.166386,0.842796,0.132529,0.798450,0.161554,0.822967,0.146393,0.800000,0.168421
2,12341,1,1,2,1,0,0,0,237.0,1.0,1,-1.0,-1.0,-1.0,1.0,202.0,409.0,16.0,2.0,1.0,1.0,9,24,36,39,6415,1126,376,376,33.0,0.840062,0.134379,0.812611,0.152410,0.816489,0.150234,0.816489,0.150234,0.757576,0.189394
3,10904,0,1,1,1,0,0,0,56.0,1.0,1,-1.0,-1.0,-1.0,-1.0,23.0,7189.5,1434.0,819.0,5.0,5.0,2,16,181,238,29070,12836,28,28,1.0,0.728277,0.197897,0.817077,0.149474,0.964286,0.035714,0.964286,0.035714,1.000000,
4,12986,1,1,2,1,0,0,0,201.0,1.0,0,1.0,-1.0,-1.0,1.0,345.0,96.0,49.0,14.0,10.0,7.0,2,6,7,7,29070,6716,3470,2561,83.0,0.728277,0.197897,0.727367,0.198334,0.693084,0.212780,0.732917,0.195826,0.891566,0.097855
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63391,8827,0,1,1,1,0,0,0,203.0,1.0,0,1.0,-1.0,-1.0,1.0,110.0,1648.0,229.0,118.0,4.0,1.0,8,13,54,184,16080,8185,337,128,18.0,0.789179,0.166386,0.725718,0.199076,0.679525,0.218419,0.710938,0.207124,0.888889,0.104575
63392,3720,0,1,1,1,0,0,0,105.0,1.0,0,-1.0,-1.0,-1.0,1.0,111.0,1621.5,425.0,229.0,35.0,9.0,2,16,42,224,29070,12836,2169,406,13.0,0.728277,0.197897,0.817077,0.149474,0.753804,0.185669,0.854680,0.124509,0.846154,0.141026
63393,4834,0,1,1,1,0,0,0,159.0,1.0,0,1.0,-1.0,-1.0,1.0,156.0,766.0,207.0,99.0,9.0,5.0,2,16,21,187,29070,12836,1090,485,18.0,0.728277,0.197897,0.817077,0.149474,0.822936,0.145846,0.890722,0.097538,0.777778,0.183007
63394,11561,0,1,1,1,0,0,0,195.0,1.0,0,1.0,-1.0,-1.0,1.0,195.0,446.0,138.0,59.0,5.0,1.0,2,16,33,36,29070,12836,931,321,38.0,0.728277,0.197897,0.817077,0.149474,0.738990,0.193091,0.887850,0.099883,0.684211,0.221906


In [56]:
train_df.columns.to_list()

['session_id',
 'yado_no',
 'target',
 'seq_no',
 'seen_yad',
 'max_seq_no',
 'diff_seq_no',
 'is_odd',
 'multiple_visits_flag',
 'yad_type',
 'total_room_cnt',
 'wireless_lan_flg',
 'onsen_flg',
 'kd_stn_5min',
 'kd_bch_5min',
 'kd_slp_5min',
 'kd_conv_walk_5min',
 'wid_cd',
 'ken_cd',
 'lrg_cd',
 'sml_cd',
 'popularity',
 'overall_rank',
 'wid_cd_rank',
 'ken_cd_rank',
 'lrg_cd_rank',
 'sml_cd_rank',
 'label_wid_cd',
 'label_ken_cd',
 'label_lrg_cd',
 'label_sml_cd',
 'count_wid_cd',
 'count_ken_cd',
 'count_lrg_cd',
 'count_sml_cd',
 'count_yado_no',
 'fold',
 'TE_wid_cd_mean',
 'TE_wid_cd_var',
 'TE_ken_cd_mean',
 'TE_ken_cd_var',
 'TE_lrg_cd_mean',
 'TE_lrg_cd_var',
 'TE_sml_cd_mean',
 'TE_sml_cd_var',
 'TE_yado_no_mean',
 'TE_yado_no_var']

In [57]:
import gc
gc.collect()

105

In [58]:
test_df

Unnamed: 0,session_id,yado_no,target,seq_no,seen_yad,max_seq_no,diff_seq_no,is_odd,multiple_visits_flag,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,wid_cd,ken_cd,lrg_cd,sml_cd,popularity,overall_rank,wid_cd_rank,ken_cd_rank,lrg_cd_rank,sml_cd_rank,label_wid_cd,label_ken_cd,label_lrg_cd,label_sml_cd,count_wid_cd,count_ken_cd,count_lrg_cd,count_sml_cd,count_yado_no,TE_wid_cd_mean,TE_wid_cd_var,TE_ken_cd_mean,TE_ken_cd_var,TE_lrg_cd_mean,TE_lrg_cd_var,TE_sml_cd_mean,TE_sml_cd_var,TE_yado_no_mean,TE_yado_no_var
0,00001149e9c73985425197104712478c,3560,0,0,1,1,1,0,0,0,205.0,1.0,0,-1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,52ca3d2824fc3cc90bd4274423badeed,87d9490219b3778f73c41b8176cf30d0,86.0,2418.5,588.0,339.0,22.0,10.0,2,16,33,36,29070,12836,931,321,14.0,0.728277,0.197897,0.817077,0.149474,0.738990,0.193091,0.887850,0.099883,0.928571,0.071429
1,0000f17ae2628237d78d3a38b009d3be,757,0,0,1,1,1,0,0,0,174.0,1.0,0,1.0,-1.0,-1.0,1.0,f0112abf369fb03cdc5f5309300913da,bd054cc265d68a400ccb976ac69c6463,dca13b5f308a0ae88ab8875a9ab56919,3267093e6bcad4a46af9d3e46350b22f,135.0,1096.5,163.0,47.0,3.0,3.0,8,11,121,141,16080,3505,258,209,20.0,0.789179,0.166386,0.842796,0.132529,0.798450,0.161554,0.822967,0.146393,0.800000,0.168421
2,000174a6f7a569b84c5575760d2e9664,12341,0,1,1,2,1,0,0,0,237.0,1.0,1,-1.0,-1.0,-1.0,1.0,d86102dd9c232bade9a97dccad40df48,acb04522bdbc0582bf16a6a97567cc27,57b6663bea1ed3527b11e80be82d5235,a5e1136fd4ebaa1dd055df0a44841538,202.0,409.0,16.0,2.0,1.0,1.0,9,24,36,39,6415,1126,376,376,33.0,0.840062,0.134379,0.812611,0.152410,0.816489,0.150234,0.816489,0.150234,0.757576,0.189394
3,0002f6aa27bcf984eeb3cf07297a96a9,10904,0,0,1,1,1,0,0,0,56.0,1.0,1,-1.0,-1.0,-1.0,-1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,d944a4adb81c65f0392a8a273606ba72,0af2cfba7130b2b6ab8c227d3d948156,23.0,7189.5,1434.0,819.0,5.0,5.0,2,16,181,238,29070,12836,28,28,1.0,0.728277,0.197897,0.817077,0.149474,0.964286,0.035714,0.964286,0.035714,1.000000,
4,0003f18c0c221438a9f90a5f6a4e9330,12986,0,1,1,2,1,0,0,0,201.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,c86352f5b57e80fe545cfec1fd8505a1,9d6a46da05976cab8ac2b8583215c665,568887ea1e1d8c3cf3c60b5be585aa6d,345.0,96.0,49.0,14.0,10.0,7.0,2,6,7,7,29070,6716,3470,2561,83.0,0.728277,0.197897,0.727367,0.198334,0.693084,0.212780,0.732917,0.195826,0.891566,0.097855
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63391,fff922e03a34e95734f19373a820a9e4,8827,0,0,1,1,1,0,0,0,203.0,1.0,0,1.0,-1.0,-1.0,1.0,f0112abf369fb03cdc5f5309300913da,072c85e1653e10c9c7dd065ad007125a,c918c473df6f5a01b3f8eb068530b020,efbb33bc0a3ce2c98a9d4ea7e8ff278e,110.0,1648.0,229.0,118.0,4.0,1.0,8,13,54,184,16080,8185,337,128,18.0,0.789179,0.166386,0.725718,0.199076,0.679525,0.218419,0.710938,0.207124,0.888889,0.104575
63392,fffa7fb9ab0d006e3cc51b3b37d11e0b,3720,0,0,1,1,1,0,0,0,105.0,1.0,0,-1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,3a6cd37aa9e38fd96d9dafc2615643d0,4683b842facadc1ad7161f72220f6a3e,111.0,1621.5,425.0,229.0,35.0,9.0,2,16,42,224,29070,12836,2169,406,13.0,0.728277,0.197897,0.817077,0.149474,0.753804,0.185669,0.854680,0.124509,0.846154,0.141026
63393,fffa9f78f4fea69da07d47dd810913a1,4834,0,0,1,1,1,0,0,0,159.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,aabf8b3cf6414702adfda5532705759a,da32f7dee35dbf732876e93f9cff9829,156.0,766.0,207.0,99.0,9.0,5.0,2,16,21,187,29070,12836,1090,485,18.0,0.728277,0.197897,0.817077,0.149474,0.822936,0.145846,0.890722,0.097538,0.777778,0.183007
63394,fffd40e7248796057c7f660c10abe336,11561,0,0,1,1,1,0,0,0,195.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,52ca3d2824fc3cc90bd4274423badeed,87d9490219b3778f73c41b8176cf30d0,195.0,446.0,138.0,59.0,5.0,1.0,2,16,33,36,29070,12836,931,321,38.0,0.728277,0.197897,0.817077,0.149474,0.738990,0.193091,0.887850,0.099883,0.684211,0.221906


In [59]:
train_df

Unnamed: 0,session_id,yado_no,target,seq_no,seen_yad,max_seq_no,diff_seq_no,is_odd,multiple_visits_flag,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,wid_cd,ken_cd,lrg_cd,sml_cd,popularity,overall_rank,wid_cd_rank,ken_cd_rank,lrg_cd_rank,sml_cd_rank,label_wid_cd,label_ken_cd,label_lrg_cd,label_sml_cd,count_wid_cd,count_ken_cd,count_lrg_cd,count_sml_cd,count_yado_no,fold,TE_wid_cd_mean,TE_wid_cd_var,TE_ken_cd_mean,TE_ken_cd_var,TE_lrg_cd_mean,TE_lrg_cd_var,TE_sml_cd_mean,TE_sml_cd_var,TE_yado_no_mean,TE_yado_no_var
0,00063c614aca4920b965558ff79dd6c8,5583,1,0,1,1,1,0,0,0,55.0,-1.0,1,-1.0,-1.0,-1.0,-1.0,dc414a17890cfc17d011d5038b88ca93,223938a74a609968141867c244086206,63083678169dddaf76ddb92c6c6fdf65,33bfe292401fc7f99b8b9831a71f61ee,48.0,4508.0,555.0,94.0,45.0,35.0,4,5,5,5,11663,1870,659,534,8,0,0.840099,0.134347,0.795591,0.162735,0.827715,0.142870,0.804598,0.157582,1.000000,0.000000
1,0009f793074d5edb78b4e7471e494c9a,7178,1,0,1,1,1,0,0,0,206.0,1.0,0,1.0,-1.0,-1.0,1.0,321b69d5eec98fe6253e26b86058e6a9,0745a2107686fcb724892ce52f19d02e,5f6ef5263d833db9eb585c0b685e6817,749baed21736073c6b7b624cc76932ea,235.0,268.0,6.0,5.0,4.0,4.0,1,7,8,8,3075,1578,1052,1023,60,0,0.837816,0.135936,0.838247,0.135697,0.815348,0.150737,0.815043,0.150934,0.680851,0.222017
2,000a472a87af3fc9c16e5232d37da15a,12333,1,0,1,1,1,0,0,0,197.0,1.0,1,-1.0,-1.0,-1.0,1.0,321b69d5eec98fe6253e26b86058e6a9,0745a2107686fcb724892ce52f19d02e,bd9ca9b95bfc526f6ae521d5f4442362,1ed4e9cbfda3e66bc84c1392d75ef036,33.0,5904.0,183.0,92.0,20.0,5.0,1,7,9,9,3075,1578,332,79,7,0,0.837816,0.135936,0.838247,0.135697,0.911111,0.081289,0.950000,0.048305,1.000000,0.000000
3,000bbc7ab01813d008039b2c9195fe01,13079,0,0,1,1,1,0,0,0,261.0,1.0,0,1.0,-1.0,-1.0,1.0,b07b75d367ebece55a23ceecc939fff4,0a66f6ab9c0507059da6f22a0e1f1690,9ab5718fd88c6e5f9fec37a51827d428,7aff71bb47acb796d425c5ed5e6dfb3f,248.0,232.0,21.0,21.0,5.0,4.0,5,8,12,12,7932,7932,2151,1843,42,0,0.826593,0.143359,0.826593,0.143359,0.932370,0.063093,0.930801,0.064455,0.896552,0.096059
4,000f033450b5d9524a0d8c23bebda53c,12223,1,0,1,1,1,0,0,0,42.0,1.0,0,-1.0,-1.0,-1.0,1.0,c312e07b7a5d456d53a5b00910a336e1,543ea7c443fdfd779bbd557f2ad99d80,ad1d0da421776af5808aa4cb03ef0c1e,dbbf6f4fc947c0e61503740df4deab4d,61.0,3635.0,292.0,35.0,2.0,2.0,6,12,16,16,8853,924,87,78,16,0,0.818156,0.148798,0.840456,0.134281,0.909091,0.084175,0.897959,0.093537,0.700000,0.233333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21885,fff3eb89a3d1d6fd7ef1706c6b7bb102,7681,1,1,1,2,1,0,0,0,96.0,1.0,0,1.0,-1.0,-1.0,1.0,dc414a17890cfc17d011d5038b88ca93,d78f53d0856617bc782f02c3280dfef2,e5cfcc0a43c82072aca11628ff0add53,e2f51242791849e72240784844876b89,202.0,409.0,44.0,28.0,28.0,13.0,4,27,47,92,11663,4587,3247,1375,47,4,0.837864,0.135863,0.817434,0.149276,0.824913,0.144487,0.726201,0.199013,0.736842,0.199147
21886,fff77a0e60976a6208d1f0a94702d6b8,11634,1,0,1,1,1,0,0,0,215.0,1.0,0,1.0,-1.0,-1.0,1.0,43875109d1dab93592812c50d18270a7,3b09a7ce9934c00fd266f9cf246308d1,c0e203c17b3a418d089e96aac011d1a1,a5a49a911ded94700d282a1d4ff3a373,151.0,832.5,18.0,8.0,8.0,5.0,10,21,32,35,3132,1046,761,501,27,4,0.866100,0.116017,0.879268,0.106285,0.889267,0.098639,0.908136,0.083644,0.727273,0.207792
21887,fff92a0a11b20f1a6eb3b83a94f917ec,3473,1,0,1,1,1,0,0,0,126.0,1.0,0,1.0,-1.0,-1.0,1.0,dc414a17890cfc17d011d5038b88ca93,d78f53d0856617bc782f02c3280dfef2,145f05d14ef5eaee02ec4752a4d20482,af3c48f954de7e56a15bb71aed201575,122.0,1361.0,143.0,65.0,1.0,1.0,4,27,105,118,11663,4587,201,201,26,4,0.837864,0.135863,0.817434,0.149276,0.790850,0.166495,0.790850,0.166495,0.681818,0.227273
21888,fffaafbf1b7c08a586cfe2a73eec1f48,3634,1,0,1,1,1,0,0,0,158.0,1.0,0,-1.0,-1.0,-1.0,1.0,f0112abf369fb03cdc5f5309300913da,ce3aaf25e7e38a0c42d373fb148efc86,b94b1624f29aceb6511babed280db4d7,8cb854e17cd42e2b44f0c603da4608d4,65.0,3387.5,477.0,74.0,39.0,32.0,8,14,89,99,16080,2438,1337,1012,13,4,0.788918,0.166539,0.871677,0.111914,0.839038,0.135178,0.812346,0.152629,1.000000,0.000000


In [60]:
train_df.to_parquet('../data/feature_engineering_v5_train_df.parquet', index=False)
test_df.to_parquet('../data/feature_engineering_v5_test_df.parquet', index=False)