# candidates用の特徴量エンジニアリング

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm


from sklearn.model_selection import KFold
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import StratifiedKFold
from nyaggle.feature.category_encoder import TargetEncoder

from nyaggle.experiment import run_experiment

#最大表示列数の指定（ここでは50列を指定）
pd.set_option('display.max_columns', 50)
# 最大表示行数の指定（ここでは50行を指定）
pd.set_option('display.max_rows', 500)

In [2]:
train_log_df = pd.read_csv('../train_log.csv')
test_log_df = pd.read_csv('../test_log.csv')
train_label_df = pd.read_csv('../train_label.csv')
test_session_df = pd.read_csv('../test_session.csv')
yado_df = pd.read_csv('../yado.csv')
# image_embeddings_df = pd.read_parquet('../image_embeddings.parquet')
sample_submission_df = pd.read_csv('../sample_submission.csv')

candidate_train_df = pd.read_csv('../data/candidate_ver15_train.csv')
candidate_test_df = pd.read_csv('../data/candidate_ver15_test.csv')

In [3]:
candidate_train_df

Unnamed: 0,predict_0,predict_1,predict_2,predict_3,predict_4,predict_5,predict_6,predict_7,predict_8,predict_9
0,11882,2808,5289,4101,3324,12846,997,9207,9209,9208
1,8253,8747,2570,1586,11104,3725,4488,2259,3564,8225
2,9039,6722,7509,4355,4863,11724,5238,13642,1967,2957
3,626,755,11715,7812,109,2272,13296,1341,13549,7872
4,96,3894,7749,902,11380,5490,1284,12491,254,4072
...,...,...,...,...,...,...,...,...,...,...
288693,13210,13079,2876,3725,8677,13717,1586,10955,10522,5719
288694,8703,3940,399,4767,2900,6654,5299,8465,963,513
288695,7308,12240,4040,7820,4398,2087,9558,3566,844,10364
288696,10619,570,12500,11091,7551,3238,10616,12781,12829,11316


In [4]:
train_label_df

Unnamed: 0,session_id,yad_no
0,000007603d533d30453cc45d0f3d119f,4101
1,0000ca043ed437a1472c9d1d154eb49b,8253
2,0000d4835cf113316fe447e2f80ba1c8,4863
3,0000fcda1ae1b2f431e55a7075d1f500,1652
4,000104bdffaaad1a1e0a9ebacf585f33,96
...,...,...
288693,ffff2262d38abdeb247ebd591835dcc9,2259
288694,ffff2360540745117193ecadcdc06538,963
288695,ffff7fb4617164b2604aaf51c40bf82d,13719
288696,ffffcd5bc19d62cad5a3815c87818d83,10619


#### Trainに正例と負例のフラグを付与 & Predict列の数字を入れる

In [5]:
# Trainに正例と負例のフラグを付与

# 学習用データ
# 教師データに正例と負例のフラグを付与
train_target = []

# 各セッションに対して候補の宿をチェック
for index, row in tqdm(train_label_df.iterrows(), total=train_label_df.shape[0]):
    session_id = row['session_id']
    yad_no = row['yad_no']
    
    # candidate_train_df の該当行を取得（同じインデックスを仮定）
    candidate_row = candidate_train_df.iloc[index]

    # 予測候補の宿に対してチェック
    for col_index, col in enumerate(candidate_train_df.columns):
        yado_no = candidate_row[col]
        is_correct = (yado_no == yad_no)  # 一致するかどうかのチェック
        predict_no = col_index  # predict_列の数字を取得
        train_target.append([session_id, yado_no, is_correct, predict_no])

# 結果をDataFrameに変換
train_df = pd.DataFrame(train_target, columns=['session_id', 'yado_no', 'target', 'predict_no']) # 'predict_no' 列を追加
train_df['target'] = train_df['target'].astype(int)  # target列をint型に変換

# 最初の数行を表示して確認
print(train_df.head())

print('正例と負例の数を確認')
print(train_df['target'].value_counts())

100%|██████████| 288698/288698 [00:42<00:00, 6768.77it/s]


                         session_id  yado_no  target  predict_no
0  000007603d533d30453cc45d0f3d119f    11882       0           0
1  000007603d533d30453cc45d0f3d119f     2808       0           1
2  000007603d533d30453cc45d0f3d119f     5289       0           2
3  000007603d533d30453cc45d0f3d119f     4101       1           3
4  000007603d533d30453cc45d0f3d119f     3324       0           4
正例と負例の数を確認
0    2697380
1     189600
Name: target, dtype: int64


In [6]:
print(train_df['predict_no'].value_counts()) 

0    288698
1    288698
2    288698
3    288698
4    288698
5    288698
6    288698
7    288698
8    288698
9    288698
Name: predict_no, dtype: int64


#### 推論用データの作成

In [7]:
# 推論用データにダミーのyado_noを付与
test_session_df['yad_no'] = -1

test_target = []

# 各セッションに対して候補の宿をチェック
for index, row in tqdm(test_session_df.iterrows(), total=test_session_df.shape[0]):
    session_id = row['session_id']
    yad_no = row['yad_no']
    
    # candidate_test_df の該当行を取得（同じインデックスを仮定）
    candidate_row = candidate_test_df.iloc[index]

    # 予測候補の宿に対してチェック
    for col_index, col in enumerate(candidate_test_df.columns):
        yado_no = candidate_row[col]
        # 予測データでは、正解のyad_noがないため、常にFalseとなる
        is_correct = False  
        predict_no = col_index  # predict_列の数字を取得
        test_target.append([session_id, yado_no, is_correct, predict_no])

# 結果をDataFrameに変換
test_df = pd.DataFrame(test_target, columns=['session_id', 'yado_no', 'target', 'predict_no']) # 'predict_no' 列を追加
test_df['target'] = test_df['target'].astype(int)  # target列をint型に変換

# 最初の数行を表示して確認
print(test_df.head())

print('正例と負例の数を確認')
print(test_df['target'].value_counts()) # すべて0になるはず

100%|██████████| 174700/174700 [00:25<00:00, 6911.02it/s]


                         session_id  yado_no  target  predict_no
0  00001149e9c73985425197104712478c     3560       0           0
1  00001149e9c73985425197104712478c    11561       0           1
2  00001149e9c73985425197104712478c     4714       0           2
3  00001149e9c73985425197104712478c     2680       0           3
4  00001149e9c73985425197104712478c     4420       0           4
正例と負例の数を確認
0    1747000
Name: target, dtype: int64


In [8]:
print(test_df['predict_no'].value_counts())

0    174700
1    174700
2    174700
3    174700
4    174700
5    174700
6    174700
7    174700
8    174700
9    174700
Name: predict_no, dtype: int64


#### seq_noを追加

In [9]:
# seq_noをマージする

# train_log の session_id と yad_no の組み合わせに基づいて辞書を作成
seq_no_dict_train = dict(zip(zip(train_log_df['session_id'], train_log_df['yad_no']), train_log_df['seq_no']))
# train_df に seq_no 列を追加（tqdm で進捗表示）
train_df['seq_no'] = [seq_no_dict_train.get((row['session_id'], row['yado_no']), -1) for row in tqdm(train_df.to_dict('records'))]
# 結果の確認
print(train_df.head())


# test_log の session_id と yad_no の組み合わせに基づいて辞書を作成
seq_no_dict_test = dict(zip(zip(test_log_df['session_id'], test_log_df['yad_no']), test_log_df['seq_no']))
# test_df に seq_no 列を追加（tqdm で進捗表示）
test_df['seq_no'] = [seq_no_dict_test.get((row['session_id'], row['yado_no']), -1) for row in tqdm(test_df.to_dict('records'))]
# 結果の確認
print(test_df.head())

100%|██████████| 2886980/2886980 [00:01<00:00, 2383068.55it/s]


                         session_id  yado_no  target  predict_no  seq_no
0  000007603d533d30453cc45d0f3d119f    11882       0           0      -1
1  000007603d533d30453cc45d0f3d119f     2808       0           1      -1
2  000007603d533d30453cc45d0f3d119f     5289       0           2      -1
3  000007603d533d30453cc45d0f3d119f     4101       1           3      -1
4  000007603d533d30453cc45d0f3d119f     3324       0           4      -1


100%|██████████| 1747000/1747000 [00:00<00:00, 2521345.68it/s]


                         session_id  yado_no  target  predict_no  seq_no
0  00001149e9c73985425197104712478c     3560       0           0       0
1  00001149e9c73985425197104712478c    11561       0           1      -1
2  00001149e9c73985425197104712478c     4714       0           2      -1
3  00001149e9c73985425197104712478c     2680       0           3      -1
4  00001149e9c73985425197104712478c     4420       0           4      -1


In [10]:
train_df.head()

Unnamed: 0,session_id,yado_no,target,predict_no,seq_no
0,000007603d533d30453cc45d0f3d119f,11882,0,0,-1
1,000007603d533d30453cc45d0f3d119f,2808,0,1,-1
2,000007603d533d30453cc45d0f3d119f,5289,0,2,-1
3,000007603d533d30453cc45d0f3d119f,4101,1,3,-1
4,000007603d533d30453cc45d0f3d119f,3324,0,4,-1


In [11]:
train_df[train_df['session_id'] == '0007dd71a9a78c567084374a66e38139']

Unnamed: 0,session_id,yado_no,target,predict_no,seq_no
360,0007dd71a9a78c567084374a66e38139,2927,1,0,4
361,0007dd71a9a78c567084374a66e38139,6199,0,1,-1
362,0007dd71a9a78c567084374a66e38139,12089,0,2,-1
363,0007dd71a9a78c567084374a66e38139,12425,0,3,-1
364,0007dd71a9a78c567084374a66e38139,13386,0,4,-1
365,0007dd71a9a78c567084374a66e38139,11850,0,5,-1
366,0007dd71a9a78c567084374a66e38139,9137,0,6,-1
367,0007dd71a9a78c567084374a66e38139,12986,0,7,-1
368,0007dd71a9a78c567084374a66e38139,2452,0,8,-1
369,0007dd71a9a78c567084374a66e38139,2318,0,9,-1


In [12]:
train_label_df[train_label_df['session_id'] == '0007dd71a9a78c567084374a66e38139']

Unnamed: 0,session_id,yad_no
36,0007dd71a9a78c567084374a66e38139,2927


In [13]:
# 複数あると一番番号が高い4の判定になる
train_log_df[train_log_df['session_id'] == '0007dd71a9a78c567084374a66e38139']

Unnamed: 0,session_id,seq_no,yad_no
45,0007dd71a9a78c567084374a66e38139,0,2927
46,0007dd71a9a78c567084374a66e38139,1,11037
47,0007dd71a9a78c567084374a66e38139,2,2927
48,0007dd71a9a78c567084374a66e38139,3,11037
49,0007dd71a9a78c567084374a66e38139,4,2927
50,0007dd71a9a78c567084374a66e38139,5,11037


In [14]:
#　※複数のseq_noがある場合は、0も混入する
print(train_df['seq_no'].value_counts())
print(test_df['seq_no'].value_counts())

-1    2777527
 0      88128
 1      16052
 2       4125
 3        837
 4        223
 5         65
 6         18
 7          4
 8          1
Name: seq_no, dtype: int64
-1    1683604
 0      51884
 1       8692
 2       2239
 3        428
 4        124
 5         22
 6          7
Name: seq_no, dtype: int64


#### 候補の宿がsession中に閲覧されていたかどうか

In [15]:
# 'seq_no'が−1でなければ1のフラグを立てる
train_df['seen_yad'] = train_df['seq_no'].apply(lambda x: 1 if x != -1 else 0)
test_df['seen_yad'] = test_df['seq_no'].apply(lambda x: 1 if x != -1 else 0)

In [16]:
print(train_df['seen_yad'].value_counts())  
print(test_df['seen_yad'].value_counts())

0    2777527
1     109453
Name: seen_yad, dtype: int64
0    1683604
1      63396
Name: seen_yad, dtype: int64


In [17]:
train_df[train_df['seen_yad']==1].target.value_counts()

1    87693
0    21760
Name: target, dtype: int64

#### 各sessionにおける最大seq_no

In [18]:
# 各セッションの seq_no の最大値を計算
max_seq_no_train = train_log_df.groupby('session_id')['seq_no'].max().reset_index()
max_seq_no_train.rename(columns={'seq_no': 'max_seq_no'}, inplace=True)
# teacher_df と max_seq_no のマージ
train_df = train_df.merge(max_seq_no_train, on='session_id', how='left')
# 結果の確認
print(train_df.head())


# 各セッションの seq_no の最大値を計算
max_seq_no_test = test_log_df.groupby('session_id')['seq_no'].max().reset_index()
max_seq_no_test.rename(columns={'seq_no': 'max_seq_no'}, inplace=True)
# teacher_df と max_seq_no のマージ
test_df = test_df.merge(max_seq_no_test, on='session_id', how='left')
# 結果の確認
print(test_df.head())

                         session_id  yado_no  target  predict_no  seq_no  \
0  000007603d533d30453cc45d0f3d119f    11882       0           0      -1   
1  000007603d533d30453cc45d0f3d119f     2808       0           1      -1   
2  000007603d533d30453cc45d0f3d119f     5289       0           2      -1   
3  000007603d533d30453cc45d0f3d119f     4101       1           3      -1   
4  000007603d533d30453cc45d0f3d119f     3324       0           4      -1   

   seen_yad  max_seq_no  
0         0           0  
1         0           0  
2         0           0  
3         0           0  
4         0           0  
                         session_id  yado_no  target  predict_no  seq_no  \
0  00001149e9c73985425197104712478c     3560       0           0       0   
1  00001149e9c73985425197104712478c    11561       0           1      -1   
2  00001149e9c73985425197104712478c     4714       0           2      -1   
3  00001149e9c73985425197104712478c     2680       0           3      -1   
4  0000

In [21]:
print(train_df['max_seq_no'].value_counts())
print(test_df['max_seq_no'].value_counts())

0    1853860
1     827930
2     153500
3      40250
4       8330
5       2230
6        650
7        180
8         40
9         10
Name: max_seq_no, dtype: int64
0    1139400
1     494930
2      84590
3      22270
4       4280
5       1240
6        220
7         70
Name: max_seq_no, dtype: int64


#### 差分の考慮：max_seq_noから(-1以外の要素)でseq_noを引く

In [22]:
# 最初に全ての diff_seq_no を -1 に設定
train_df['diff_seq_no'] = -1
test_df['diff_seq_no'] = -1

# seq_no が -1 以外の行にのみ max_seq_no - seq_no の計算を適用
train_df.loc[train_df['seq_no'] != -1, 'diff_seq_no'] = train_df['max_seq_no'] - train_df['seq_no']
test_df.loc[test_df['seq_no'] != -1, 'diff_seq_no'] = test_df['max_seq_no'] - test_df['seq_no']

# 結果の確認
print(train_df.head())
print(test_df.head())

                         session_id  yado_no  target  predict_no  seq_no  \
0  000007603d533d30453cc45d0f3d119f    11882       0           0      -1   
1  000007603d533d30453cc45d0f3d119f     2808       0           1      -1   
2  000007603d533d30453cc45d0f3d119f     5289       0           2      -1   
3  000007603d533d30453cc45d0f3d119f     4101       1           3      -1   
4  000007603d533d30453cc45d0f3d119f     3324       0           4      -1   

   seen_yad  max_seq_no  diff_seq_no  
0         0           0           -1  
1         0           0           -1  
2         0           0           -1  
3         0           0           -1  
4         0           0           -1  
                         session_id  yado_no  target  predict_no  seq_no  \
0  00001149e9c73985425197104712478c     3560       0           0       0   
1  00001149e9c73985425197104712478c    11561       0           1      -1   
2  00001149e9c73985425197104712478c     4714       0           2      -1   
3  00

In [23]:
print(train_df['diff_seq_no'].value_counts())
print(test_df['diff_seq_no'].value_counts())

-1    2777527
 1     103312
 2       5684
 3        407
 4         49
 5          1
Name: diff_seq_no, dtype: int64
-1    1683604
 1      60760
 2       2528
 3        106
 4          2
Name: diff_seq_no, dtype: int64


#### diff_seq_no が奇数かどうかの判定 

In [24]:
# diff_seq_no が奇数かどうかの判定（seq_no が -1 の場合は除外）
train_df['is_odd'] = np.where(train_df['seq_no'] != -1, train_df['diff_seq_no'] % 2 == 1, -1)
test_df['is_odd'] = np.where(test_df['seq_no'] != -1, test_df['diff_seq_no'] % 2 == 1, -1)

# seq_no が -1 以外の場合、Trueを0、Falseを1に変換（すでに -1 の場合は変更しない）
train_df['is_odd'] = np.where(train_df['is_odd'] != -1, np.where(train_df['is_odd'], 0, 1), -1)
test_df['is_odd'] = np.where(test_df['is_odd'] != -1, np.where(test_df['is_odd'], 0, 1), -1)

# 結果の確認
print(train_df.head())
print(test_df.head())

                         session_id  yado_no  target  predict_no  seq_no  \
0  000007603d533d30453cc45d0f3d119f    11882       0           0      -1   
1  000007603d533d30453cc45d0f3d119f     2808       0           1      -1   
2  000007603d533d30453cc45d0f3d119f     5289       0           2      -1   
3  000007603d533d30453cc45d0f3d119f     4101       1           3      -1   
4  000007603d533d30453cc45d0f3d119f     3324       0           4      -1   

   seen_yad  max_seq_no  diff_seq_no  is_odd  
0         0           0           -1      -1  
1         0           0           -1      -1  
2         0           0           -1      -1  
3         0           0           -1      -1  
4         0           0           -1      -1  
                         session_id  yado_no  target  predict_no  seq_no  \
0  00001149e9c73985425197104712478c     3560       0           0       0   
1  00001149e9c73985425197104712478c    11561       0           1      -1   
2  00001149e9c7398542519710471247

In [27]:
print(train_df['is_odd'].value_counts())
print(test_df['is_odd'].value_counts())

-1    2777527
 0     103720
 1       5733
Name: is_odd, dtype: int64
-1    1683604
 0      60866
 1       2530
Name: is_odd, dtype: int64


#### 各sessionにおいて2回以上出現したyad_noがあれば1のフラグを立てる

In [28]:
# 各セッションにおいて2回以上登場する yad_no を特定
yad_no_counts = train_log_df.groupby(['session_id', 'yad_no']).size().reset_index(name='count')
multiple_visits = yad_no_counts[yad_no_counts['count'] > 1]

# (session_id, yad_no) のタプルをキーとして辞書を作成
multiple_visits_dict = {(row['session_id'], row['yad_no']): 1 for _, row in multiple_visits.iterrows()}

# teacher_df に multiple_visits_flag 列を追加（tqdm で進捗表示）
train_df['multiple_visits_flag'] = [multiple_visits_dict.get((row['session_id'], row['yado_no']), 0) 
                                      for row in tqdm(train_df.to_dict('records'))]

# 結果の確認
print(train_df.head())



# 各セッションにおいて2回以上登場する yad_no を特定
yad_no_counts = test_log_df.groupby(['session_id', 'yad_no']).size().reset_index(name='count')
multiple_visits = yad_no_counts[yad_no_counts['count'] > 1]

# (session_id, yad_no) のタプルをキーとして辞書を作成
multiple_visits_dict = {(row['session_id'], row['yad_no']): 1 for _, row in multiple_visits.iterrows()}

# teacher_df に multiple_visits_flag 列を追加（tqdm で進捗表示）
test_df['multiple_visits_flag'] = [multiple_visits_dict.get((row['session_id'], row['yado_no']), 0) 
                                      for row in tqdm(test_df.to_dict('records'))]

# 結果の確認
print(test_df.head())

100%|██████████| 2886980/2886980 [00:00<00:00, 2999906.79it/s]


                         session_id  yado_no  target  predict_no  seq_no  \
0  000007603d533d30453cc45d0f3d119f    11882       0           0      -1   
1  000007603d533d30453cc45d0f3d119f     2808       0           1      -1   
2  000007603d533d30453cc45d0f3d119f     5289       0           2      -1   
3  000007603d533d30453cc45d0f3d119f     4101       1           3      -1   
4  000007603d533d30453cc45d0f3d119f     3324       0           4      -1   

   seen_yad  max_seq_no  diff_seq_no  is_odd  multiple_visits_flag  
0         0           0           -1      -1                     0  
1         0           0           -1      -1                     0  
2         0           0           -1      -1                     0  
3         0           0           -1      -1                     0  
4         0           0           -1      -1                     0  


100%|██████████| 1747000/1747000 [00:00<00:00, 3148870.11it/s]


                         session_id  yado_no  target  predict_no  seq_no  \
0  00001149e9c73985425197104712478c     3560       0           0       0   
1  00001149e9c73985425197104712478c    11561       0           1      -1   
2  00001149e9c73985425197104712478c     4714       0           2      -1   
3  00001149e9c73985425197104712478c     2680       0           3      -1   
4  00001149e9c73985425197104712478c     4420       0           4      -1   

   seen_yad  max_seq_no  diff_seq_no  is_odd  multiple_visits_flag  
0         1           1            1       0                     0  
1         0           1           -1      -1                     0  
2         0           1           -1      -1                     0  
3         0           1           -1      -1                     0  
4         0           1           -1      -1                     0  


In [29]:
print(train_df['multiple_visits_flag'].value_counts())
print(test_df['multiple_visits_flag'].value_counts())

0    2882332
1       4648
Name: multiple_visits_flag, dtype: int64
0    1744359
1       2641
Name: multiple_visits_flag, dtype: int64


### yado_dfに関する特徴量エンジニアリング

In [33]:
yado_df

Unnamed: 0,yad_no,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,wid_cd,ken_cd,lrg_cd,sml_cd,popularity,overall_rank,wid_cd_rank,ken_cd_rank,lrg_cd_rank,sml_cd_rank
0,1,0,129.0,1.0,0,1.0,,,1.0,f0112abf369fb03cdc5f5309300913da,072c85e1653e10c9c7dd065ad007125a,449c52ef581d5f9ef311189469a0520e,677a32689cd1ad74e867f1fbe43a3e1c,30.0,6251.5,840.0,361.0,117.0,44.0
1,2,0,23.0,1.0,0,,,,,d86102dd9c232bade9a97dccad40df48,b4d2fb4e51ea7bca80eb1270aa474a54,5c9a8f48e9df0234da012747a02d4b29,4ee16ee838dd2703cc9a1d5a535f0ced,29.0,6365.5,406.0,127.0,29.0,29.0
2,3,0,167.0,1.0,1,1.0,,,1.0,46e33861f921c3e38b81998fbf283f01,572d60f0f5212aacda515ebf81fb0a3a,8a623b960557e87bd1f4edf71b6255be,ab9480fd72a44d51690ab16c4ad4d49c,210.0,369.5,121.0,32.0,31.0,1.0
3,4,0,144.0,1.0,0,1.0,,,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,52c9ea83f2cfe92be54cb6bc961edf21,1cc3e1838bb0fd0fde0396130b1f82b9,67.0,3274.0,760.0,444.0,15.0,7.0
4,5,0,41.0,1.0,1,,,,,43875109d1dab93592812c50d18270a7,75617bb07a2785a948ab1958909211f1,9ea5a911019b66ccd42f556c42a2fe2f,be1b876af18afc4deeb3081591d2a910,30.0,6251.5,238.0,59.0,13.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13801,13802,0,10.0,1.0,1,,,,,c312e07b7a5d456d53a5b00910a336e1,558ac1909f0318b82c621ab250329d6d,80fb3c5ad0c89931d0923e9f80885218,5eb30820716082c720836733d73c605e,,,,,,
13802,13803,0,,,0,1.0,,,1.0,dc414a17890cfc17d011d5038b88ca93,d78f53d0856617bc782f02c3280dfef2,e5cfcc0a43c82072aca11628ff0add53,20ad8785a30f125bee5a8a325782ab06,39.0,5314.5,657.0,220.0,142.0,14.0
13803,13804,0,80.0,1.0,1,,1.0,,1.0,d86102dd9c232bade9a97dccad40df48,7d76599bd27ff9e7823b2b1323ca763e,c5fe8848b6ab39b040cdb3668aea9433,b3eab50ccf6ffb51c37d36ee384abfbf,90.0,2271.0,128.0,13.0,6.0,6.0
13804,13805,0,8.0,1.0,1,,,,1.0,3300cf6f774b7c6a5807110f244cbc21,689cf8289e7ea0b2eef1b017dcdfe8de,8b712435430a6875839a6c3b5a40b008,2b4165444a777465576b25f65697d739,4.0,11568.0,673.0,169.0,32.0,5.0


In [31]:
# train_log_df と test_log_df を結合
log_df = pd.concat([train_log_df, test_log_df], axis=0, ignore_index=True)
# yado_df と結合
merged_df = log_df.merge(yado_df, on='yad_no', how='left')


# 各宿の人気度を計算（例：訪問回数で計算）
yad_popularity = merged_df['yad_no'].value_counts().reset_index()
yad_popularity.columns = ['yad_no', 'popularity']

# 全体での人気度ランキング
yad_popularity['overall_rank'] = yad_popularity['popularity'].rank(ascending=False)
# yado_df に人気度をマージ
yado_df = yado_df.merge(yad_popularity, on='yad_no', how='left')
# エリアごとの人気度ランキングを計算
for area in ['wid_cd', 'ken_cd', 'lrg_cd', 'sml_cd']:
    yado_df[f'{area}_rank'] = yado_df.groupby(area)['popularity'].rank(ascending=False, method='min')

# 結果の確認
print(yado_df.head())

   yad_no  yad_type  total_room_cnt  wireless_lan_flg  onsen_flg  kd_stn_5min  \
0       1         0           129.0               1.0          0          1.0   
1       2         0            23.0               1.0          0          NaN   
2       3         0           167.0               1.0          1          1.0   
3       4         0           144.0               1.0          0          1.0   
4       5         0            41.0               1.0          1          NaN   

   kd_bch_5min  kd_slp_5min  kd_conv_walk_5min  \
0          NaN          NaN                1.0   
1          NaN          NaN                NaN   
2          NaN          NaN                1.0   
3          NaN          NaN                1.0   
4          NaN          NaN                NaN   

                             wid_cd                            ken_cd  \
0  f0112abf369fb03cdc5f5309300913da  072c85e1653e10c9c7dd065ad007125a   
1  d86102dd9c232bade9a97dccad40df48  b4d2fb4e51ea7bca80eb1270aa474

### seen_yado == 0のみに絞る

In [32]:
# train_dfとtest_dfからseen_yadが1の行のみ抽出
train_df = train_df[train_df['seen_yad'] == 0].reset_index(drop=True)
test_df = test_df[test_df['seen_yad'] == 0].reset_index(drop=True) 

In [36]:
train_df

Unnamed: 0,session_id,yado_no,target,predict_no,seq_no,seen_yad,max_seq_no,diff_seq_no,is_odd,multiple_visits_flag
0,000007603d533d30453cc45d0f3d119f,11882,0,0,-1,0,0,-1,-1,0
1,000007603d533d30453cc45d0f3d119f,2808,0,1,-1,0,0,-1,-1,0
2,000007603d533d30453cc45d0f3d119f,5289,0,2,-1,0,0,-1,-1,0
3,000007603d533d30453cc45d0f3d119f,4101,1,3,-1,0,0,-1,-1,0
4,000007603d533d30453cc45d0f3d119f,3324,0,4,-1,0,0,-1,-1,0
...,...,...,...,...,...,...,...,...,...,...
2777522,fffffa7baf370083ebcdd98f26a7e31a,10439,0,5,-1,0,1,-1,-1,0
2777523,fffffa7baf370083ebcdd98f26a7e31a,9624,0,6,-1,0,1,-1,-1,0
2777524,fffffa7baf370083ebcdd98f26a7e31a,10415,0,7,-1,0,1,-1,-1,0
2777525,fffffa7baf370083ebcdd98f26a7e31a,6579,0,8,-1,0,1,-1,-1,0


In [37]:
test_df

Unnamed: 0,session_id,yado_no,target,predict_no,seq_no,seen_yad,max_seq_no,diff_seq_no,is_odd,multiple_visits_flag
0,00001149e9c73985425197104712478c,11561,0,1,-1,0,1,-1,-1,0
1,00001149e9c73985425197104712478c,4714,0,2,-1,0,1,-1,-1,0
2,00001149e9c73985425197104712478c,2680,0,3,-1,0,1,-1,-1,0
3,00001149e9c73985425197104712478c,4420,0,4,-1,0,1,-1,-1,0
4,00001149e9c73985425197104712478c,5466,0,5,-1,0,1,-1,-1,0
...,...,...,...,...,...,...,...,...,...,...
1683599,ffffe984aafd6127ce8e43e3ca40c79d,5623,0,5,-1,0,0,-1,-1,0
1683600,ffffe984aafd6127ce8e43e3ca40c79d,3781,0,6,-1,0,0,-1,-1,0
1683601,ffffe984aafd6127ce8e43e3ca40c79d,11994,0,7,-1,0,0,-1,-1,0
1683602,ffffe984aafd6127ce8e43e3ca40c79d,634,0,8,-1,0,0,-1,-1,0


In [38]:
#　train_dfとtest_dfにyado_dfをマージ
train_df = pd.merge(train_df, yado_df.rename(columns={'yad_no': 'yado_no'}), on='yado_no', how='left')
test_df = pd.merge(test_df, yado_df.rename(columns={'yad_no': 'yado_no'}), on='yado_no', how='left')

In [39]:
train_df

Unnamed: 0,session_id,yado_no,target,predict_no,seq_no,seen_yad,max_seq_no,diff_seq_no,is_odd,multiple_visits_flag,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,wid_cd,ken_cd,lrg_cd,sml_cd,popularity,overall_rank,wid_cd_rank,ken_cd_rank,lrg_cd_rank,sml_cd_rank
0,000007603d533d30453cc45d0f3d119f,11882,0,0,-1,0,0,-1,-1,0,0,113.0,1.0,0,1.0,,,1.0,dc414a17890cfc17d011d5038b88ca93,d78f53d0856617bc782f02c3280dfef2,4fd631b15116098340cdb099c86a5a40,4044dac1931ddaa5a967e09506d76343,23.0,7189.5,891.0,286.0,25.0,21.0
1,000007603d533d30453cc45d0f3d119f,2808,0,1,-1,0,0,-1,-1,0,0,128.0,1.0,0,1.0,,,,dc414a17890cfc17d011d5038b88ca93,d78f53d0856617bc782f02c3280dfef2,4fd631b15116098340cdb099c86a5a40,4044dac1931ddaa5a967e09506d76343,36.0,5594.5,687.0,229.0,17.0,17.0
2,000007603d533d30453cc45d0f3d119f,5289,0,2,-1,0,0,-1,-1,0,0,66.0,1.0,0,1.0,,,1.0,dc414a17890cfc17d011d5038b88ca93,d78f53d0856617bc782f02c3280dfef2,4fd631b15116098340cdb099c86a5a40,4044dac1931ddaa5a967e09506d76343,14.0,8690.5,1073.0,334.0,37.0,33.0
3,000007603d533d30453cc45d0f3d119f,4101,1,3,-1,0,0,-1,-1,0,0,39.0,,0,,,,1.0,dc414a17890cfc17d011d5038b88ca93,d78f53d0856617bc782f02c3280dfef2,4fd631b15116098340cdb099c86a5a40,4044dac1931ddaa5a967e09506d76343,24.0,7045.0,874.0,280.0,23.0,19.0
4,000007603d533d30453cc45d0f3d119f,3324,0,4,-1,0,0,-1,-1,0,0,53.0,1.0,0,,,,1.0,dc414a17890cfc17d011d5038b88ca93,d78f53d0856617bc782f02c3280dfef2,4fd631b15116098340cdb099c86a5a40,4044dac1931ddaa5a967e09506d76343,15.0,8495.0,1053.0,330.0,35.0,31.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2777522,fffffa7baf370083ebcdd98f26a7e31a,10439,0,5,-1,0,1,-1,-1,0,0,174.0,1.0,0,,,,1.0,46e33861f921c3e38b81998fbf283f01,572d60f0f5212aacda515ebf81fb0a3a,8a623b960557e87bd1f4edf71b6255be,ab9480fd72a44d51690ab16c4ad4d49c,63.0,3503.5,799.0,114.0,52.0,7.0
2777523,fffffa7baf370083ebcdd98f26a7e31a,9624,0,6,-1,0,1,-1,-1,0,0,42.0,1.0,0,,,,1.0,46e33861f921c3e38b81998fbf283f01,572d60f0f5212aacda515ebf81fb0a3a,8a623b960557e87bd1f4edf71b6255be,ab9480fd72a44d51690ab16c4ad4d49c,78.0,2747.5,657.0,96.0,49.0,6.0
2777524,fffffa7baf370083ebcdd98f26a7e31a,10415,0,7,-1,0,1,-1,-1,0,0,114.0,1.0,0,1.0,,,1.0,46e33861f921c3e38b81998fbf283f01,572d60f0f5212aacda515ebf81fb0a3a,8a623b960557e87bd1f4edf71b6255be,ab9480fd72a44d51690ab16c4ad4d49c,61.0,3635.0,832.0,124.0,55.0,9.0
2777525,fffffa7baf370083ebcdd98f26a7e31a,6579,0,8,-1,0,1,-1,-1,0,0,128.0,,0,,,,1.0,46e33861f921c3e38b81998fbf283f01,572d60f0f5212aacda515ebf81fb0a3a,8a623b960557e87bd1f4edf71b6255be,ab9480fd72a44d51690ab16c4ad4d49c,62.0,3566.0,814.0,118.0,53.0,8.0


#### 各種Encoding

In [40]:
# カテゴリ変数と数値変数の明確化
cat_cols = ['wid_cd', 'ken_cd', 'lrg_cd', 'sml_cd']

# 欠損値を-1で埋める
train_df.fillna(-1, inplace=True)
test_df.fillna(-1, inplace=True)

In [41]:
# Label Encoding

# testにしか存在しないものはNullにするので、trainのみをencoderにする
for col in cat_cols:
    encoder = {c: i for i, c in enumerate(train_df[col].unique())}
    train_df[f'label_{col}'] = train_df[col].map(encoder)
    test_df[f'label_{col}'] = test_df[col].map(encoder)

In [42]:
# Count Encoding（trainとtestをマージした方が良い？）←　した
# 増やした方が良い？

cat_cols = ['wid_cd', 'ken_cd', 'lrg_cd', 'sml_cd'] + ['yado_no']

# testにしか存在しないものはNullにするので、trainのみをencoderにする
for col in cat_cols:
    encoder = train_df[col].value_counts()
    train_df[f'count_{col}'] = train_df[col].map(encoder)
    test_df[f'count_{col}'] = test_df[col].map(encoder)

In [43]:
# Target encoding
# 元の列順が保持されないから

cat_cols = ['wid_cd', 'ken_cd', 'lrg_cd', 'sml_cd'] + ['yado_no']

# GroupKFold の設定  StratifiedGrouoKFoldでもいいかもしれない
gkf = GroupKFold(n_splits=5)

# fold 列を初期化
train_df['fold'] = -1

# 各 Fold に対してインデックスを割り当て
for fold, (train_idx, val_idx) in enumerate(gkf.split(train_df, groups=train_df['session_id'])):
    # バリデーションセットのインデックスに Fold 番号を割り当て
    train_df.loc[val_idx, 'fold'] = fold
# 結果の確認
print(train_df.head())



# TargetEncodingが完了した検証用データセットを格納するリスト
encoded_dfs = []

# すべての分割についてのループ
for fold in range(5):

  # 学習用と検証用データセットに分割する
  df_train = train_df[train_df.fold != fold].reset_index(drop=True)
  df_valid = train_df[train_df.fold == fold].reset_index(drop=True)
  
  # すべてのカテゴリについてのループ
  for column in cat_cols:
    # カテゴリごとの目的変数の統計量についての辞書を作成
    stats = df_train.groupby(column)['target'].agg(['mean', 'var'])
    
    # 平均、分散、最大値、最小値に対する列を作成
    for stat in ['mean', 'var']:
      mapping_dict = dict(stats[stat])
      df_valid.loc[:, f'TE_{column}_{stat}'] = df_valid[column].map(mapping_dict)

  # リストに格納
  encoded_dfs.append(df_valid)

# 結合したデータセットを返す
encoded_df = pd.concat(encoded_dfs, axis=0)


                         session_id  yado_no  target  predict_no  seq_no  \
0  000007603d533d30453cc45d0f3d119f    11882       0           0      -1   
1  000007603d533d30453cc45d0f3d119f     2808       0           1      -1   
2  000007603d533d30453cc45d0f3d119f     5289       0           2      -1   
3  000007603d533d30453cc45d0f3d119f     4101       1           3      -1   
4  000007603d533d30453cc45d0f3d119f     3324       0           4      -1   

   seen_yad  max_seq_no  diff_seq_no  is_odd  multiple_visits_flag  yad_type  \
0         0           0           -1      -1                     0         0   
1         0           0           -1      -1                     0         0   
2         0           0           -1      -1                     0         0   
3         0           0           -1      -1                     0         0   
4         0           0           -1      -1                     0         0   

   total_room_cnt  wireless_lan_flg  onsen_flg  kd_stn_5min  k

In [44]:
train_df['fold'].value_counts()

0    555508
1    555508
4    555507
3    555502
2    555502
Name: fold, dtype: int64

In [45]:
encoded_df['fold'].value_counts()

0    555508
1    555508
4    555507
2    555502
3    555502
Name: fold, dtype: int64

In [46]:
train_df = encoded_df

In [47]:
# Testデータに対しても同様にTargetEncodingを行う
cat_cols = ['wid_cd', 'ken_cd', 'lrg_cd', 'sml_cd'] + ['yado_no']

# すべてのカテゴリについてのループ
# train_df全体で計算する
for column in cat_cols:
    # カテゴリごとの目的変数の統計量についての辞書を作成
    stats = train_df.groupby(column)['target'].agg(['mean', 'var'])
    
    # 平均、分散、最大値、最小値に対する列を作成
    for stat in ['mean', 'var']:
        mapping_dict = dict(stats[stat])
        test_df.loc[:, f'TE_{column}_{stat}'] = test_df[column].map(mapping_dict)


In [48]:
test_df

Unnamed: 0,session_id,yado_no,target,predict_no,seq_no,seen_yad,max_seq_no,diff_seq_no,is_odd,multiple_visits_flag,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,wid_cd,ken_cd,lrg_cd,sml_cd,popularity,overall_rank,wid_cd_rank,ken_cd_rank,lrg_cd_rank,sml_cd_rank,label_wid_cd,label_ken_cd,label_lrg_cd,label_sml_cd,count_wid_cd,count_ken_cd,count_lrg_cd,count_sml_cd,count_yado_no,TE_wid_cd_mean,TE_wid_cd_var,TE_ken_cd_mean,TE_ken_cd_var,TE_lrg_cd_mean,TE_lrg_cd_var,TE_sml_cd_mean,TE_sml_cd_var,TE_yado_no_mean,TE_yado_no_var
0,00001149e9c73985425197104712478c,11561,0,1,-1,0,1,-1,-1,0,0,195.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,52ca3d2824fc3cc90bd4274423badeed,87d9490219b3778f73c41b8176cf30d0,195.0,446.0,138.0,59.0,5.0,1.0,3,5,86,133,667202,370677,21972,8813,798.0,0.032047,0.03102,0.029681,0.0288,0.033179,0.032079,0.032339,0.031296,0.032581,0.031559
1,00001149e9c73985425197104712478c,4714,0,2,-1,0,1,-1,-1,0,0,58.0,1.0,0,-1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,52ca3d2824fc3cc90bd4274423badeed,87d9490219b3778f73c41b8176cf30d0,83.0,2535.5,612.0,355.0,24.0,11.0,3,5,86,133,667202,370677,21972,8813,368.0,0.032047,0.03102,0.029681,0.0288,0.033179,0.032079,0.032339,0.031296,0.027174,0.026508
2,00001149e9c73985425197104712478c,2680,0,3,-1,0,1,-1,-1,0,0,150.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,52ca3d2824fc3cc90bd4274423badeed,87d9490219b3778f73c41b8176cf30d0,137.0,1056.0,282.0,140.0,11.0,4.0,3,5,86,133,667202,370677,21972,8813,758.0,0.032047,0.03102,0.029681,0.0288,0.033179,0.032079,0.032339,0.031296,0.042216,0.040488
3,00001149e9c73985425197104712478c,4420,0,4,-1,0,1,-1,-1,0,0,124.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,52ca3d2824fc3cc90bd4274423badeed,87d9490219b3778f73c41b8176cf30d0,67.0,3274.0,760.0,444.0,29.0,16.0,3,5,86,133,667202,370677,21972,8813,259.0,0.032047,0.03102,0.029681,0.0288,0.033179,0.032079,0.032339,0.031296,0.034749,0.033672
4,00001149e9c73985425197104712478c,5466,0,5,-1,0,1,-1,-1,0,0,130.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,52ca3d2824fc3cc90bd4274423badeed,87d9490219b3778f73c41b8176cf30d0,126.0,1266.5,334.0,174.0,14.0,5.0,3,5,86,133,667202,370677,21972,8813,717.0,0.032047,0.03102,0.029681,0.0288,0.033179,0.032079,0.032339,0.031296,0.033473,0.032398
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1683599,ffffe984aafd6127ce8e43e3ca40c79d,5623,0,5,-1,0,0,-1,-1,0,0,178.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,c9d5e891463e5389c42d16f987ed30bd,7cf2b4f31fb20747f89e58981e6d9fc1,74.0,2931.5,690.0,402.0,34.0,11.0,3,5,79,123,667202,370677,35187,13415,613.0,0.032047,0.03102,0.029681,0.0288,0.039276,0.037734,0.038241,0.036781,0.035889,0.034658
1683600,ffffe984aafd6127ce8e43e3ca40c79d,3781,0,6,-1,0,0,-1,-1,0,0,245.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,c9d5e891463e5389c42d16f987ed30bd,9ccc341413e935a914a1ded367b8f80e,183.0,514.0,149.0,65.0,2.0,1.0,3,5,79,122,667202,370677,35187,14504,939.0,0.032047,0.03102,0.029681,0.0288,0.039276,0.037734,0.040265,0.038646,0.028754,0.027957
1683601,ffffe984aafd6127ce8e43e3ca40c79d,11994,0,7,-1,0,0,-1,-1,0,0,334.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,c9d5e891463e5389c42d16f987ed30bd,372d41b6f39f1f523d0841fd9b84ae44,135.0,1096.5,295.0,147.0,8.0,3.0,3,5,79,220,667202,370677,35187,5028,491.0,0.032047,0.03102,0.029681,0.0288,0.039276,0.037734,0.034606,0.033415,0.034623,0.033493
1683602,ffffe984aafd6127ce8e43e3ca40c79d,634,0,8,-1,0,0,-1,-1,0,0,174.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,c9d5e891463e5389c42d16f987ed30bd,7cf2b4f31fb20747f89e58981e6d9fc1,62.0,3566.0,814.0,477.0,46.0,14.0,3,5,79,123,667202,370677,35187,13415,323.0,0.032047,0.03102,0.029681,0.0288,0.039276,0.037734,0.038241,0.036781,0.046440,0.044421


In [49]:
all_features = train_df.columns.to_list()
features= [x for x in all_features if x not in ('session_id', 'fold', 'target', 'wid_cd', 'ken_cd', 'lrg_cd', 'sml_cd')]

In [50]:
train_df[features]

Unnamed: 0,yado_no,predict_no,seq_no,seen_yad,max_seq_no,diff_seq_no,is_odd,multiple_visits_flag,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,popularity,overall_rank,wid_cd_rank,ken_cd_rank,lrg_cd_rank,sml_cd_rank,label_wid_cd,label_ken_cd,label_lrg_cd,label_sml_cd,count_wid_cd,count_ken_cd,count_lrg_cd,count_sml_cd,count_yado_no,TE_wid_cd_mean,TE_wid_cd_var,TE_ken_cd_mean,TE_ken_cd_var,TE_lrg_cd_mean,TE_lrg_cd_var,TE_sml_cd_mean,TE_sml_cd_var,TE_yado_no_mean,TE_yado_no_var
0,3894,1,-1,0,1,-1,-1,0,0,205.0,1.0,0,-1.0,-1.0,-1.0,1.0,212.0,364.0,40.0,3.0,1.0,1.0,4,6,8,11,335580,36215,13849,13529,881,0.034651,0.033451,0.046152,0.044024,0.038851,0.037345,0.037307,0.035919,0.037868,0.036485
1,7749,2,-1,0,1,-1,-1,0,0,76.0,1.0,0,-1.0,-1.0,-1.0,1.0,150.0,845.5,113.0,10.0,6.0,6.0,4,6,8,11,335580,36215,13849,13529,785,0.034651,0.033451,0.046152,0.044024,0.038851,0.037345,0.037307,0.035919,0.041860,0.040170
2,902,3,-1,0,1,-1,-1,0,0,240.0,1.0,0,-1.0,-1.0,-1.0,1.0,154.0,785.5,103.0,9.0,5.0,5.0,4,6,8,11,335580,36215,13849,13529,675,0.034651,0.033451,0.046152,0.044024,0.038851,0.037345,0.037307,0.035919,0.041742,0.040073
3,11380,4,-1,0,1,-1,-1,0,0,111.0,1.0,0,1.0,-1.0,-1.0,1.0,134.0,1114.0,143.0,16.0,12.0,12.0,4,6,8,11,335580,36215,13849,13529,671,0.034651,0.033451,0.046152,0.044024,0.038851,0.037345,0.037307,0.035919,0.032967,0.031939
4,5490,5,-1,0,1,-1,-1,0,0,116.0,1.0,0,-1.0,-1.0,-1.0,1.0,143.0,945.0,126.0,13.0,9.0,9.0,4,6,8,11,335580,36215,13849,13529,911,0.034651,0.033451,0.046152,0.044024,0.038851,0.037345,0.037307,0.035919,0.037037,0.035714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
555502,4772,5,-1,0,2,-1,-1,0,1,-1.0,1.0,0,-1.0,-1.0,-1.0,-1.0,50.0,4353.0,509.0,63.0,25.0,7.0,4,24,148,227,335580,38742,11787,2856,209,0.034665,0.033464,0.040508,0.038868,0.037076,0.035705,0.041722,0.039998,0.017964,0.017748
555503,1482,6,-1,0,2,-1,-1,0,0,198.0,1.0,0,-1.0,-1.0,-1.0,-1.0,138.0,1039.0,136.0,18.0,11.0,5.0,4,24,148,227,335580,38742,11787,2856,338,0.034665,0.033464,0.040508,0.038868,0.037076,0.035705,0.041722,0.039998,0.018450,0.018177
555504,4116,7,-1,0,2,-1,-1,0,0,94.0,1.0,1,-1.0,-1.0,-1.0,-1.0,120.0,1408.0,193.0,27.0,15.0,9.0,4,24,148,228,335580,38742,11787,8931,555,0.034665,0.033464,0.040508,0.038868,0.037076,0.035705,0.035605,0.034342,0.022624,0.022163
555505,10613,8,-1,0,2,-1,-1,0,0,310.0,1.0,0,-1.0,-1.0,-1.0,-1.0,133.0,1133.5,146.0,21.0,12.0,6.0,4,24,148,227,335580,38742,11787,2856,343,0.034665,0.033464,0.040508,0.038868,0.037076,0.035705,0.041722,0.039998,0.014599,0.014438


In [51]:
test_df[features]

Unnamed: 0,yado_no,predict_no,seq_no,seen_yad,max_seq_no,diff_seq_no,is_odd,multiple_visits_flag,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,popularity,overall_rank,wid_cd_rank,ken_cd_rank,lrg_cd_rank,sml_cd_rank,label_wid_cd,label_ken_cd,label_lrg_cd,label_sml_cd,count_wid_cd,count_ken_cd,count_lrg_cd,count_sml_cd,count_yado_no,TE_wid_cd_mean,TE_wid_cd_var,TE_ken_cd_mean,TE_ken_cd_var,TE_lrg_cd_mean,TE_lrg_cd_var,TE_sml_cd_mean,TE_sml_cd_var,TE_yado_no_mean,TE_yado_no_var
0,11561,1,-1,0,1,-1,-1,0,0,195.0,1.0,0,1.0,-1.0,-1.0,1.0,195.0,446.0,138.0,59.0,5.0,1.0,3,5,86,133,667202,370677,21972,8813,798.0,0.032047,0.03102,0.029681,0.0288,0.033179,0.032079,0.032339,0.031296,0.032581,0.031559
1,4714,2,-1,0,1,-1,-1,0,0,58.0,1.0,0,-1.0,-1.0,-1.0,1.0,83.0,2535.5,612.0,355.0,24.0,11.0,3,5,86,133,667202,370677,21972,8813,368.0,0.032047,0.03102,0.029681,0.0288,0.033179,0.032079,0.032339,0.031296,0.027174,0.026508
2,2680,3,-1,0,1,-1,-1,0,0,150.0,1.0,0,1.0,-1.0,-1.0,1.0,137.0,1056.0,282.0,140.0,11.0,4.0,3,5,86,133,667202,370677,21972,8813,758.0,0.032047,0.03102,0.029681,0.0288,0.033179,0.032079,0.032339,0.031296,0.042216,0.040488
3,4420,4,-1,0,1,-1,-1,0,0,124.0,1.0,0,1.0,-1.0,-1.0,1.0,67.0,3274.0,760.0,444.0,29.0,16.0,3,5,86,133,667202,370677,21972,8813,259.0,0.032047,0.03102,0.029681,0.0288,0.033179,0.032079,0.032339,0.031296,0.034749,0.033672
4,5466,5,-1,0,1,-1,-1,0,0,130.0,1.0,0,1.0,-1.0,-1.0,1.0,126.0,1266.5,334.0,174.0,14.0,5.0,3,5,86,133,667202,370677,21972,8813,717.0,0.032047,0.03102,0.029681,0.0288,0.033179,0.032079,0.032339,0.031296,0.033473,0.032398
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1683599,5623,5,-1,0,0,-1,-1,0,0,178.0,1.0,0,1.0,-1.0,-1.0,1.0,74.0,2931.5,690.0,402.0,34.0,11.0,3,5,79,123,667202,370677,35187,13415,613.0,0.032047,0.03102,0.029681,0.0288,0.039276,0.037734,0.038241,0.036781,0.035889,0.034658
1683600,3781,6,-1,0,0,-1,-1,0,0,245.0,1.0,0,1.0,-1.0,-1.0,1.0,183.0,514.0,149.0,65.0,2.0,1.0,3,5,79,122,667202,370677,35187,14504,939.0,0.032047,0.03102,0.029681,0.0288,0.039276,0.037734,0.040265,0.038646,0.028754,0.027957
1683601,11994,7,-1,0,0,-1,-1,0,0,334.0,1.0,0,1.0,-1.0,-1.0,1.0,135.0,1096.5,295.0,147.0,8.0,3.0,3,5,79,220,667202,370677,35187,5028,491.0,0.032047,0.03102,0.029681,0.0288,0.039276,0.037734,0.034606,0.033415,0.034623,0.033493
1683602,634,8,-1,0,0,-1,-1,0,0,174.0,1.0,0,1.0,-1.0,-1.0,1.0,62.0,3566.0,814.0,477.0,46.0,14.0,3,5,79,123,667202,370677,35187,13415,323.0,0.032047,0.03102,0.029681,0.0288,0.039276,0.037734,0.038241,0.036781,0.046440,0.044421


In [42]:
train_df.columns.to_list()

['session_id',
 'yado_no',
 'target',
 'seq_no',
 'seen_yad',
 'max_seq_no',
 'diff_seq_no',
 'is_odd',
 'multiple_visits_flag',
 'yad_type',
 'total_room_cnt',
 'wireless_lan_flg',
 'onsen_flg',
 'kd_stn_5min',
 'kd_bch_5min',
 'kd_slp_5min',
 'kd_conv_walk_5min',
 'wid_cd',
 'ken_cd',
 'lrg_cd',
 'sml_cd',
 'popularity',
 'overall_rank',
 'wid_cd_rank',
 'ken_cd_rank',
 'lrg_cd_rank',
 'sml_cd_rank',
 'label_wid_cd',
 'label_ken_cd',
 'label_lrg_cd',
 'label_sml_cd',
 'count_wid_cd',
 'count_ken_cd',
 'count_lrg_cd',
 'count_sml_cd',
 'count_yado_no',
 'fold',
 'TE_wid_cd_mean',
 'TE_wid_cd_var',
 'TE_ken_cd_mean',
 'TE_ken_cd_var',
 'TE_lrg_cd_mean',
 'TE_lrg_cd_var',
 'TE_sml_cd_mean',
 'TE_sml_cd_var',
 'TE_yado_no_mean',
 'TE_yado_no_var']

In [43]:
import gc
gc.collect()

105

In [52]:
test_df

Unnamed: 0,session_id,yado_no,target,predict_no,seq_no,seen_yad,max_seq_no,diff_seq_no,is_odd,multiple_visits_flag,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,wid_cd,ken_cd,lrg_cd,sml_cd,popularity,overall_rank,wid_cd_rank,ken_cd_rank,lrg_cd_rank,sml_cd_rank,label_wid_cd,label_ken_cd,label_lrg_cd,label_sml_cd,count_wid_cd,count_ken_cd,count_lrg_cd,count_sml_cd,count_yado_no,TE_wid_cd_mean,TE_wid_cd_var,TE_ken_cd_mean,TE_ken_cd_var,TE_lrg_cd_mean,TE_lrg_cd_var,TE_sml_cd_mean,TE_sml_cd_var,TE_yado_no_mean,TE_yado_no_var
0,00001149e9c73985425197104712478c,11561,0,1,-1,0,1,-1,-1,0,0,195.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,52ca3d2824fc3cc90bd4274423badeed,87d9490219b3778f73c41b8176cf30d0,195.0,446.0,138.0,59.0,5.0,1.0,3,5,86,133,667202,370677,21972,8813,798.0,0.032047,0.03102,0.029681,0.0288,0.033179,0.032079,0.032339,0.031296,0.032581,0.031559
1,00001149e9c73985425197104712478c,4714,0,2,-1,0,1,-1,-1,0,0,58.0,1.0,0,-1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,52ca3d2824fc3cc90bd4274423badeed,87d9490219b3778f73c41b8176cf30d0,83.0,2535.5,612.0,355.0,24.0,11.0,3,5,86,133,667202,370677,21972,8813,368.0,0.032047,0.03102,0.029681,0.0288,0.033179,0.032079,0.032339,0.031296,0.027174,0.026508
2,00001149e9c73985425197104712478c,2680,0,3,-1,0,1,-1,-1,0,0,150.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,52ca3d2824fc3cc90bd4274423badeed,87d9490219b3778f73c41b8176cf30d0,137.0,1056.0,282.0,140.0,11.0,4.0,3,5,86,133,667202,370677,21972,8813,758.0,0.032047,0.03102,0.029681,0.0288,0.033179,0.032079,0.032339,0.031296,0.042216,0.040488
3,00001149e9c73985425197104712478c,4420,0,4,-1,0,1,-1,-1,0,0,124.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,52ca3d2824fc3cc90bd4274423badeed,87d9490219b3778f73c41b8176cf30d0,67.0,3274.0,760.0,444.0,29.0,16.0,3,5,86,133,667202,370677,21972,8813,259.0,0.032047,0.03102,0.029681,0.0288,0.033179,0.032079,0.032339,0.031296,0.034749,0.033672
4,00001149e9c73985425197104712478c,5466,0,5,-1,0,1,-1,-1,0,0,130.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,52ca3d2824fc3cc90bd4274423badeed,87d9490219b3778f73c41b8176cf30d0,126.0,1266.5,334.0,174.0,14.0,5.0,3,5,86,133,667202,370677,21972,8813,717.0,0.032047,0.03102,0.029681,0.0288,0.033179,0.032079,0.032339,0.031296,0.033473,0.032398
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1683599,ffffe984aafd6127ce8e43e3ca40c79d,5623,0,5,-1,0,0,-1,-1,0,0,178.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,c9d5e891463e5389c42d16f987ed30bd,7cf2b4f31fb20747f89e58981e6d9fc1,74.0,2931.5,690.0,402.0,34.0,11.0,3,5,79,123,667202,370677,35187,13415,613.0,0.032047,0.03102,0.029681,0.0288,0.039276,0.037734,0.038241,0.036781,0.035889,0.034658
1683600,ffffe984aafd6127ce8e43e3ca40c79d,3781,0,6,-1,0,0,-1,-1,0,0,245.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,c9d5e891463e5389c42d16f987ed30bd,9ccc341413e935a914a1ded367b8f80e,183.0,514.0,149.0,65.0,2.0,1.0,3,5,79,122,667202,370677,35187,14504,939.0,0.032047,0.03102,0.029681,0.0288,0.039276,0.037734,0.040265,0.038646,0.028754,0.027957
1683601,ffffe984aafd6127ce8e43e3ca40c79d,11994,0,7,-1,0,0,-1,-1,0,0,334.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,c9d5e891463e5389c42d16f987ed30bd,372d41b6f39f1f523d0841fd9b84ae44,135.0,1096.5,295.0,147.0,8.0,3.0,3,5,79,220,667202,370677,35187,5028,491.0,0.032047,0.03102,0.029681,0.0288,0.039276,0.037734,0.034606,0.033415,0.034623,0.033493
1683602,ffffe984aafd6127ce8e43e3ca40c79d,634,0,8,-1,0,0,-1,-1,0,0,174.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,c9d5e891463e5389c42d16f987ed30bd,7cf2b4f31fb20747f89e58981e6d9fc1,62.0,3566.0,814.0,477.0,46.0,14.0,3,5,79,123,667202,370677,35187,13415,323.0,0.032047,0.03102,0.029681,0.0288,0.039276,0.037734,0.038241,0.036781,0.046440,0.044421


In [53]:
train_df

Unnamed: 0,session_id,yado_no,target,predict_no,seq_no,seen_yad,max_seq_no,diff_seq_no,is_odd,multiple_visits_flag,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,wid_cd,ken_cd,lrg_cd,sml_cd,popularity,overall_rank,wid_cd_rank,ken_cd_rank,lrg_cd_rank,sml_cd_rank,label_wid_cd,label_ken_cd,label_lrg_cd,label_sml_cd,count_wid_cd,count_ken_cd,count_lrg_cd,count_sml_cd,count_yado_no,fold,TE_wid_cd_mean,TE_wid_cd_var,TE_ken_cd_mean,TE_ken_cd_var,TE_lrg_cd_mean,TE_lrg_cd_var,TE_sml_cd_mean,TE_sml_cd_var,TE_yado_no_mean,TE_yado_no_var
0,000104bdffaaad1a1e0a9ebacf585f33,3894,0,1,-1,0,1,-1,-1,0,0,205.0,1.0,0,-1.0,-1.0,-1.0,1.0,e9316013ee1b03f4525fe361c46ce9c5,84efa50e52f9b471c95bfc3b21b854ad,a1370d90ed3b80ee41311bbbab46aec9,d72674f02c5340d90f245e3177727650,212.0,364.0,40.0,3.0,1.0,1.0,4,6,8,11,335580,36215,13849,13529,881,0,0.034651,0.033451,0.046152,0.044024,0.038851,0.037345,0.037307,0.035919,0.037868,0.036485
1,000104bdffaaad1a1e0a9ebacf585f33,7749,0,2,-1,0,1,-1,-1,0,0,76.0,1.0,0,-1.0,-1.0,-1.0,1.0,e9316013ee1b03f4525fe361c46ce9c5,84efa50e52f9b471c95bfc3b21b854ad,a1370d90ed3b80ee41311bbbab46aec9,d72674f02c5340d90f245e3177727650,150.0,845.5,113.0,10.0,6.0,6.0,4,6,8,11,335580,36215,13849,13529,785,0,0.034651,0.033451,0.046152,0.044024,0.038851,0.037345,0.037307,0.035919,0.041860,0.040170
2,000104bdffaaad1a1e0a9ebacf585f33,902,0,3,-1,0,1,-1,-1,0,0,240.0,1.0,0,-1.0,-1.0,-1.0,1.0,e9316013ee1b03f4525fe361c46ce9c5,84efa50e52f9b471c95bfc3b21b854ad,a1370d90ed3b80ee41311bbbab46aec9,d72674f02c5340d90f245e3177727650,154.0,785.5,103.0,9.0,5.0,5.0,4,6,8,11,335580,36215,13849,13529,675,0,0.034651,0.033451,0.046152,0.044024,0.038851,0.037345,0.037307,0.035919,0.041742,0.040073
3,000104bdffaaad1a1e0a9ebacf585f33,11380,0,4,-1,0,1,-1,-1,0,0,111.0,1.0,0,1.0,-1.0,-1.0,1.0,e9316013ee1b03f4525fe361c46ce9c5,84efa50e52f9b471c95bfc3b21b854ad,a1370d90ed3b80ee41311bbbab46aec9,d72674f02c5340d90f245e3177727650,134.0,1114.0,143.0,16.0,12.0,12.0,4,6,8,11,335580,36215,13849,13529,671,0,0.034651,0.033451,0.046152,0.044024,0.038851,0.037345,0.037307,0.035919,0.032967,0.031939
4,000104bdffaaad1a1e0a9ebacf585f33,5490,0,5,-1,0,1,-1,-1,0,0,116.0,1.0,0,-1.0,-1.0,-1.0,1.0,e9316013ee1b03f4525fe361c46ce9c5,84efa50e52f9b471c95bfc3b21b854ad,a1370d90ed3b80ee41311bbbab46aec9,d72674f02c5340d90f245e3177727650,143.0,945.0,126.0,13.0,9.0,9.0,4,6,8,11,335580,36215,13849,13529,911,0,0.034651,0.033451,0.046152,0.044024,0.038851,0.037345,0.037307,0.035919,0.037037,0.035714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
555502,fffe8a472ae6a96c9da05a30ac3ed6c5,4772,0,5,-1,0,2,-1,-1,0,1,-1.0,1.0,0,-1.0,-1.0,-1.0,-1.0,e9316013ee1b03f4525fe361c46ce9c5,517061b8165aa6370d9025951a64aa52,7e5ebb4d5e3cdfd3ad3798c9864d87d6,e5680545edc53d20bb05168e67c9f9f0,50.0,4353.0,509.0,63.0,25.0,7.0,4,24,148,227,335580,38742,11787,2856,209,4,0.034665,0.033464,0.040508,0.038868,0.037076,0.035705,0.041722,0.039998,0.017964,0.017748
555503,fffe8a472ae6a96c9da05a30ac3ed6c5,1482,0,6,-1,0,2,-1,-1,0,0,198.0,1.0,0,-1.0,-1.0,-1.0,-1.0,e9316013ee1b03f4525fe361c46ce9c5,517061b8165aa6370d9025951a64aa52,7e5ebb4d5e3cdfd3ad3798c9864d87d6,e5680545edc53d20bb05168e67c9f9f0,138.0,1039.0,136.0,18.0,11.0,5.0,4,24,148,227,335580,38742,11787,2856,338,4,0.034665,0.033464,0.040508,0.038868,0.037076,0.035705,0.041722,0.039998,0.018450,0.018177
555504,fffe8a472ae6a96c9da05a30ac3ed6c5,4116,0,7,-1,0,2,-1,-1,0,0,94.0,1.0,1,-1.0,-1.0,-1.0,-1.0,e9316013ee1b03f4525fe361c46ce9c5,517061b8165aa6370d9025951a64aa52,7e5ebb4d5e3cdfd3ad3798c9864d87d6,840f2157ec5bb4f5501a3ace2f4ef8d1,120.0,1408.0,193.0,27.0,15.0,9.0,4,24,148,228,335580,38742,11787,8931,555,4,0.034665,0.033464,0.040508,0.038868,0.037076,0.035705,0.035605,0.034342,0.022624,0.022163
555505,fffe8a472ae6a96c9da05a30ac3ed6c5,10613,0,8,-1,0,2,-1,-1,0,0,310.0,1.0,0,-1.0,-1.0,-1.0,-1.0,e9316013ee1b03f4525fe361c46ce9c5,517061b8165aa6370d9025951a64aa52,7e5ebb4d5e3cdfd3ad3798c9864d87d6,e5680545edc53d20bb05168e67c9f9f0,133.0,1133.5,146.0,21.0,12.0,6.0,4,24,148,227,335580,38742,11787,2856,343,4,0.034665,0.033464,0.040508,0.038868,0.037076,0.035705,0.041722,0.039998,0.014599,0.014438


In [54]:
print(train_df['seen_yad'].value_counts())
print(test_df['seen_yad'].value_counts())

0    2777527
Name: seen_yad, dtype: int64
0    1683604
Name: seen_yad, dtype: int64


In [55]:
train_df.to_parquet('../data/feature_engineering_v8_train_df.parquet', index=False)
test_df.to_parquet('../data/feature_engineering_v8_test_df.parquet', index=False)