In [54]:
import numpy as np
import pandas as pd
from tqdm import tqdm


from sklearn.model_selection import KFold
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import StratifiedKFold
from nyaggle.feature.category_encoder import TargetEncoder

from nyaggle.experiment import run_experiment

#最大表示列数の指定（ここでは50列を指定）
pd.set_option('display.max_columns', 50)

In [55]:
train_log_df = pd.read_csv('../train_log.csv')
test_log_df = pd.read_csv('../test_log.csv')
train_label_df = pd.read_csv('../train_label.csv')
test_session_df = pd.read_csv('../test_session.csv')
yado_df = pd.read_csv('../yado.csv')
# image_embeddings_df = pd.read_parquet('../image_embeddings.parquet')
sample_submission_df = pd.read_csv('../sample_submission.csv')

candidate_train_df = pd.read_csv('../data/candidate_ver8_train.csv')
candidate_test_df = pd.read_csv('../data/candidate_ver8_test.csv')

#### Trainに正例と負例のフラグを付与

In [56]:
# 学習用データ
# 教師データに正例と負例のフラグを付与
train_target = []

# 各セッションに対して候補の宿をチェック
for index, row in tqdm(train_label_df.iterrows(), total=train_label_df.shape[0]):
    session_id = row['session_id']
    yad_no = row['yad_no']
    
    # candidate_ver7_train.csv の該当行を取得（同じインデックスを仮定）
    candidate_row = candidate_train_df.iloc[index]

    # 50個の購入候補の宿に対してチェック
    for col in candidate_train_df.columns:
        yado_no = candidate_row[col]
        is_correct = (yado_no == yad_no)  # 一致するかどうかのチェック
        train_target.append([session_id, yado_no, is_correct])

# 結果をDataFrameに変換
train_df = pd.DataFrame(train_target, columns=['session_id', 'yado_no', 'target']) #区別するためにあえてyad_noではなくyado_noとしている
train_df['target'] = train_df['target'].astype(int)  # target列をint型に変換

# 最初の数行を表示して確認
print(train_df.head())

print('正例と負例の数を確認')
print(train_df['target'].value_counts())

100%|██████████| 288698/288698 [00:41<00:00, 6879.17it/s]


                         session_id  yado_no  target
0  000007603d533d30453cc45d0f3d119f    11882       0
1  000007603d533d30453cc45d0f3d119f     2808       0
2  000007603d533d30453cc45d0f3d119f     5289       0
3  000007603d533d30453cc45d0f3d119f     4101       1
4  000007603d533d30453cc45d0f3d119f     3324       0
正例と負例の数を確認
0    2697058
1     189922
Name: target, dtype: int64


#### 推論用データの作成

In [57]:
# 推論用データにダミーのyado_noを付与
test_session_df['yad_no'] = -1

test_target = []

# 各セッションに対して候補の宿をチェック
for index, row in tqdm(test_session_df.iterrows(), total=test_session_df.shape[0]):
    session_id = row['session_id']
    yad_no = row['yad_no']
    
    # candidate_ver7_test.csv の該当行を取得（同じインデックスを仮定）
    candidate_row = candidate_test_df.iloc[index]

    # 50個の購入候補の宿に対してチェック
    for col in candidate_test_df.columns:
        yado_no = candidate_row[col]
        is_correct = (yado_no == yad_no)  # 一致するかどうかのチェック
        test_target.append([session_id, yado_no, is_correct])

# 結果をDataFrameに変換
test_df = pd.DataFrame(test_target, columns=['session_id', 'yado_no', 'target']) #区別するためにあえてyad_noではなくyado_noとしている
test_df['target'] = test_df['target'].astype(int)  # target列をint型に変換

# 最初の数行を表示して確認
print(test_df.head())

print('正例と負例の数を確認')
print(test_df['target'].value_counts()) # すべて0になるはず

100%|██████████| 174700/174700 [00:25<00:00, 6784.15it/s]


                         session_id  yado_no  target
0  00001149e9c73985425197104712478c     3560       0
1  00001149e9c73985425197104712478c     4545       0
2  00001149e9c73985425197104712478c     9534       0
3  00001149e9c73985425197104712478c     5785       0
4  00001149e9c73985425197104712478c     6563       0
正例と負例の数を確認
0    1747000
Name: target, dtype: int64


#### seq_noを追加

In [58]:
# seq_noをマージする

# train_log の session_id と yad_no の組み合わせに基づいて辞書を作成
seq_no_dict_train = dict(zip(zip(train_log_df['session_id'], train_log_df['yad_no']), train_log_df['seq_no']))
# train_df に seq_no 列を追加（tqdm で進捗表示）
train_df['seq_no'] = [seq_no_dict_train.get((row['session_id'], row['yado_no']), -1) for row in tqdm(train_df.to_dict('records'))]
# 結果の確認
print(train_df.head())


# test_log の session_id と yad_no の組み合わせに基づいて辞書を作成
seq_no_dict_test = dict(zip(zip(test_log_df['session_id'], test_log_df['yad_no']), test_log_df['seq_no']))
# test_df に seq_no 列を追加（tqdm で進捗表示）
test_df['seq_no'] = [seq_no_dict_test.get((row['session_id'], row['yado_no']), -1) for row in tqdm(test_df.to_dict('records'))]
# 結果の確認
print(test_df.head())

100%|██████████| 2886980/2886980 [00:01<00:00, 2349480.37it/s]


                         session_id  yado_no  target  seq_no
0  000007603d533d30453cc45d0f3d119f    11882       0      -1
1  000007603d533d30453cc45d0f3d119f     2808       0      -1
2  000007603d533d30453cc45d0f3d119f     5289       0      -1
3  000007603d533d30453cc45d0f3d119f     4101       1      -1
4  000007603d533d30453cc45d0f3d119f     3324       0      -1


100%|██████████| 1747000/1747000 [00:00<00:00, 2507869.88it/s]


                         session_id  yado_no  target  seq_no
0  00001149e9c73985425197104712478c     3560       0       0
1  00001149e9c73985425197104712478c     4545       0      -1
2  00001149e9c73985425197104712478c     9534       0      -1
3  00001149e9c73985425197104712478c     5785       0      -1
4  00001149e9c73985425197104712478c     6563       0      -1


In [59]:
#　※複数のseq_noがある場合は、0も混入する
print(train_df['seq_no'].value_counts())
print(test_df['seq_no'].value_counts())

-1    2777527
 0      88128
 1      16052
 2       4125
 3        837
 4        223
 5         65
 6         18
 7          4
 8          1
Name: seq_no, dtype: int64
-1    1683604
 0      51884
 1       8692
 2       2239
 3        428
 4        124
 5         22
 6          7
Name: seq_no, dtype: int64


#### 候補の宿がsession中に閲覧されていたかどうか

In [60]:
# 'seq_no'が−1でなければ1のフラグを立てる
train_df['seen_yad'] = train_df['seq_no'].apply(lambda x: 1 if x != -1 else 0)
test_df['seen_yad'] = test_df['seq_no'].apply(lambda x: 1 if x != -1 else 0)

In [61]:
print(train_df['seen_yad'].value_counts())  
print(test_df['seen_yad'].value_counts())

0    2777527
1     109453
Name: seen_yad, dtype: int64
0    1683604
1      63396
Name: seen_yad, dtype: int64


#### 各sessionにおける最大seq_no

In [62]:
# 各セッションの seq_no の最大値を計算
max_seq_no_train = train_log_df.groupby('session_id')['seq_no'].max().reset_index()
max_seq_no_train.rename(columns={'seq_no': 'max_seq_no'}, inplace=True)
# teacher_df と max_seq_no のマージ
train_df = train_df.merge(max_seq_no_train, on='session_id', how='left')
# 結果の確認
print(train_df.head())


# 各セッションの seq_no の最大値を計算
max_seq_no_test = test_log_df.groupby('session_id')['seq_no'].max().reset_index()
max_seq_no_test.rename(columns={'seq_no': 'max_seq_no'}, inplace=True)
# teacher_df と max_seq_no のマージ
test_df = test_df.merge(max_seq_no_test, on='session_id', how='left')
# 結果の確認
print(test_df.head())

                         session_id  yado_no  target  seq_no  seen_yad  \
0  000007603d533d30453cc45d0f3d119f    11882       0      -1         0   
1  000007603d533d30453cc45d0f3d119f     2808       0      -1         0   
2  000007603d533d30453cc45d0f3d119f     5289       0      -1         0   
3  000007603d533d30453cc45d0f3d119f     4101       1      -1         0   
4  000007603d533d30453cc45d0f3d119f     3324       0      -1         0   

   max_seq_no  
0           0  
1           0  
2           0  
3           0  
4           0  
                         session_id  yado_no  target  seq_no  seen_yad  \
0  00001149e9c73985425197104712478c     3560       0       0         1   
1  00001149e9c73985425197104712478c     4545       0      -1         0   
2  00001149e9c73985425197104712478c     9534       0      -1         0   
3  00001149e9c73985425197104712478c     5785       0      -1         0   
4  00001149e9c73985425197104712478c     6563       0      -1         0   

   max_seq_no 

In [63]:
print(train_df['max_seq_no'].value_counts())
print(test_df['max_seq_no'].value_counts())

0    1853860
1     827930
2     153500
3      40250
4       8330
5       2230
6        650
7        180
8         40
9         10
Name: max_seq_no, dtype: int64
0    1139400
1     494930
2      84590
3      22270
4       4280
5       1240
6        220
7         70
Name: max_seq_no, dtype: int64


#### 差分の考慮：max_seq_noから(-1以外の要素)でseq_noを引く

In [64]:
# 最初に全ての diff_seq_no を -1 に設定
train_df['diff_seq_no'] = -1
test_df['diff_seq_no'] = -1

# seq_no が -1 以外の行にのみ max_seq_no - seq_no の計算を適用
train_df.loc[train_df['seq_no'] != -1, 'diff_seq_no'] = train_df['max_seq_no'] - train_df['seq_no']
test_df.loc[test_df['seq_no'] != -1, 'diff_seq_no'] = test_df['max_seq_no'] - test_df['seq_no']

# 結果の確認
print(train_df.head())
print(test_df.head())

                         session_id  yado_no  target  seq_no  seen_yad  \
0  000007603d533d30453cc45d0f3d119f    11882       0      -1         0   
1  000007603d533d30453cc45d0f3d119f     2808       0      -1         0   
2  000007603d533d30453cc45d0f3d119f     5289       0      -1         0   
3  000007603d533d30453cc45d0f3d119f     4101       1      -1         0   
4  000007603d533d30453cc45d0f3d119f     3324       0      -1         0   

   max_seq_no  diff_seq_no  
0           0           -1  
1           0           -1  
2           0           -1  
3           0           -1  
4           0           -1  
                         session_id  yado_no  target  seq_no  seen_yad  \
0  00001149e9c73985425197104712478c     3560       0       0         1   
1  00001149e9c73985425197104712478c     4545       0      -1         0   
2  00001149e9c73985425197104712478c     9534       0      -1         0   
3  00001149e9c73985425197104712478c     5785       0      -1         0   
4  00001149

In [65]:
print(train_df['diff_seq_no'].value_counts())
print(test_df['diff_seq_no'].value_counts())

-1    2777527
 1     103312
 2       5684
 3        407
 4         49
 5          1
Name: diff_seq_no, dtype: int64
-1    1683604
 1      60760
 2       2528
 3        106
 4          2
Name: diff_seq_no, dtype: int64


#### diff_seq_no が奇数かどうかの判定 

In [66]:
# diff_seq_no が奇数かどうかの判定（seq_no が -1 の場合は除外）
train_df['is_odd'] = np.where(train_df['seq_no'] != -1, train_df['diff_seq_no'] % 2 == 1, -1)
test_df['is_odd'] = np.where(test_df['seq_no'] != -1, test_df['diff_seq_no'] % 2 == 1, -1)

# seq_no が -1 以外の場合、Trueを0、Falseを1に変換（すでに -1 の場合は変更しない）
train_df['is_odd'] = np.where(train_df['is_odd'] != -1, np.where(train_df['is_odd'], 0, 1), -1)
test_df['is_odd'] = np.where(test_df['is_odd'] != -1, np.where(test_df['is_odd'], 0, 1), -1)

# 結果の確認
print(train_df.head())
print(test_df.head())

                         session_id  yado_no  target  seq_no  seen_yad  \
0  000007603d533d30453cc45d0f3d119f    11882       0      -1         0   
1  000007603d533d30453cc45d0f3d119f     2808       0      -1         0   
2  000007603d533d30453cc45d0f3d119f     5289       0      -1         0   
3  000007603d533d30453cc45d0f3d119f     4101       1      -1         0   
4  000007603d533d30453cc45d0f3d119f     3324       0      -1         0   

   max_seq_no  diff_seq_no  is_odd  
0           0           -1      -1  
1           0           -1      -1  
2           0           -1      -1  
3           0           -1      -1  
4           0           -1      -1  
                         session_id  yado_no  target  seq_no  seen_yad  \
0  00001149e9c73985425197104712478c     3560       0       0         1   
1  00001149e9c73985425197104712478c     4545       0      -1         0   
2  00001149e9c73985425197104712478c     9534       0      -1         0   
3  00001149e9c73985425197104712478c  

In [67]:
print(train_df['is_odd'].value_counts())
print(test_df['is_odd'].value_counts())

-1    2777527
 0     103720
 1       5733
Name: is_odd, dtype: int64
-1    1683604
 0      60866
 1       2530
Name: is_odd, dtype: int64


#### 各sessionにおいて2回以上出現したyad_noがあれば1のフラグを立てる

In [68]:
# 各セッションにおいて2回以上登場する yad_no を特定
yad_no_counts = train_log_df.groupby(['session_id', 'yad_no']).size().reset_index(name='count')
multiple_visits = yad_no_counts[yad_no_counts['count'] > 1]

# (session_id, yad_no) のタプルをキーとして辞書を作成
multiple_visits_dict = {(row['session_id'], row['yad_no']): 1 for _, row in multiple_visits.iterrows()}

# teacher_df に multiple_visits_flag 列を追加（tqdm で進捗表示）
train_df['multiple_visits_flag'] = [multiple_visits_dict.get((row['session_id'], row['yado_no']), 0) 
                                      for row in tqdm(train_df.to_dict('records'))]

# 結果の確認
print(train_df.head())



# 各セッションにおいて2回以上登場する yad_no を特定
yad_no_counts = test_log_df.groupby(['session_id', 'yad_no']).size().reset_index(name='count')
multiple_visits = yad_no_counts[yad_no_counts['count'] > 1]

# (session_id, yad_no) のタプルをキーとして辞書を作成
multiple_visits_dict = {(row['session_id'], row['yad_no']): 1 for _, row in multiple_visits.iterrows()}

# teacher_df に multiple_visits_flag 列を追加（tqdm で進捗表示）
test_df['multiple_visits_flag'] = [multiple_visits_dict.get((row['session_id'], row['yado_no']), 0) 
                                      for row in tqdm(test_df.to_dict('records'))]

# 結果の確認
print(test_df.head())

100%|██████████| 2886980/2886980 [00:00<00:00, 3032963.13it/s]


                         session_id  yado_no  target  seq_no  seen_yad  \
0  000007603d533d30453cc45d0f3d119f    11882       0      -1         0   
1  000007603d533d30453cc45d0f3d119f     2808       0      -1         0   
2  000007603d533d30453cc45d0f3d119f     5289       0      -1         0   
3  000007603d533d30453cc45d0f3d119f     4101       1      -1         0   
4  000007603d533d30453cc45d0f3d119f     3324       0      -1         0   

   max_seq_no  diff_seq_no  is_odd  multiple_visits_flag  
0           0           -1      -1                     0  
1           0           -1      -1                     0  
2           0           -1      -1                     0  
3           0           -1      -1                     0  
4           0           -1      -1                     0  


100%|██████████| 1747000/1747000 [00:00<00:00, 3210198.88it/s]


                         session_id  yado_no  target  seq_no  seen_yad  \
0  00001149e9c73985425197104712478c     3560       0       0         1   
1  00001149e9c73985425197104712478c     4545       0      -1         0   
2  00001149e9c73985425197104712478c     9534       0      -1         0   
3  00001149e9c73985425197104712478c     5785       0      -1         0   
4  00001149e9c73985425197104712478c     6563       0      -1         0   

   max_seq_no  diff_seq_no  is_odd  multiple_visits_flag  
0           1            1       0                     0  
1           1           -1      -1                     0  
2           1           -1      -1                     0  
3           1           -1      -1                     0  
4           1           -1      -1                     0  


In [69]:
print(train_df['multiple_visits_flag'].value_counts())
print(test_df['multiple_visits_flag'].value_counts())

0    2882332
1       4648
Name: multiple_visits_flag, dtype: int64
0    1744359
1       2641
Name: multiple_visits_flag, dtype: int64


### yado_dfに関する特徴量エンジニアリング

In [70]:
yado_df

Unnamed: 0,yad_no,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,wid_cd,ken_cd,lrg_cd,sml_cd
0,1,0,129.0,1.0,0,1.0,,,1.0,f0112abf369fb03cdc5f5309300913da,072c85e1653e10c9c7dd065ad007125a,449c52ef581d5f9ef311189469a0520e,677a32689cd1ad74e867f1fbe43a3e1c
1,2,0,23.0,1.0,0,,,,,d86102dd9c232bade9a97dccad40df48,b4d2fb4e51ea7bca80eb1270aa474a54,5c9a8f48e9df0234da012747a02d4b29,4ee16ee838dd2703cc9a1d5a535f0ced
2,3,0,167.0,1.0,1,1.0,,,1.0,46e33861f921c3e38b81998fbf283f01,572d60f0f5212aacda515ebf81fb0a3a,8a623b960557e87bd1f4edf71b6255be,ab9480fd72a44d51690ab16c4ad4d49c
3,4,0,144.0,1.0,0,1.0,,,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,52c9ea83f2cfe92be54cb6bc961edf21,1cc3e1838bb0fd0fde0396130b1f82b9
4,5,0,41.0,1.0,1,,,,,43875109d1dab93592812c50d18270a7,75617bb07a2785a948ab1958909211f1,9ea5a911019b66ccd42f556c42a2fe2f,be1b876af18afc4deeb3081591d2a910
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13801,13802,0,10.0,1.0,1,,,,,c312e07b7a5d456d53a5b00910a336e1,558ac1909f0318b82c621ab250329d6d,80fb3c5ad0c89931d0923e9f80885218,5eb30820716082c720836733d73c605e
13802,13803,0,,,0,1.0,,,1.0,dc414a17890cfc17d011d5038b88ca93,d78f53d0856617bc782f02c3280dfef2,e5cfcc0a43c82072aca11628ff0add53,20ad8785a30f125bee5a8a325782ab06
13803,13804,0,80.0,1.0,1,,1.0,,1.0,d86102dd9c232bade9a97dccad40df48,7d76599bd27ff9e7823b2b1323ca763e,c5fe8848b6ab39b040cdb3668aea9433,b3eab50ccf6ffb51c37d36ee384abfbf
13804,13805,0,8.0,1.0,1,,,,1.0,3300cf6f774b7c6a5807110f244cbc21,689cf8289e7ea0b2eef1b017dcdfe8de,8b712435430a6875839a6c3b5a40b008,2b4165444a777465576b25f65697d739


In [71]:
# train_log_df と test_log_df を結合
log_df = pd.concat([train_log_df, test_log_df], axis=0, ignore_index=True)
# yado_df と結合
merged_df = log_df.merge(yado_df, on='yad_no', how='left')


# 各宿の人気度を計算（例：訪問回数で計算）
yad_popularity = merged_df['yad_no'].value_counts().reset_index()
yad_popularity.columns = ['yad_no', 'popularity']

# 全体での人気度ランキング
yad_popularity['overall_rank'] = yad_popularity['popularity'].rank(ascending=False)
# yado_df に人気度をマージ
yado_df = yado_df.merge(yad_popularity, on='yad_no', how='left')
# エリアごとの人気度ランキングを計算
for area in ['wid_cd', 'ken_cd', 'lrg_cd', 'sml_cd']:
    yado_df[f'{area}_rank'] = yado_df.groupby(area)['popularity'].rank(ascending=False, method='min')

# 結果の確認
print(yado_df.head())

   yad_no  yad_type  total_room_cnt  wireless_lan_flg  onsen_flg  kd_stn_5min  \
0       1         0           129.0               1.0          0          1.0   
1       2         0            23.0               1.0          0          NaN   
2       3         0           167.0               1.0          1          1.0   
3       4         0           144.0               1.0          0          1.0   
4       5         0            41.0               1.0          1          NaN   

   kd_bch_5min  kd_slp_5min  kd_conv_walk_5min  \
0          NaN          NaN                1.0   
1          NaN          NaN                NaN   
2          NaN          NaN                1.0   
3          NaN          NaN                1.0   
4          NaN          NaN                NaN   

                             wid_cd                            ken_cd  \
0  f0112abf369fb03cdc5f5309300913da  072c85e1653e10c9c7dd065ad007125a   
1  d86102dd9c232bade9a97dccad40df48  b4d2fb4e51ea7bca80eb1270aa474

In [72]:
#　train_dfとtest_dfにyado_dfをマージ
train_df = pd.merge(train_df, yado_df.rename(columns={'yad_no': 'yado_no'}), on='yado_no', how='left')
test_df = pd.merge(test_df, yado_df.rename(columns={'yad_no': 'yado_no'}), on='yado_no', how='left')

In [73]:
train_df

Unnamed: 0,session_id,yado_no,target,seq_no,seen_yad,max_seq_no,diff_seq_no,is_odd,multiple_visits_flag,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,wid_cd,ken_cd,lrg_cd,sml_cd,popularity,overall_rank,wid_cd_rank,ken_cd_rank,lrg_cd_rank,sml_cd_rank
0,000007603d533d30453cc45d0f3d119f,11882,0,-1,0,0,-1,-1,0,0,113.0,1.0,0,1.0,,,1.0,dc414a17890cfc17d011d5038b88ca93,d78f53d0856617bc782f02c3280dfef2,4fd631b15116098340cdb099c86a5a40,4044dac1931ddaa5a967e09506d76343,23.0,7189.5,891.0,286.0,25.0,21.0
1,000007603d533d30453cc45d0f3d119f,2808,0,-1,0,0,-1,-1,0,0,128.0,1.0,0,1.0,,,,dc414a17890cfc17d011d5038b88ca93,d78f53d0856617bc782f02c3280dfef2,4fd631b15116098340cdb099c86a5a40,4044dac1931ddaa5a967e09506d76343,36.0,5594.5,687.0,229.0,17.0,17.0
2,000007603d533d30453cc45d0f3d119f,5289,0,-1,0,0,-1,-1,0,0,66.0,1.0,0,1.0,,,1.0,dc414a17890cfc17d011d5038b88ca93,d78f53d0856617bc782f02c3280dfef2,4fd631b15116098340cdb099c86a5a40,4044dac1931ddaa5a967e09506d76343,14.0,8690.5,1073.0,334.0,37.0,33.0
3,000007603d533d30453cc45d0f3d119f,4101,1,-1,0,0,-1,-1,0,0,39.0,,0,,,,1.0,dc414a17890cfc17d011d5038b88ca93,d78f53d0856617bc782f02c3280dfef2,4fd631b15116098340cdb099c86a5a40,4044dac1931ddaa5a967e09506d76343,24.0,7045.0,874.0,280.0,23.0,19.0
4,000007603d533d30453cc45d0f3d119f,3324,0,-1,0,0,-1,-1,0,0,53.0,1.0,0,,,,1.0,dc414a17890cfc17d011d5038b88ca93,d78f53d0856617bc782f02c3280dfef2,4fd631b15116098340cdb099c86a5a40,4044dac1931ddaa5a967e09506d76343,15.0,8495.0,1053.0,330.0,35.0,31.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2886975,fffffa7baf370083ebcdd98f26a7e31a,846,0,-1,0,1,-1,-1,0,0,76.0,1.0,0,1.0,,,1.0,46e33861f921c3e38b81998fbf283f01,572d60f0f5212aacda515ebf81fb0a3a,8a623b960557e87bd1f4edf71b6255be,ab9480fd72a44d51690ab16c4ad4d49c,145.0,915.0,247.0,50.0,38.0,2.0
2886976,fffffa7baf370083ebcdd98f26a7e31a,9624,0,-1,0,1,-1,-1,0,0,42.0,1.0,0,,,,1.0,46e33861f921c3e38b81998fbf283f01,572d60f0f5212aacda515ebf81fb0a3a,8a623b960557e87bd1f4edf71b6255be,ab9480fd72a44d51690ab16c4ad4d49c,78.0,2747.5,657.0,96.0,49.0,6.0
2886977,fffffa7baf370083ebcdd98f26a7e31a,1372,0,-1,0,1,-1,-1,0,0,62.0,1.0,0,,,,1.0,46e33861f921c3e38b81998fbf283f01,572d60f0f5212aacda515ebf81fb0a3a,8a623b960557e87bd1f4edf71b6255be,ab9480fd72a44d51690ab16c4ad4d49c,119.0,1434.5,372.0,64.0,43.0,4.0
2886978,fffffa7baf370083ebcdd98f26a7e31a,5800,0,-1,0,1,-1,-1,0,0,137.0,1.0,0,1.0,,,1.0,46e33861f921c3e38b81998fbf283f01,572d60f0f5212aacda515ebf81fb0a3a,8a623b960557e87bd1f4edf71b6255be,1d9f09b9e2bd43cebc9885a46388739a,154.0,785.5,214.0,45.0,35.0,27.0


#### 各種Encoding

In [74]:
# カテゴリ変数と数値変数の明確化
cat_cols = ['wid_cd', 'ken_cd', 'lrg_cd', 'sml_cd']

# 欠損値を-1で埋める
train_df.fillna(-1, inplace=True)
test_df.fillna(-1, inplace=True)

In [75]:
# Label Encoding

# testにしか存在しないものはNullにするので、trainのみをencoderにする
for col in cat_cols:
    encoder = {c: i for i, c in enumerate(train_df[col].unique())}
    train_df[f'label_{col}'] = train_df[col].map(encoder)
    test_df[f'label_{col}'] = test_df[col].map(encoder)

In [76]:
# Count Encoding（trainとtestをマージした方が良い？）←　した
# 増やした方が良い？

cat_cols = ['wid_cd', 'ken_cd', 'lrg_cd', 'sml_cd'] + ['yado_no']

# testにしか存在しないものはNullにするので、trainのみをencoderにする
for col in cat_cols:
    encoder = train_df[col].value_counts()
    train_df[f'count_{col}'] = train_df[col].map(encoder)
    test_df[f'count_{col}'] = test_df[col].map(encoder)

In [77]:
# Target encoding
# 元の列順が保持されないから

cat_cols = ['wid_cd', 'ken_cd', 'lrg_cd', 'sml_cd'] + ['yado_no']

# GroupKFold の設定  StratifiedGrouoKFoldでもいいかもしれない
gkf = GroupKFold(n_splits=5)

# fold 列を初期化
train_df['fold'] = -1

# 各 Fold に対してインデックスを割り当て
for fold, (train_idx, val_idx) in enumerate(gkf.split(train_df, groups=train_df['session_id'])):
    # バリデーションセットのインデックスに Fold 番号を割り当て
    train_df.loc[val_idx, 'fold'] = fold
# 結果の確認
print(train_df.head())



# TargetEncodingが完了した検証用データセットを格納するリスト
encoded_dfs = []

# すべての分割についてのループ
for fold in range(5):

  # 学習用と検証用データセットに分割する
  df_train = train_df[train_df.fold != fold].reset_index(drop=True)
  df_valid = train_df[train_df.fold == fold].reset_index(drop=True)
  
  # すべてのカテゴリについてのループ
  for column in cat_cols:
    # カテゴリごとの目的変数の統計量についての辞書を作成
    stats = df_train.groupby(column)['target'].agg(['mean', 'var'])
    
    # 平均、分散、最大値、最小値に対する列を作成
    for stat in ['mean', 'var']:
      mapping_dict = dict(stats[stat])
      df_valid.loc[:, f'TE_{column}_{stat}'] = df_valid[column].map(mapping_dict)

  # リストに格納
  encoded_dfs.append(df_valid)

# 結合したデータセットを返す
encoded_df = pd.concat(encoded_dfs, axis=0)


                         session_id  yado_no  target  seq_no  seen_yad  \
0  000007603d533d30453cc45d0f3d119f    11882       0      -1         0   
1  000007603d533d30453cc45d0f3d119f     2808       0      -1         0   
2  000007603d533d30453cc45d0f3d119f     5289       0      -1         0   
3  000007603d533d30453cc45d0f3d119f     4101       1      -1         0   
4  000007603d533d30453cc45d0f3d119f     3324       0      -1         0   

   max_seq_no  diff_seq_no  is_odd  multiple_visits_flag  yad_type  \
0           0           -1      -1                     0         0   
1           0           -1      -1                     0         0   
2           0           -1      -1                     0         0   
3           0           -1      -1                     0         0   
4           0           -1      -1                     0         0   

   total_room_cnt  wireless_lan_flg  onsen_flg  kd_stn_5min  kd_bch_5min  \
0           113.0               1.0          0          1.

In [79]:
train_df['fold'].value_counts()

2    577400
1    577400
0    577400
3    577390
4    577390
Name: fold, dtype: int64

In [78]:
encoded_df['fold'].value_counts()

0    577400
1    577400
2    577400
3    577390
4    577390
Name: fold, dtype: int64

In [80]:
train_df = encoded_df

In [81]:
# Testデータに対しても同様にTargetEncodingを行う
cat_cols = ['wid_cd', 'ken_cd', 'lrg_cd', 'sml_cd'] + ['yado_no']

# すべてのカテゴリについてのループ
# train_df全体で計算する
for column in cat_cols:
    # カテゴリごとの目的変数の統計量についての辞書を作成
    stats = train_df.groupby(column)['target'].agg(['mean', 'var'])
    
    # 平均、分散、最大値、最小値に対する列を作成
    for stat in ['mean', 'var']:
        mapping_dict = dict(stats[stat])
        test_df.loc[:, f'TE_{column}_{stat}'] = test_df[column].map(mapping_dict)


In [82]:
test_df

Unnamed: 0,session_id,yado_no,target,seq_no,seen_yad,max_seq_no,diff_seq_no,is_odd,multiple_visits_flag,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,wid_cd,ken_cd,lrg_cd,sml_cd,popularity,overall_rank,wid_cd_rank,ken_cd_rank,lrg_cd_rank,sml_cd_rank,label_wid_cd,label_ken_cd,label_lrg_cd,label_sml_cd,count_wid_cd,count_ken_cd,count_lrg_cd,count_sml_cd,count_yado_no,TE_wid_cd_mean,TE_wid_cd_var,TE_ken_cd_mean,TE_ken_cd_var,TE_lrg_cd_mean,TE_lrg_cd_var,TE_sml_cd_mean,TE_sml_cd_var,TE_yado_no_mean,TE_yado_no_var
0,00001149e9c73985425197104712478c,3560,0,0,1,1,1,0,0,0,205.0,1.0,0,-1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,52ca3d2824fc3cc90bd4274423badeed,87d9490219b3778f73c41b8176cf30d0,86.0,2418.5,588.0,339.0,22.0,10.0,4,8,93,142,674516,384957,23029,9672,544.0,0.063105,0.059122,0.055998,0.052863,0.061835,0.058014,0.059967,0.056377,0.049632,0.047256
1,00001149e9c73985425197104712478c,4545,0,-1,0,1,-1,-1,0,0,186.0,-1.0,0,-1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,52ca3d2824fc3cc90bd4274423badeed,87d9490219b3778f73c41b8176cf30d0,58.0,3804.5,864.0,505.0,35.0,20.0,4,8,93,142,674516,384957,23029,9672,392.0,0.063105,0.059122,0.055998,0.052863,0.061835,0.058014,0.059967,0.056377,0.079082,0.073014
2,00001149e9c73985425197104712478c,9534,0,-1,0,1,-1,-1,0,0,136.0,1.0,0,-1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,52ca3d2824fc3cc90bd4274423badeed,87d9490219b3778f73c41b8176cf30d0,180.0,536.0,153.0,67.0,6.0,2.0,4,8,93,142,674516,384957,23029,9672,802.0,0.063105,0.059122,0.055998,0.052863,0.061835,0.058014,0.059967,0.056377,0.056110,0.053028
3,00001149e9c73985425197104712478c,5785,0,-1,0,1,-1,-1,0,0,225.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,52ca3d2824fc3cc90bd4274423badeed,87d9490219b3778f73c41b8176cf30d0,82.0,2579.0,618.0,360.0,25.0,12.0,4,8,93,142,674516,384957,23029,9672,363.0,0.063105,0.059122,0.055998,0.052863,0.061835,0.058014,0.059967,0.056377,0.044077,0.042251
4,00001149e9c73985425197104712478c,6563,0,-1,0,1,-1,-1,0,0,408.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,52ca3d2824fc3cc90bd4274423badeed,87d9490219b3778f73c41b8176cf30d0,113.0,1569.5,407.0,218.0,15.0,6.0,4,8,93,142,674516,384957,23029,9672,413.0,0.063105,0.059122,0.055998,0.052863,0.061835,0.058014,0.059967,0.056377,0.050847,0.048379
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1746995,ffffe984aafd6127ce8e43e3ca40c79d,5623,0,-1,0,0,-1,-1,0,0,178.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,c9d5e891463e5389c42d16f987ed30bd,7cf2b4f31fb20747f89e58981e6d9fc1,74.0,2931.5,690.0,402.0,34.0,11.0,4,8,47,130,674516,384957,35281,12803,463.0,0.063105,0.059122,0.055998,0.052863,0.063972,0.059881,0.062329,0.058449,0.073434,0.068189
1746996,ffffe984aafd6127ce8e43e3ca40c79d,11994,0,-1,0,0,-1,-1,0,0,334.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,c9d5e891463e5389c42d16f987ed30bd,372d41b6f39f1f523d0841fd9b84ae44,135.0,1096.5,295.0,147.0,8.0,3.0,4,8,47,66,674516,384957,35281,5408,549.0,0.063105,0.059122,0.055998,0.052863,0.063972,0.059881,0.067493,0.062949,0.071038,0.066112
1746997,ffffe984aafd6127ce8e43e3ca40c79d,3781,0,-1,0,0,-1,-1,0,0,245.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,c9d5e891463e5389c42d16f987ed30bd,9ccc341413e935a914a1ded367b8f80e,183.0,514.0,149.0,65.0,2.0,1.0,4,8,47,129,674516,384957,35281,14675,838.0,0.063105,0.059122,0.055998,0.052863,0.063972,0.059881,0.062283,0.058408,0.057279,0.054063
1746998,ffffe984aafd6127ce8e43e3ca40c79d,634,0,-1,0,0,-1,-1,0,0,174.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,c9d5e891463e5389c42d16f987ed30bd,7cf2b4f31fb20747f89e58981e6d9fc1,62.0,3566.0,814.0,477.0,46.0,14.0,4,8,47,130,674516,384957,35281,12803,288.0,0.063105,0.059122,0.055998,0.052863,0.063972,0.059881,0.062329,0.058449,0.076389,0.070799


In [83]:
all_features = train_df.columns.to_list()
features= [x for x in all_features if x not in ('session_id', 'fold', 'target', 'wid_cd', 'ken_cd', 'lrg_cd', 'sml_cd')]

In [84]:
train_df[features]

Unnamed: 0,yado_no,seq_no,seen_yad,max_seq_no,diff_seq_no,is_odd,multiple_visits_flag,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,popularity,overall_rank,wid_cd_rank,ken_cd_rank,lrg_cd_rank,sml_cd_rank,label_wid_cd,label_ken_cd,label_lrg_cd,label_sml_cd,count_wid_cd,count_ken_cd,count_lrg_cd,count_sml_cd,count_yado_no,TE_wid_cd_mean,TE_wid_cd_var,TE_ken_cd_mean,TE_ken_cd_var,TE_lrg_cd_mean,TE_lrg_cd_var,TE_sml_cd_mean,TE_sml_cd_var,TE_yado_no_mean,TE_yado_no_var
0,96,0,1,1,1,0,0,0,228.0,1.0,0,-1.0,-1.0,-1.0,1.0,150.0,845.5,113.0,10.0,6.0,6.0,6,9,12,15,355660,35814,14299,14011,626,0.060860,0.057156,0.078427,0.072278,0.068568,0.063872,0.066380,0.061979,0.038986,0.037540
1,902,-1,0,1,-1,-1,0,0,240.0,1.0,0,-1.0,-1.0,-1.0,1.0,154.0,785.5,103.0,9.0,5.0,5.0,6,9,12,15,355660,35814,14299,14011,639,0.060860,0.057156,0.078427,0.072278,0.068568,0.063872,0.066380,0.061979,0.067568,0.063124
2,5490,-1,0,1,-1,-1,0,0,116.0,1.0,0,-1.0,-1.0,-1.0,1.0,143.0,945.0,126.0,13.0,9.0,9.0,6,9,12,15,355660,35814,14299,14011,848,0.060860,0.057156,0.078427,0.072278,0.068568,0.063872,0.066380,0.061979,0.053901,0.051068
3,1284,-1,0,1,-1,-1,0,0,176.0,1.0,0,-1.0,-1.0,-1.0,1.0,141.0,979.5,128.0,14.0,10.0,10.0,6,9,12,15,355660,35814,14299,14011,733,0.060860,0.057156,0.078427,0.072278,0.068568,0.063872,0.066380,0.061979,0.054187,0.051335
4,12491,-1,0,1,-1,-1,0,0,144.0,1.0,0,-1.0,-1.0,-1.0,1.0,179.0,547.5,58.0,5.0,2.0,2.0,6,9,12,15,355660,35814,14299,14011,830,0.060860,0.057156,0.078427,0.072278,0.068568,0.063872,0.066380,0.061979,0.069630,0.064877
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
577385,12544,-1,0,1,-1,-1,0,0,122.0,-1.0,0,1.0,-1.0,-1.0,1.0,59.0,3745.0,125.0,14.0,14.0,12.0,7,10,13,17,88600,11219,10395,8972,292,0.072155,0.066949,0.087579,0.079918,0.080724,0.074216,0.076849,0.070953,0.071429,0.066606
577386,12005,-1,0,1,-1,-1,0,0,128.0,1.0,0,1.0,-1.0,-1.0,1.0,56.0,3941.5,134.0,18.0,18.0,15.0,7,10,13,17,88600,11219,10395,8972,163,0.072155,0.066949,0.087579,0.079918,0.080724,0.074216,0.076849,0.070953,0.055556,0.052889
577387,2068,-1,0,1,-1,-1,0,0,46.0,-1.0,0,1.0,-1.0,-1.0,1.0,52.0,4208.5,146.0,20.0,20.0,16.0,7,10,13,17,88600,11219,10395,8972,189,0.072155,0.066949,0.087579,0.079918,0.080724,0.074216,0.076849,0.070953,0.089172,0.081741
577388,5076,-1,0,1,-1,-1,0,0,207.0,1.0,0,-1.0,-1.0,-1.0,1.0,137.0,1056.0,28.0,4.0,4.0,4.0,7,10,13,17,88600,11219,10395,8972,704,0.072155,0.066949,0.087579,0.079918,0.080724,0.074216,0.076849,0.070953,0.069686,0.064943


In [85]:
test_df[features]

Unnamed: 0,yado_no,seq_no,seen_yad,max_seq_no,diff_seq_no,is_odd,multiple_visits_flag,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,popularity,overall_rank,wid_cd_rank,ken_cd_rank,lrg_cd_rank,sml_cd_rank,label_wid_cd,label_ken_cd,label_lrg_cd,label_sml_cd,count_wid_cd,count_ken_cd,count_lrg_cd,count_sml_cd,count_yado_no,TE_wid_cd_mean,TE_wid_cd_var,TE_ken_cd_mean,TE_ken_cd_var,TE_lrg_cd_mean,TE_lrg_cd_var,TE_sml_cd_mean,TE_sml_cd_var,TE_yado_no_mean,TE_yado_no_var
0,3560,0,1,1,1,0,0,0,205.0,1.0,0,-1.0,-1.0,-1.0,1.0,86.0,2418.5,588.0,339.0,22.0,10.0,4,8,93,142,674516,384957,23029,9672,544.0,0.063105,0.059122,0.055998,0.052863,0.061835,0.058014,0.059967,0.056377,0.049632,0.047256
1,4545,-1,0,1,-1,-1,0,0,186.0,-1.0,0,-1.0,-1.0,-1.0,1.0,58.0,3804.5,864.0,505.0,35.0,20.0,4,8,93,142,674516,384957,23029,9672,392.0,0.063105,0.059122,0.055998,0.052863,0.061835,0.058014,0.059967,0.056377,0.079082,0.073014
2,9534,-1,0,1,-1,-1,0,0,136.0,1.0,0,-1.0,-1.0,-1.0,1.0,180.0,536.0,153.0,67.0,6.0,2.0,4,8,93,142,674516,384957,23029,9672,802.0,0.063105,0.059122,0.055998,0.052863,0.061835,0.058014,0.059967,0.056377,0.056110,0.053028
3,5785,-1,0,1,-1,-1,0,0,225.0,1.0,0,1.0,-1.0,-1.0,1.0,82.0,2579.0,618.0,360.0,25.0,12.0,4,8,93,142,674516,384957,23029,9672,363.0,0.063105,0.059122,0.055998,0.052863,0.061835,0.058014,0.059967,0.056377,0.044077,0.042251
4,6563,-1,0,1,-1,-1,0,0,408.0,1.0,0,1.0,-1.0,-1.0,1.0,113.0,1569.5,407.0,218.0,15.0,6.0,4,8,93,142,674516,384957,23029,9672,413.0,0.063105,0.059122,0.055998,0.052863,0.061835,0.058014,0.059967,0.056377,0.050847,0.048379
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1746995,5623,-1,0,0,-1,-1,0,0,178.0,1.0,0,1.0,-1.0,-1.0,1.0,74.0,2931.5,690.0,402.0,34.0,11.0,4,8,47,130,674516,384957,35281,12803,463.0,0.063105,0.059122,0.055998,0.052863,0.063972,0.059881,0.062329,0.058449,0.073434,0.068189
1746996,11994,-1,0,0,-1,-1,0,0,334.0,1.0,0,1.0,-1.0,-1.0,1.0,135.0,1096.5,295.0,147.0,8.0,3.0,4,8,47,66,674516,384957,35281,5408,549.0,0.063105,0.059122,0.055998,0.052863,0.063972,0.059881,0.067493,0.062949,0.071038,0.066112
1746997,3781,-1,0,0,-1,-1,0,0,245.0,1.0,0,1.0,-1.0,-1.0,1.0,183.0,514.0,149.0,65.0,2.0,1.0,4,8,47,129,674516,384957,35281,14675,838.0,0.063105,0.059122,0.055998,0.052863,0.063972,0.059881,0.062283,0.058408,0.057279,0.054063
1746998,634,-1,0,0,-1,-1,0,0,174.0,1.0,0,1.0,-1.0,-1.0,1.0,62.0,3566.0,814.0,477.0,46.0,14.0,4,8,47,130,674516,384957,35281,12803,288.0,0.063105,0.059122,0.055998,0.052863,0.063972,0.059881,0.062329,0.058449,0.076389,0.070799


In [86]:
train_df.columns.to_list()

['session_id',
 'yado_no',
 'target',
 'seq_no',
 'seen_yad',
 'max_seq_no',
 'diff_seq_no',
 'is_odd',
 'multiple_visits_flag',
 'yad_type',
 'total_room_cnt',
 'wireless_lan_flg',
 'onsen_flg',
 'kd_stn_5min',
 'kd_bch_5min',
 'kd_slp_5min',
 'kd_conv_walk_5min',
 'wid_cd',
 'ken_cd',
 'lrg_cd',
 'sml_cd',
 'popularity',
 'overall_rank',
 'wid_cd_rank',
 'ken_cd_rank',
 'lrg_cd_rank',
 'sml_cd_rank',
 'label_wid_cd',
 'label_ken_cd',
 'label_lrg_cd',
 'label_sml_cd',
 'count_wid_cd',
 'count_ken_cd',
 'count_lrg_cd',
 'count_sml_cd',
 'count_yado_no',
 'fold',
 'TE_wid_cd_mean',
 'TE_wid_cd_var',
 'TE_ken_cd_mean',
 'TE_ken_cd_var',
 'TE_lrg_cd_mean',
 'TE_lrg_cd_var',
 'TE_sml_cd_mean',
 'TE_sml_cd_var',
 'TE_yado_no_mean',
 'TE_yado_no_var']

In [87]:
import gc
gc.collect()

105

In [88]:
test_df

Unnamed: 0,session_id,yado_no,target,seq_no,seen_yad,max_seq_no,diff_seq_no,is_odd,multiple_visits_flag,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,wid_cd,ken_cd,lrg_cd,sml_cd,popularity,overall_rank,wid_cd_rank,ken_cd_rank,lrg_cd_rank,sml_cd_rank,label_wid_cd,label_ken_cd,label_lrg_cd,label_sml_cd,count_wid_cd,count_ken_cd,count_lrg_cd,count_sml_cd,count_yado_no,TE_wid_cd_mean,TE_wid_cd_var,TE_ken_cd_mean,TE_ken_cd_var,TE_lrg_cd_mean,TE_lrg_cd_var,TE_sml_cd_mean,TE_sml_cd_var,TE_yado_no_mean,TE_yado_no_var
0,00001149e9c73985425197104712478c,3560,0,0,1,1,1,0,0,0,205.0,1.0,0,-1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,52ca3d2824fc3cc90bd4274423badeed,87d9490219b3778f73c41b8176cf30d0,86.0,2418.5,588.0,339.0,22.0,10.0,4,8,93,142,674516,384957,23029,9672,544.0,0.063105,0.059122,0.055998,0.052863,0.061835,0.058014,0.059967,0.056377,0.049632,0.047256
1,00001149e9c73985425197104712478c,4545,0,-1,0,1,-1,-1,0,0,186.0,-1.0,0,-1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,52ca3d2824fc3cc90bd4274423badeed,87d9490219b3778f73c41b8176cf30d0,58.0,3804.5,864.0,505.0,35.0,20.0,4,8,93,142,674516,384957,23029,9672,392.0,0.063105,0.059122,0.055998,0.052863,0.061835,0.058014,0.059967,0.056377,0.079082,0.073014
2,00001149e9c73985425197104712478c,9534,0,-1,0,1,-1,-1,0,0,136.0,1.0,0,-1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,52ca3d2824fc3cc90bd4274423badeed,87d9490219b3778f73c41b8176cf30d0,180.0,536.0,153.0,67.0,6.0,2.0,4,8,93,142,674516,384957,23029,9672,802.0,0.063105,0.059122,0.055998,0.052863,0.061835,0.058014,0.059967,0.056377,0.056110,0.053028
3,00001149e9c73985425197104712478c,5785,0,-1,0,1,-1,-1,0,0,225.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,52ca3d2824fc3cc90bd4274423badeed,87d9490219b3778f73c41b8176cf30d0,82.0,2579.0,618.0,360.0,25.0,12.0,4,8,93,142,674516,384957,23029,9672,363.0,0.063105,0.059122,0.055998,0.052863,0.061835,0.058014,0.059967,0.056377,0.044077,0.042251
4,00001149e9c73985425197104712478c,6563,0,-1,0,1,-1,-1,0,0,408.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,52ca3d2824fc3cc90bd4274423badeed,87d9490219b3778f73c41b8176cf30d0,113.0,1569.5,407.0,218.0,15.0,6.0,4,8,93,142,674516,384957,23029,9672,413.0,0.063105,0.059122,0.055998,0.052863,0.061835,0.058014,0.059967,0.056377,0.050847,0.048379
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1746995,ffffe984aafd6127ce8e43e3ca40c79d,5623,0,-1,0,0,-1,-1,0,0,178.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,c9d5e891463e5389c42d16f987ed30bd,7cf2b4f31fb20747f89e58981e6d9fc1,74.0,2931.5,690.0,402.0,34.0,11.0,4,8,47,130,674516,384957,35281,12803,463.0,0.063105,0.059122,0.055998,0.052863,0.063972,0.059881,0.062329,0.058449,0.073434,0.068189
1746996,ffffe984aafd6127ce8e43e3ca40c79d,11994,0,-1,0,0,-1,-1,0,0,334.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,c9d5e891463e5389c42d16f987ed30bd,372d41b6f39f1f523d0841fd9b84ae44,135.0,1096.5,295.0,147.0,8.0,3.0,4,8,47,66,674516,384957,35281,5408,549.0,0.063105,0.059122,0.055998,0.052863,0.063972,0.059881,0.067493,0.062949,0.071038,0.066112
1746997,ffffe984aafd6127ce8e43e3ca40c79d,3781,0,-1,0,0,-1,-1,0,0,245.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,c9d5e891463e5389c42d16f987ed30bd,9ccc341413e935a914a1ded367b8f80e,183.0,514.0,149.0,65.0,2.0,1.0,4,8,47,129,674516,384957,35281,14675,838.0,0.063105,0.059122,0.055998,0.052863,0.063972,0.059881,0.062283,0.058408,0.057279,0.054063
1746998,ffffe984aafd6127ce8e43e3ca40c79d,634,0,-1,0,0,-1,-1,0,0,174.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,c9d5e891463e5389c42d16f987ed30bd,7cf2b4f31fb20747f89e58981e6d9fc1,62.0,3566.0,814.0,477.0,46.0,14.0,4,8,47,130,674516,384957,35281,12803,288.0,0.063105,0.059122,0.055998,0.052863,0.063972,0.059881,0.062329,0.058449,0.076389,0.070799


In [89]:
train_df

Unnamed: 0,session_id,yado_no,target,seq_no,seen_yad,max_seq_no,diff_seq_no,is_odd,multiple_visits_flag,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,wid_cd,ken_cd,lrg_cd,sml_cd,popularity,overall_rank,wid_cd_rank,ken_cd_rank,lrg_cd_rank,sml_cd_rank,label_wid_cd,label_ken_cd,label_lrg_cd,label_sml_cd,count_wid_cd,count_ken_cd,count_lrg_cd,count_sml_cd,count_yado_no,fold,TE_wid_cd_mean,TE_wid_cd_var,TE_ken_cd_mean,TE_ken_cd_var,TE_lrg_cd_mean,TE_lrg_cd_var,TE_sml_cd_mean,TE_sml_cd_var,TE_yado_no_mean,TE_yado_no_var
0,000104bdffaaad1a1e0a9ebacf585f33,96,1,0,1,1,1,0,0,0,228.0,1.0,0,-1.0,-1.0,-1.0,1.0,e9316013ee1b03f4525fe361c46ce9c5,84efa50e52f9b471c95bfc3b21b854ad,a1370d90ed3b80ee41311bbbab46aec9,d72674f02c5340d90f245e3177727650,150.0,845.5,113.0,10.0,6.0,6.0,6,9,12,15,355660,35814,14299,14011,626,0,0.060860,0.057156,0.078427,0.072278,0.068568,0.063872,0.066380,0.061979,0.038986,0.037540
1,000104bdffaaad1a1e0a9ebacf585f33,902,0,-1,0,1,-1,-1,0,0,240.0,1.0,0,-1.0,-1.0,-1.0,1.0,e9316013ee1b03f4525fe361c46ce9c5,84efa50e52f9b471c95bfc3b21b854ad,a1370d90ed3b80ee41311bbbab46aec9,d72674f02c5340d90f245e3177727650,154.0,785.5,103.0,9.0,5.0,5.0,6,9,12,15,355660,35814,14299,14011,639,0,0.060860,0.057156,0.078427,0.072278,0.068568,0.063872,0.066380,0.061979,0.067568,0.063124
2,000104bdffaaad1a1e0a9ebacf585f33,5490,0,-1,0,1,-1,-1,0,0,116.0,1.0,0,-1.0,-1.0,-1.0,1.0,e9316013ee1b03f4525fe361c46ce9c5,84efa50e52f9b471c95bfc3b21b854ad,a1370d90ed3b80ee41311bbbab46aec9,d72674f02c5340d90f245e3177727650,143.0,945.0,126.0,13.0,9.0,9.0,6,9,12,15,355660,35814,14299,14011,848,0,0.060860,0.057156,0.078427,0.072278,0.068568,0.063872,0.066380,0.061979,0.053901,0.051068
3,000104bdffaaad1a1e0a9ebacf585f33,1284,0,-1,0,1,-1,-1,0,0,176.0,1.0,0,-1.0,-1.0,-1.0,1.0,e9316013ee1b03f4525fe361c46ce9c5,84efa50e52f9b471c95bfc3b21b854ad,a1370d90ed3b80ee41311bbbab46aec9,d72674f02c5340d90f245e3177727650,141.0,979.5,128.0,14.0,10.0,10.0,6,9,12,15,355660,35814,14299,14011,733,0,0.060860,0.057156,0.078427,0.072278,0.068568,0.063872,0.066380,0.061979,0.054187,0.051335
4,000104bdffaaad1a1e0a9ebacf585f33,12491,0,-1,0,1,-1,-1,0,0,144.0,1.0,0,-1.0,-1.0,-1.0,1.0,e9316013ee1b03f4525fe361c46ce9c5,84efa50e52f9b471c95bfc3b21b854ad,a1370d90ed3b80ee41311bbbab46aec9,d72674f02c5340d90f245e3177727650,179.0,547.5,58.0,5.0,2.0,2.0,6,9,12,15,355660,35814,14299,14011,830,0,0.060860,0.057156,0.078427,0.072278,0.068568,0.063872,0.066380,0.061979,0.069630,0.064877
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
577385,fffe8c99c5b332190c3d4a2d6e7c5073,12544,0,-1,0,1,-1,-1,0,0,122.0,-1.0,0,1.0,-1.0,-1.0,1.0,43875109d1dab93592812c50d18270a7,7d3db9a7acad537c322f85f7cef0beda,34f448c0371e84f8fe5a079d8d04211e,97de2b919a8ec19e656e5913804b0c84,59.0,3745.0,125.0,14.0,14.0,12.0,7,10,13,17,88600,11219,10395,8972,292,4,0.072155,0.066949,0.087579,0.079918,0.080724,0.074216,0.076849,0.070953,0.071429,0.066606
577386,fffe8c99c5b332190c3d4a2d6e7c5073,12005,0,-1,0,1,-1,-1,0,0,128.0,1.0,0,1.0,-1.0,-1.0,1.0,43875109d1dab93592812c50d18270a7,7d3db9a7acad537c322f85f7cef0beda,34f448c0371e84f8fe5a079d8d04211e,97de2b919a8ec19e656e5913804b0c84,56.0,3941.5,134.0,18.0,18.0,15.0,7,10,13,17,88600,11219,10395,8972,163,4,0.072155,0.066949,0.087579,0.079918,0.080724,0.074216,0.076849,0.070953,0.055556,0.052889
577387,fffe8c99c5b332190c3d4a2d6e7c5073,2068,0,-1,0,1,-1,-1,0,0,46.0,-1.0,0,1.0,-1.0,-1.0,1.0,43875109d1dab93592812c50d18270a7,7d3db9a7acad537c322f85f7cef0beda,34f448c0371e84f8fe5a079d8d04211e,97de2b919a8ec19e656e5913804b0c84,52.0,4208.5,146.0,20.0,20.0,16.0,7,10,13,17,88600,11219,10395,8972,189,4,0.072155,0.066949,0.087579,0.079918,0.080724,0.074216,0.076849,0.070953,0.089172,0.081741
577388,fffe8c99c5b332190c3d4a2d6e7c5073,5076,0,-1,0,1,-1,-1,0,0,207.0,1.0,0,-1.0,-1.0,-1.0,1.0,43875109d1dab93592812c50d18270a7,7d3db9a7acad537c322f85f7cef0beda,34f448c0371e84f8fe5a079d8d04211e,97de2b919a8ec19e656e5913804b0c84,137.0,1056.0,28.0,4.0,4.0,4.0,7,10,13,17,88600,11219,10395,8972,704,4,0.072155,0.066949,0.087579,0.079918,0.080724,0.074216,0.076849,0.070953,0.069686,0.064943


In [90]:
train_df.to_parquet('../data/feature_engineering_v4_train_df.parquet', index=False)
test_df.to_parquet('../data/feature_engineering_v4_test_df.parquet', index=False)