In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm


from sklearn.model_selection import KFold
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import StratifiedKFold
from nyaggle.feature.category_encoder import TargetEncoder

from nyaggle.experiment import run_experiment

In [2]:
train_log_df = pd.read_csv('../train_log.csv')
test_log_df = pd.read_csv('../test_log.csv')
train_label_df = pd.read_csv('../train_label.csv')
test_session_df = pd.read_csv('../test_session.csv')
yado_df = pd.read_csv('../yado.csv')
# image_embeddings_df = pd.read_parquet('../image_embeddings.parquet')
sample_submission_df = pd.read_csv('../sample_submission.csv')

candidate_train_df = pd.read_csv('../data/candidate_ver7_train.csv')
candidate_test_df = pd.read_csv('../data/candidate_ver7_test.csv')

#### Trainに正例と負例のフラグを付与

In [3]:
# 学習用データ
# 教師データに正例と負例のフラグを付与
train_target = []

# 各セッションに対して候補の宿をチェック
for index, row in tqdm(train_label_df.iterrows(), total=train_label_df.shape[0]):
    session_id = row['session_id']
    yad_no = row['yad_no']
    
    # candidate_ver7_train.csv の該当行を取得（同じインデックスを仮定）
    candidate_row = candidate_train_df.iloc[index]

    # 50個の購入候補の宿に対してチェック
    for col in candidate_train_df.columns:
        yado_no = candidate_row[col]
        is_correct = (yado_no == yad_no)  # 一致するかどうかのチェック
        train_target.append([session_id, yado_no, is_correct])

# 結果をDataFrameに変換
train_df = pd.DataFrame(train_target, columns=['session_id', 'yado_no', 'target']) #区別するためにあえてyad_noではなくyado_noとしている
train_df['target'] = train_df['target'].astype(int)  # target列をint型に変換

# 最初の数行を表示して確認
print(train_df.head())

print('正例と負例の数を確認')
print(train_df['target'].value_counts())

100%|██████████| 288698/288698 [01:21<00:00, 3557.96it/s]


                         session_id  yado_no  target
0  000007603d533d30453cc45d0f3d119f    11882       0
1  000007603d533d30453cc45d0f3d119f     2808       0
2  000007603d533d30453cc45d0f3d119f     4101       1
3  000007603d533d30453cc45d0f3d119f     5289       0
4  000007603d533d30453cc45d0f3d119f     9187       0
正例と負例の数を確認
0    14200708
1      234192
Name: target, dtype: int64


#### 推論用データの作成

In [4]:
# 推論用データにダミーのyado_noを付与
test_session_df['yad_no'] = -1

test_target = []

# 各セッションに対して候補の宿をチェック
for index, row in tqdm(test_session_df.iterrows(), total=test_session_df.shape[0]):
    session_id = row['session_id']
    yad_no = row['yad_no']
    
    # candidate_ver7_test.csv の該当行を取得（同じインデックスを仮定）
    candidate_row = candidate_test_df.iloc[index]

    # 50個の購入候補の宿に対してチェック
    for col in candidate_test_df.columns:
        yado_no = candidate_row[col]
        is_correct = (yado_no == yad_no)  # 一致するかどうかのチェック
        test_target.append([session_id, yado_no, is_correct])

# 結果をDataFrameに変換
test_df = pd.DataFrame(test_target, columns=['session_id', 'yado_no', 'target']) #区別するためにあえてyad_noではなくyado_noとしている
test_df['target'] = test_df['target'].astype(int)  # target列をint型に変換

# 最初の数行を表示して確認
print(test_df.head())

print('正例と負例の数を確認')
print(test_df['target'].value_counts()) # すべて0になるはず

100%|██████████| 174700/174700 [00:48<00:00, 3595.15it/s]


                         session_id  yado_no  target
0  00001149e9c73985425197104712478c     3560       0
1  00001149e9c73985425197104712478c     4545       0
2  00001149e9c73985425197104712478c     9534       0
3  00001149e9c73985425197104712478c     6563       0
4  00001149e9c73985425197104712478c     4420       0
正例と負例の数を確認
0    8735000
Name: target, dtype: int64


#### seq_noを追加

In [5]:
# seq_noをマージする

# train_log の session_id と yad_no の組み合わせに基づいて辞書を作成
seq_no_dict_train = dict(zip(zip(train_log_df['session_id'], train_log_df['yad_no']), train_log_df['seq_no']))
# train_df に seq_no 列を追加（tqdm で進捗表示）
train_df['seq_no'] = [seq_no_dict_train.get((row['session_id'], row['yado_no']), -1) for row in tqdm(train_df.to_dict('records'))]
# 結果の確認
print(train_df.head())


# test_log の session_id と yad_no の組み合わせに基づいて辞書を作成
seq_no_dict_test = dict(zip(zip(test_log_df['session_id'], test_log_df['yad_no']), test_log_df['seq_no']))
# test_df に seq_no 列を追加（tqdm で進捗表示）
test_df['seq_no'] = [seq_no_dict_test.get((row['session_id'], row['yado_no']), -1) for row in tqdm(test_df.to_dict('records'))]
# 結果の確認
print(test_df.head())

100%|██████████| 14434900/14434900 [00:05<00:00, 2642375.62it/s]


                         session_id  yado_no  target  seq_no
0  000007603d533d30453cc45d0f3d119f    11882       0      -1
1  000007603d533d30453cc45d0f3d119f     2808       0      -1
2  000007603d533d30453cc45d0f3d119f     4101       1      -1
3  000007603d533d30453cc45d0f3d119f     5289       0      -1
4  000007603d533d30453cc45d0f3d119f     9187       0      -1


100%|██████████| 8735000/8735000 [00:03<00:00, 2777383.86it/s]


                         session_id  yado_no  target  seq_no
0  00001149e9c73985425197104712478c     3560       0       0
1  00001149e9c73985425197104712478c     4545       0      -1
2  00001149e9c73985425197104712478c     9534       0      -1
3  00001149e9c73985425197104712478c     6563       0      -1
4  00001149e9c73985425197104712478c     4420       0      -1


In [6]:
#　※複数のseq_noがある場合は、0も混入する
print(train_df['seq_no'].value_counts())
print(test_df['seq_no'].value_counts())

-1    14325447
 0       88128
 1       16052
 2        4125
 3         837
 4         223
 5          65
 6          18
 7           4
 8           1
Name: seq_no, dtype: int64
-1    8671604
 0      51884
 1       8692
 2       2239
 3        428
 4        124
 5         22
 6          7
Name: seq_no, dtype: int64


#### 候補の宿がsession中に閲覧されていたかどうか

In [7]:
# 'seq_no'が−1でなければ1のフラグを立てる
train_df['seen_yad'] = train_df['seq_no'].apply(lambda x: 1 if x != -1 else 0)
test_df['seen_yad'] = test_df['seq_no'].apply(lambda x: 1 if x != -1 else 0)

In [8]:
print(train_df['seen_yad'].value_counts())  
print(test_df['seen_yad'].value_counts())

0    14325447
1      109453
Name: seen_yad, dtype: int64
0    8671604
1      63396
Name: seen_yad, dtype: int64


#### 各sessionにおける最大seq_no

In [9]:
# 各セッションの seq_no の最大値を計算
max_seq_no_train = train_log_df.groupby('session_id')['seq_no'].max().reset_index()
max_seq_no_train.rename(columns={'seq_no': 'max_seq_no'}, inplace=True)
# teacher_df と max_seq_no のマージ
train_df = train_df.merge(max_seq_no_train, on='session_id', how='left')
# 結果の確認
print(train_df.head())


# 各セッションの seq_no の最大値を計算
max_seq_no_test = test_log_df.groupby('session_id')['seq_no'].max().reset_index()
max_seq_no_test.rename(columns={'seq_no': 'max_seq_no'}, inplace=True)
# teacher_df と max_seq_no のマージ
test_df = test_df.merge(max_seq_no_test, on='session_id', how='left')
# 結果の確認
print(test_df.head())

                         session_id  yado_no  target  seq_no  seen_yad  \
0  000007603d533d30453cc45d0f3d119f    11882       0      -1         0   
1  000007603d533d30453cc45d0f3d119f     2808       0      -1         0   
2  000007603d533d30453cc45d0f3d119f     4101       1      -1         0   
3  000007603d533d30453cc45d0f3d119f     5289       0      -1         0   
4  000007603d533d30453cc45d0f3d119f     9187       0      -1         0   

   max_seq_no  
0           0  
1           0  
2           0  
3           0  
4           0  
                         session_id  yado_no  target  seq_no  seen_yad  \
0  00001149e9c73985425197104712478c     3560       0       0         1   
1  00001149e9c73985425197104712478c     4545       0      -1         0   
2  00001149e9c73985425197104712478c     9534       0      -1         0   
3  00001149e9c73985425197104712478c     6563       0      -1         0   
4  00001149e9c73985425197104712478c     4420       0      -1         0   

   max_seq_no 

In [10]:
print(train_df['max_seq_no'].value_counts())
print(test_df['max_seq_no'].value_counts())

0    9269300
1    4139650
2     767500
3     201250
4      41650
5      11150
6       3250
7        900
8        200
9         50
Name: max_seq_no, dtype: int64
0    5697000
1    2474650
2     422950
3     111350
4      21400
5       6200
6       1100
7        350
Name: max_seq_no, dtype: int64


#### 差分の考慮：max_seq_noから(-1以外の要素)でseq_noを引く

In [11]:
# 最初に全ての diff_seq_no を -1 に設定
train_df['diff_seq_no'] = -1
test_df['diff_seq_no'] = -1

# seq_no が -1 以外の行にのみ max_seq_no - seq_no の計算を適用
train_df.loc[train_df['seq_no'] != -1, 'diff_seq_no'] = train_df['max_seq_no'] - train_df['seq_no']
test_df.loc[test_df['seq_no'] != -1, 'diff_seq_no'] = test_df['max_seq_no'] - test_df['seq_no']

# 結果の確認
print(train_df.head())
print(test_df.head())

                         session_id  yado_no  target  seq_no  seen_yad  \
0  000007603d533d30453cc45d0f3d119f    11882       0      -1         0   
1  000007603d533d30453cc45d0f3d119f     2808       0      -1         0   
2  000007603d533d30453cc45d0f3d119f     4101       1      -1         0   
3  000007603d533d30453cc45d0f3d119f     5289       0      -1         0   
4  000007603d533d30453cc45d0f3d119f     9187       0      -1         0   

   max_seq_no  diff_seq_no  
0           0           -1  
1           0           -1  
2           0           -1  
3           0           -1  
4           0           -1  
                         session_id  yado_no  target  seq_no  seen_yad  \
0  00001149e9c73985425197104712478c     3560       0       0         1   
1  00001149e9c73985425197104712478c     4545       0      -1         0   
2  00001149e9c73985425197104712478c     9534       0      -1         0   
3  00001149e9c73985425197104712478c     6563       0      -1         0   
4  00001149

In [12]:
print(train_df['diff_seq_no'].value_counts())
print(test_df['diff_seq_no'].value_counts())

-1    14325447
 1      103312
 2        5684
 3         407
 4          49
 5           1
Name: diff_seq_no, dtype: int64
-1    8671604
 1      60760
 2       2528
 3        106
 4          2
Name: diff_seq_no, dtype: int64


#### diff_seq_no が奇数かどうかの判定 

In [13]:
# diff_seq_no が奇数かどうかの判定（seq_no が -1 の場合は除外）
train_df['is_odd'] = np.where(train_df['seq_no'] != -1, train_df['diff_seq_no'] % 2 == 1, -1)
test_df['is_odd'] = np.where(test_df['seq_no'] != -1, test_df['diff_seq_no'] % 2 == 1, -1)

# seq_no が -1 以外の場合、Trueを0、Falseを1に変換（すでに -1 の場合は変更しない）
train_df['is_odd'] = np.where(train_df['is_odd'] != -1, np.where(train_df['is_odd'], 0, 1), -1)
test_df['is_odd'] = np.where(test_df['is_odd'] != -1, np.where(test_df['is_odd'], 0, 1), -1)

# 結果の確認
print(train_df.head())
print(test_df.head())

                         session_id  yado_no  target  seq_no  seen_yad  \
0  000007603d533d30453cc45d0f3d119f    11882       0      -1         0   
1  000007603d533d30453cc45d0f3d119f     2808       0      -1         0   
2  000007603d533d30453cc45d0f3d119f     4101       1      -1         0   
3  000007603d533d30453cc45d0f3d119f     5289       0      -1         0   
4  000007603d533d30453cc45d0f3d119f     9187       0      -1         0   

   max_seq_no  diff_seq_no  is_odd  
0           0           -1      -1  
1           0           -1      -1  
2           0           -1      -1  
3           0           -1      -1  
4           0           -1      -1  
                         session_id  yado_no  target  seq_no  seen_yad  \
0  00001149e9c73985425197104712478c     3560       0       0         1   
1  00001149e9c73985425197104712478c     4545       0      -1         0   
2  00001149e9c73985425197104712478c     9534       0      -1         0   
3  00001149e9c73985425197104712478c  

In [14]:
print(train_df['is_odd'].value_counts())
print(test_df['is_odd'].value_counts())

-1    14325447
 0      103720
 1        5733
Name: is_odd, dtype: int64
-1    8671604
 0      60866
 1       2530
Name: is_odd, dtype: int64


#### 各sessionにおいて2回以上出現したyad_noがあれば1のフラグを立てる

In [15]:
# 各セッションにおいて2回以上登場する yad_no を特定
yad_no_counts = train_log_df.groupby(['session_id', 'yad_no']).size().reset_index(name='count')
multiple_visits = yad_no_counts[yad_no_counts['count'] > 1]

# (session_id, yad_no) のタプルをキーとして辞書を作成
multiple_visits_dict = {(row['session_id'], row['yad_no']): 1 for _, row in multiple_visits.iterrows()}

# teacher_df に multiple_visits_flag 列を追加（tqdm で進捗表示）
train_df['multiple_visits_flag'] = [multiple_visits_dict.get((row['session_id'], row['yado_no']), 0) 
                                      for row in tqdm(train_df.to_dict('records'))]

# 結果の確認
print(train_df.head())



# 各セッションにおいて2回以上登場する yad_no を特定
yad_no_counts = test_log_df.groupby(['session_id', 'yad_no']).size().reset_index(name='count')
multiple_visits = yad_no_counts[yad_no_counts['count'] > 1]

# (session_id, yad_no) のタプルをキーとして辞書を作成
multiple_visits_dict = {(row['session_id'], row['yad_no']): 1 for _, row in multiple_visits.iterrows()}

# teacher_df に multiple_visits_flag 列を追加（tqdm で進捗表示）
test_df['multiple_visits_flag'] = [multiple_visits_dict.get((row['session_id'], row['yado_no']), 0) 
                                      for row in tqdm(test_df.to_dict('records'))]

# 結果の確認
print(test_df.head())

100%|██████████| 14434900/14434900 [00:04<00:00, 3163887.77it/s]


                         session_id  yado_no  target  seq_no  seen_yad  \
0  000007603d533d30453cc45d0f3d119f    11882       0      -1         0   
1  000007603d533d30453cc45d0f3d119f     2808       0      -1         0   
2  000007603d533d30453cc45d0f3d119f     4101       1      -1         0   
3  000007603d533d30453cc45d0f3d119f     5289       0      -1         0   
4  000007603d533d30453cc45d0f3d119f     9187       0      -1         0   

   max_seq_no  diff_seq_no  is_odd  multiple_visits_flag  
0           0           -1      -1                     0  
1           0           -1      -1                     0  
2           0           -1      -1                     0  
3           0           -1      -1                     0  
4           0           -1      -1                     0  


100%|██████████| 8735000/8735000 [00:02<00:00, 3271419.81it/s]


                         session_id  yado_no  target  seq_no  seen_yad  \
0  00001149e9c73985425197104712478c     3560       0       0         1   
1  00001149e9c73985425197104712478c     4545       0      -1         0   
2  00001149e9c73985425197104712478c     9534       0      -1         0   
3  00001149e9c73985425197104712478c     6563       0      -1         0   
4  00001149e9c73985425197104712478c     4420       0      -1         0   

   max_seq_no  diff_seq_no  is_odd  multiple_visits_flag  
0           1            1       0                     0  
1           1           -1      -1                     0  
2           1           -1      -1                     0  
3           1           -1      -1                     0  
4           1           -1      -1                     0  


In [16]:
print(train_df['multiple_visits_flag'].value_counts())
print(test_df['multiple_visits_flag'].value_counts())

0    14430252
1        4648
Name: multiple_visits_flag, dtype: int64
0    8732359
1       2641
Name: multiple_visits_flag, dtype: int64


### yado_dfに関する特徴量エンジニアリング

In [17]:
yado_df

Unnamed: 0,yad_no,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,wid_cd,ken_cd,lrg_cd,sml_cd
0,1,0,129.0,1.0,0,1.0,,,1.0,f0112abf369fb03cdc5f5309300913da,072c85e1653e10c9c7dd065ad007125a,449c52ef581d5f9ef311189469a0520e,677a32689cd1ad74e867f1fbe43a3e1c
1,2,0,23.0,1.0,0,,,,,d86102dd9c232bade9a97dccad40df48,b4d2fb4e51ea7bca80eb1270aa474a54,5c9a8f48e9df0234da012747a02d4b29,4ee16ee838dd2703cc9a1d5a535f0ced
2,3,0,167.0,1.0,1,1.0,,,1.0,46e33861f921c3e38b81998fbf283f01,572d60f0f5212aacda515ebf81fb0a3a,8a623b960557e87bd1f4edf71b6255be,ab9480fd72a44d51690ab16c4ad4d49c
3,4,0,144.0,1.0,0,1.0,,,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,52c9ea83f2cfe92be54cb6bc961edf21,1cc3e1838bb0fd0fde0396130b1f82b9
4,5,0,41.0,1.0,1,,,,,43875109d1dab93592812c50d18270a7,75617bb07a2785a948ab1958909211f1,9ea5a911019b66ccd42f556c42a2fe2f,be1b876af18afc4deeb3081591d2a910
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13801,13802,0,10.0,1.0,1,,,,,c312e07b7a5d456d53a5b00910a336e1,558ac1909f0318b82c621ab250329d6d,80fb3c5ad0c89931d0923e9f80885218,5eb30820716082c720836733d73c605e
13802,13803,0,,,0,1.0,,,1.0,dc414a17890cfc17d011d5038b88ca93,d78f53d0856617bc782f02c3280dfef2,e5cfcc0a43c82072aca11628ff0add53,20ad8785a30f125bee5a8a325782ab06
13803,13804,0,80.0,1.0,1,,1.0,,1.0,d86102dd9c232bade9a97dccad40df48,7d76599bd27ff9e7823b2b1323ca763e,c5fe8848b6ab39b040cdb3668aea9433,b3eab50ccf6ffb51c37d36ee384abfbf
13804,13805,0,8.0,1.0,1,,,,1.0,3300cf6f774b7c6a5807110f244cbc21,689cf8289e7ea0b2eef1b017dcdfe8de,8b712435430a6875839a6c3b5a40b008,2b4165444a777465576b25f65697d739


In [18]:
# train_log_df と test_log_df を結合
log_df = pd.concat([train_log_df, test_log_df], axis=0, ignore_index=True)
# yado_df と結合
merged_df = log_df.merge(yado_df, on='yad_no', how='left')


# 各宿の人気度を計算（例：訪問回数で計算）
yad_popularity = merged_df['yad_no'].value_counts().reset_index()
yad_popularity.columns = ['yad_no', 'popularity']

# 全体での人気度ランキング
yad_popularity['overall_rank'] = yad_popularity['popularity'].rank(ascending=False)
# yado_df に人気度をマージ
yado_df = yado_df.merge(yad_popularity, on='yad_no', how='left')
# エリアごとの人気度ランキングを計算
for area in ['wid_cd', 'ken_cd', 'lrg_cd', 'sml_cd']:
    yado_df[f'{area}_rank'] = yado_df.groupby(area)['popularity'].rank(ascending=False, method='min')

# 結果の確認
print(yado_df.head())

   yad_no  yad_type  total_room_cnt  wireless_lan_flg  onsen_flg  kd_stn_5min  \
0       1         0           129.0               1.0          0          1.0   
1       2         0            23.0               1.0          0          NaN   
2       3         0           167.0               1.0          1          1.0   
3       4         0           144.0               1.0          0          1.0   
4       5         0            41.0               1.0          1          NaN   

   kd_bch_5min  kd_slp_5min  kd_conv_walk_5min  \
0          NaN          NaN                1.0   
1          NaN          NaN                NaN   
2          NaN          NaN                1.0   
3          NaN          NaN                1.0   
4          NaN          NaN                NaN   

                             wid_cd                            ken_cd  \
0  f0112abf369fb03cdc5f5309300913da  072c85e1653e10c9c7dd065ad007125a   
1  d86102dd9c232bade9a97dccad40df48  b4d2fb4e51ea7bca80eb1270aa474

In [19]:
#　train_dfとtest_dfにyado_dfをマージ
train_df = pd.merge(train_df, yado_df.rename(columns={'yad_no': 'yado_no'}), on='yado_no', how='left')
test_df = pd.merge(test_df, yado_df.rename(columns={'yad_no': 'yado_no'}), on='yado_no', how='left')

In [20]:
train_df

Unnamed: 0,session_id,yado_no,target,seq_no,seen_yad,max_seq_no,diff_seq_no,is_odd,multiple_visits_flag,yad_type,...,wid_cd,ken_cd,lrg_cd,sml_cd,popularity,overall_rank,wid_cd_rank,ken_cd_rank,lrg_cd_rank,sml_cd_rank
0,000007603d533d30453cc45d0f3d119f,11882,0,-1,0,0,-1,-1,0,0,...,dc414a17890cfc17d011d5038b88ca93,d78f53d0856617bc782f02c3280dfef2,4fd631b15116098340cdb099c86a5a40,4044dac1931ddaa5a967e09506d76343,23.0,7189.5,891.0,286.0,25.0,21.0
1,000007603d533d30453cc45d0f3d119f,2808,0,-1,0,0,-1,-1,0,0,...,dc414a17890cfc17d011d5038b88ca93,d78f53d0856617bc782f02c3280dfef2,4fd631b15116098340cdb099c86a5a40,4044dac1931ddaa5a967e09506d76343,36.0,5594.5,687.0,229.0,17.0,17.0
2,000007603d533d30453cc45d0f3d119f,4101,1,-1,0,0,-1,-1,0,0,...,dc414a17890cfc17d011d5038b88ca93,d78f53d0856617bc782f02c3280dfef2,4fd631b15116098340cdb099c86a5a40,4044dac1931ddaa5a967e09506d76343,24.0,7045.0,874.0,280.0,23.0,19.0
3,000007603d533d30453cc45d0f3d119f,5289,0,-1,0,0,-1,-1,0,0,...,dc414a17890cfc17d011d5038b88ca93,d78f53d0856617bc782f02c3280dfef2,4fd631b15116098340cdb099c86a5a40,4044dac1931ddaa5a967e09506d76343,14.0,8690.5,1073.0,334.0,37.0,33.0
4,000007603d533d30453cc45d0f3d119f,9187,0,-1,0,0,-1,-1,0,0,...,dc414a17890cfc17d011d5038b88ca93,6920865be128aa14814810654738b159,828bd0261886a914435f0434dbfc2264,2eac3ef54f291530cfeae907b8823eaf,287.0,162.5,16.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14434895,fffffa7baf370083ebcdd98f26a7e31a,8609,0,-1,0,1,-1,-1,0,0,...,46e33861f921c3e38b81998fbf283f01,572d60f0f5212aacda515ebf81fb0a3a,8a623b960557e87bd1f4edf71b6255be,ab9480fd72a44d51690ab16c4ad4d49c,28.0,6481.5,1316.0,195.0,67.0,17.0
14434896,fffffa7baf370083ebcdd98f26a7e31a,1750,0,-1,0,1,-1,-1,0,0,...,46e33861f921c3e38b81998fbf283f01,572d60f0f5212aacda515ebf81fb0a3a,8a623b960557e87bd1f4edf71b6255be,ab9480fd72a44d51690ab16c4ad4d49c,24.0,7045.0,1407.0,213.0,69.0,18.0
14434897,fffffa7baf370083ebcdd98f26a7e31a,6247,0,-1,0,1,-1,-1,0,0,...,46e33861f921c3e38b81998fbf283f01,572d60f0f5212aacda515ebf81fb0a3a,8a623b960557e87bd1f4edf71b6255be,ab9480fd72a44d51690ab16c4ad4d49c,20.0,7630.5,1506.0,225.0,71.0,20.0
14434898,fffffa7baf370083ebcdd98f26a7e31a,12350,0,-1,0,1,-1,-1,0,0,...,46e33861f921c3e38b81998fbf283f01,572d60f0f5212aacda515ebf81fb0a3a,8a623b960557e87bd1f4edf71b6255be,1d9f09b9e2bd43cebc9885a46388739a,1606.0,1.0,1.0,1.0,1.0,1.0


#### 各種Encoding

In [21]:
# カテゴリ変数と数値変数の明確化
cat_cols = ['wid_cd', 'ken_cd', 'lrg_cd', 'sml_cd']

# 欠損値を-1で埋める
train_df.fillna(-1, inplace=True)
test_df.fillna(-1, inplace=True)

In [22]:
# Label Encoding

# testにしか存在しないものはNullにするので、trainのみをencoderにする
for col in cat_cols:
    encoder = {c: i for i, c in enumerate(train_df[col].unique())}
    train_df[f'label_{col}'] = train_df[col].map(encoder)
    test_df[f'label_{col}'] = test_df[col].map(encoder)

In [23]:
# Count Encoding（trainとtestをマージした方が良い？）←　した
# 増やした方が良い？

cat_cols = ['wid_cd', 'ken_cd', 'lrg_cd', 'sml_cd'] + ['yado_no']

# testにしか存在しないものはNullにするので、trainのみをencoderにする
for col in cat_cols:
    encoder = train_df[col].value_counts()
    train_df[f'count_{col}'] = train_df[col].map(encoder)
    test_df[f'count_{col}'] = test_df[col].map(encoder)

In [24]:
# Target encoding
# 元の列順が保持されないから

cat_cols = ['wid_cd', 'ken_cd', 'lrg_cd', 'sml_cd'] + ['yado_no']

# GroupKFold の設定  StratifiedGrouoKFoldでもいいかもしれない
gkf = GroupKFold(n_splits=5)

# fold 列を初期化
train_df['fold'] = -1

# 各 Fold に対してインデックスを割り当て
for fold, (train_idx, val_idx) in enumerate(gkf.split(train_df, groups=train_df['session_id'])):
    # バリデーションセットのインデックスに Fold 番号を割り当て
    train_df.loc[val_idx, 'fold'] = fold
# 結果の確認
print(train_df.head())



# TargetEncodingが完了した検証用データセットを格納するリスト
encoded_dfs = []

# すべての分割についてのループ
for fold in range(5):

  # 学習用と検証用データセットに分割する
  df_train = train_df[train_df.fold != fold].reset_index(drop=True)
  df_valid = train_df[train_df.fold == fold].reset_index(drop=True)

  # すべてのカテゴリについてのループ
  for column in cat_cols:
    # カテゴリごとの目的変数の平均についての辞書を作成
    mapping_dict = dict(
        df_train.groupby(column)['target'].mean()
    )
    # 元の列名の末尾に'enc'を加えた名前で、新しい列を作成
    df_valid.loc[
        :, 'TE_' + column
    ] = df_valid[column].map(mapping_dict)

  # リストに格納
  encoded_dfs.append(df_valid)

# 結合したデータセットを返す
encoded_df = pd.concat(encoded_dfs, axis=0)


                         session_id  yado_no  target  seq_no  seen_yad  \
0  000007603d533d30453cc45d0f3d119f    11882       0      -1         0   
1  000007603d533d30453cc45d0f3d119f     2808       0      -1         0   
2  000007603d533d30453cc45d0f3d119f     4101       1      -1         0   
3  000007603d533d30453cc45d0f3d119f     5289       0      -1         0   
4  000007603d533d30453cc45d0f3d119f     9187       0      -1         0   

   max_seq_no  diff_seq_no  is_odd  multiple_visits_flag  yad_type  ...  \
0           0           -1      -1                     0         0  ...   
1           0           -1      -1                     0         0  ...   
2           0           -1      -1                     0         0  ...   
3           0           -1      -1                     0         0  ...   
4           0           -1      -1                     0         0  ...   

   label_wid_cd  label_ken_cd  label_lrg_cd  label_sml_cd  count_wid_cd  \
0             0             0

In [25]:
train_df

Unnamed: 0,session_id,yado_no,target,seq_no,seen_yad,max_seq_no,diff_seq_no,is_odd,multiple_visits_flag,yad_type,...,label_wid_cd,label_ken_cd,label_lrg_cd,label_sml_cd,count_wid_cd,count_ken_cd,count_lrg_cd,count_sml_cd,count_yado_no,fold
0,000007603d533d30453cc45d0f3d119f,11882,0,-1,0,0,-1,-1,0,0,...,0,0,0,0,1684181,586244,18280,17620,364,2
1,000007603d533d30453cc45d0f3d119f,2808,0,-1,0,0,-1,-1,0,0,...,0,0,0,0,1684181,586244,18280,17620,516,2
2,000007603d533d30453cc45d0f3d119f,4101,1,-1,0,0,-1,-1,0,0,...,0,0,0,0,1684181,586244,18280,17620,476,2
3,000007603d533d30453cc45d0f3d119f,5289,0,-1,0,0,-1,-1,0,0,...,0,0,0,0,1684181,586244,18280,17620,266,2
4,000007603d533d30453cc45d0f3d119f,9187,0,-1,0,0,-1,-1,0,0,...,0,1,1,1,1684181,700612,234992,227240,166283,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14434895,fffffa7baf370083ebcdd98f26a7e31a,8609,0,-1,0,1,-1,-1,0,0,...,8,15,20,97,3445022,1044764,893457,20202,662,0
14434896,fffffa7baf370083ebcdd98f26a7e31a,1750,0,-1,0,1,-1,-1,0,0,...,8,15,20,97,3445022,1044764,893457,20202,636,0
14434897,fffffa7baf370083ebcdd98f26a7e31a,6247,0,-1,0,1,-1,-1,0,0,...,8,15,20,97,3445022,1044764,893457,20202,702,0
14434898,fffffa7baf370083ebcdd98f26a7e31a,12350,0,-1,0,1,-1,-1,0,0,...,8,15,20,20,3445022,1044764,893457,769979,96261,0


In [26]:
train_df = encoded_df

In [27]:
# Testデータに対しても同様にTargetEncodingを行う
cat_cols = ['wid_cd', 'ken_cd', 'lrg_cd', 'sml_cd'] + ['yado_no']

# すべてのカテゴリについてのループ
# train_df全体で計算する
for column in cat_cols:
    # カテゴリごとの目的変数の平均についての辞書を作成
    mapping_dict = dict(
        train_df.groupby(column)['target'].mean()
    )
    
    # 元の列名の末尾に'enc'を加えた名前で、新しい列を作成
    test_df.loc[
        :, 'TE_' + column
    ] = test_df[column].map(mapping_dict)

In [28]:
test_df

Unnamed: 0,session_id,yado_no,target,seq_no,seen_yad,max_seq_no,diff_seq_no,is_odd,multiple_visits_flag,yad_type,...,count_wid_cd,count_ken_cd,count_lrg_cd,count_sml_cd,count_yado_no,TE_wid_cd,TE_ken_cd,TE_lrg_cd,TE_sml_cd,TE_yado_no
0,00001149e9c73985425197104712478c,3560,0,0,1,1,1,0,0,0,...,3445022,1573044,74664,35247,1018.0,0.015367,0.017512,0.022568,0.020654,0.034381
1,00001149e9c73985425197104712478c,4545,0,-1,0,1,-1,-1,0,0,...,3445022,1573044,74664,35247,1048.0,0.015367,0.017512,0.022568,0.020654,0.037214
2,00001149e9c73985425197104712478c,9534,0,-1,0,1,-1,-1,0,0,...,3445022,1573044,74664,35247,1721.0,0.015367,0.017512,0.022568,0.020654,0.028472
3,00001149e9c73985425197104712478c,6563,0,-1,0,1,-1,-1,0,0,...,3445022,1573044,74664,35247,1037.0,0.015367,0.017512,0.022568,0.020654,0.023144
4,00001149e9c73985425197104712478c,4420,0,-1,0,1,-1,-1,0,0,...,3445022,1573044,74664,35247,988.0,0.015367,0.017512,0.022568,0.020654,0.032389
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8734995,ffffe984aafd6127ce8e43e3ca40c79d,2527,0,-1,0,0,-1,-1,0,0,...,3445022,1573044,130022,49862,521.0,0.015367,0.017512,0.018912,0.017769,0.011516
8734996,ffffe984aafd6127ce8e43e3ca40c79d,5946,0,-1,0,0,-1,-1,0,0,...,3445022,1573044,130022,49862,462.0,0.015367,0.017512,0.018912,0.017769,0.015152
8734997,ffffe984aafd6127ce8e43e3ca40c79d,9792,0,-1,0,0,-1,-1,0,0,...,3445022,1573044,130022,49862,486.0,0.015367,0.017512,0.018912,0.017769,0.014403
8734998,ffffe984aafd6127ce8e43e3ca40c79d,6858,0,-1,0,0,-1,-1,0,0,...,3445022,1573044,130022,49862,335.0,0.015367,0.017512,0.018912,0.017769,0.023881


In [29]:
all_features = train_df.columns.to_list()
features= [x for x in all_features if x not in ('session_id', 'fold', 'target', 'wid_cd', 'ken_cd', 'lrg_cd', 'sml_cd')]

In [30]:
train_df[features]

Unnamed: 0,yado_no,seq_no,seen_yad,max_seq_no,diff_seq_no,is_odd,multiple_visits_flag,yad_type,total_room_cnt,wireless_lan_flg,...,count_wid_cd,count_ken_cd,count_lrg_cd,count_sml_cd,count_yado_no,TE_wid_cd,TE_ken_cd,TE_lrg_cd,TE_sml_cd,TE_yado_no
0,96,0,1,1,1,0,0,0,228.0,1.0,...,1725156,136135,52732,51851,1379,0.016339,0.025044,0.026324,0.025747,0.025022
1,902,-1,0,1,-1,-1,0,0,240.0,1.0,...,1725156,136135,52732,51851,1336,0.016339,0.025044,0.026324,0.025747,0.039778
2,12491,-1,0,1,-1,-1,0,0,144.0,1.0,...,1725156,136135,52732,51851,1339,0.016339,0.025044,0.026324,0.025747,0.047794
3,5490,-1,0,1,-1,-1,0,0,116.0,1.0,...,1725156,136135,52732,51851,1349,0.016339,0.025044,0.026324,0.025747,0.040073
4,1284,-1,0,1,-1,-1,0,0,176.0,1.0,...,1725156,136135,52732,51851,1348,0.016339,0.025044,0.026324,0.025747,0.040073
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2886945,9368,-1,0,1,-1,-1,0,0,24.0,1.0,...,328406,37724,36204,31431,801,0.024851,0.030654,0.028250,0.027672,0.012422
2886946,3046,-1,0,1,-1,-1,0,0,22.0,1.0,...,328406,37724,36204,31431,785,0.024851,0.030654,0.028250,0.027672,0.007974
2886947,4732,-1,0,1,-1,-1,0,0,33.0,1.0,...,328406,37724,36204,31431,736,0.024851,0.030654,0.028250,0.027672,0.018739
2886948,6214,-1,0,1,-1,-1,0,0,35.0,1.0,...,328406,37724,36204,31431,712,0.024851,0.030654,0.028250,0.027672,0.000000


In [31]:
test_df[features]

Unnamed: 0,yado_no,seq_no,seen_yad,max_seq_no,diff_seq_no,is_odd,multiple_visits_flag,yad_type,total_room_cnt,wireless_lan_flg,...,count_wid_cd,count_ken_cd,count_lrg_cd,count_sml_cd,count_yado_no,TE_wid_cd,TE_ken_cd,TE_lrg_cd,TE_sml_cd,TE_yado_no
0,3560,0,1,1,1,0,0,0,205.0,1.0,...,3445022,1573044,74664,35247,1018.0,0.015367,0.017512,0.022568,0.020654,0.034381
1,4545,-1,0,1,-1,-1,0,0,186.0,-1.0,...,3445022,1573044,74664,35247,1048.0,0.015367,0.017512,0.022568,0.020654,0.037214
2,9534,-1,0,1,-1,-1,0,0,136.0,1.0,...,3445022,1573044,74664,35247,1721.0,0.015367,0.017512,0.022568,0.020654,0.028472
3,6563,-1,0,1,-1,-1,0,0,408.0,1.0,...,3445022,1573044,74664,35247,1037.0,0.015367,0.017512,0.022568,0.020654,0.023144
4,4420,-1,0,1,-1,-1,0,0,124.0,1.0,...,3445022,1573044,74664,35247,988.0,0.015367,0.017512,0.022568,0.020654,0.032389
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8734995,2527,-1,0,0,-1,-1,0,0,109.0,1.0,...,3445022,1573044,130022,49862,521.0,0.015367,0.017512,0.018912,0.017769,0.011516
8734996,5946,-1,0,0,-1,-1,0,0,96.0,1.0,...,3445022,1573044,130022,49862,462.0,0.015367,0.017512,0.018912,0.017769,0.015152
8734997,9792,-1,0,0,-1,-1,0,0,99.0,1.0,...,3445022,1573044,130022,49862,486.0,0.015367,0.017512,0.018912,0.017769,0.014403
8734998,6858,-1,0,0,-1,-1,0,0,46.0,1.0,...,3445022,1573044,130022,49862,335.0,0.015367,0.017512,0.018912,0.017769,0.023881


In [32]:
train_df.columns.to_list()

['session_id',
 'yado_no',
 'target',
 'seq_no',
 'seen_yad',
 'max_seq_no',
 'diff_seq_no',
 'is_odd',
 'multiple_visits_flag',
 'yad_type',
 'total_room_cnt',
 'wireless_lan_flg',
 'onsen_flg',
 'kd_stn_5min',
 'kd_bch_5min',
 'kd_slp_5min',
 'kd_conv_walk_5min',
 'wid_cd',
 'ken_cd',
 'lrg_cd',
 'sml_cd',
 'popularity',
 'overall_rank',
 'wid_cd_rank',
 'ken_cd_rank',
 'lrg_cd_rank',
 'sml_cd_rank',
 'label_wid_cd',
 'label_ken_cd',
 'label_lrg_cd',
 'label_sml_cd',
 'count_wid_cd',
 'count_ken_cd',
 'count_lrg_cd',
 'count_sml_cd',
 'count_yado_no',
 'fold',
 'TE_wid_cd',
 'TE_ken_cd',
 'TE_lrg_cd',
 'TE_sml_cd',
 'TE_yado_no']

In [33]:
import gc
gc.collect()

189

In [34]:
test_df

Unnamed: 0,session_id,yado_no,target,seq_no,seen_yad,max_seq_no,diff_seq_no,is_odd,multiple_visits_flag,yad_type,...,count_wid_cd,count_ken_cd,count_lrg_cd,count_sml_cd,count_yado_no,TE_wid_cd,TE_ken_cd,TE_lrg_cd,TE_sml_cd,TE_yado_no
0,00001149e9c73985425197104712478c,3560,0,0,1,1,1,0,0,0,...,3445022,1573044,74664,35247,1018.0,0.015367,0.017512,0.022568,0.020654,0.034381
1,00001149e9c73985425197104712478c,4545,0,-1,0,1,-1,-1,0,0,...,3445022,1573044,74664,35247,1048.0,0.015367,0.017512,0.022568,0.020654,0.037214
2,00001149e9c73985425197104712478c,9534,0,-1,0,1,-1,-1,0,0,...,3445022,1573044,74664,35247,1721.0,0.015367,0.017512,0.022568,0.020654,0.028472
3,00001149e9c73985425197104712478c,6563,0,-1,0,1,-1,-1,0,0,...,3445022,1573044,74664,35247,1037.0,0.015367,0.017512,0.022568,0.020654,0.023144
4,00001149e9c73985425197104712478c,4420,0,-1,0,1,-1,-1,0,0,...,3445022,1573044,74664,35247,988.0,0.015367,0.017512,0.022568,0.020654,0.032389
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8734995,ffffe984aafd6127ce8e43e3ca40c79d,2527,0,-1,0,0,-1,-1,0,0,...,3445022,1573044,130022,49862,521.0,0.015367,0.017512,0.018912,0.017769,0.011516
8734996,ffffe984aafd6127ce8e43e3ca40c79d,5946,0,-1,0,0,-1,-1,0,0,...,3445022,1573044,130022,49862,462.0,0.015367,0.017512,0.018912,0.017769,0.015152
8734997,ffffe984aafd6127ce8e43e3ca40c79d,9792,0,-1,0,0,-1,-1,0,0,...,3445022,1573044,130022,49862,486.0,0.015367,0.017512,0.018912,0.017769,0.014403
8734998,ffffe984aafd6127ce8e43e3ca40c79d,6858,0,-1,0,0,-1,-1,0,0,...,3445022,1573044,130022,49862,335.0,0.015367,0.017512,0.018912,0.017769,0.023881


In [35]:
train_df

Unnamed: 0,session_id,yado_no,target,seq_no,seen_yad,max_seq_no,diff_seq_no,is_odd,multiple_visits_flag,yad_type,...,count_ken_cd,count_lrg_cd,count_sml_cd,count_yado_no,fold,TE_wid_cd,TE_ken_cd,TE_lrg_cd,TE_sml_cd,TE_yado_no
0,000104bdffaaad1a1e0a9ebacf585f33,96,1,0,1,1,1,0,0,0,...,136135,52732,51851,1379,0,0.016339,0.025044,0.026324,0.025747,0.025022
1,000104bdffaaad1a1e0a9ebacf585f33,902,0,-1,0,1,-1,-1,0,0,...,136135,52732,51851,1336,0,0.016339,0.025044,0.026324,0.025747,0.039778
2,000104bdffaaad1a1e0a9ebacf585f33,12491,0,-1,0,1,-1,-1,0,0,...,136135,52732,51851,1339,0,0.016339,0.025044,0.026324,0.025747,0.047794
3,000104bdffaaad1a1e0a9ebacf585f33,5490,0,-1,0,1,-1,-1,0,0,...,136135,52732,51851,1349,0,0.016339,0.025044,0.026324,0.025747,0.040073
4,000104bdffaaad1a1e0a9ebacf585f33,1284,0,-1,0,1,-1,-1,0,0,...,136135,52732,51851,1348,0,0.016339,0.025044,0.026324,0.025747,0.040073
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2886945,fffe8c99c5b332190c3d4a2d6e7c5073,9368,0,-1,0,1,-1,-1,0,0,...,37724,36204,31431,801,4,0.024851,0.030654,0.028250,0.027672,0.012422
2886946,fffe8c99c5b332190c3d4a2d6e7c5073,3046,0,-1,0,1,-1,-1,0,0,...,37724,36204,31431,785,4,0.024851,0.030654,0.028250,0.027672,0.007974
2886947,fffe8c99c5b332190c3d4a2d6e7c5073,4732,0,-1,0,1,-1,-1,0,0,...,37724,36204,31431,736,4,0.024851,0.030654,0.028250,0.027672,0.018739
2886948,fffe8c99c5b332190c3d4a2d6e7c5073,6214,0,-1,0,1,-1,-1,0,0,...,37724,36204,31431,712,4,0.024851,0.030654,0.028250,0.027672,0.000000


In [36]:
# train_df.to_csv('../feature_engineering_v3_train_df.csv', index=False)
# test_df.to_csv('../feature_engineering_v3_test_df.csv', index=False)

In [37]:
train_df.to_parquet('../data/feature_engineering_v3_train_df.parquet', index=False)
test_df.to_parquet('../data/feature_engineering_v3_test_df.parquet', index=False)