In [25]:
%pip install nyaggle

Collecting nyaggle
  Downloading nyaggle-0.1.6-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.3/53.3 KB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
Installing collected packages: nyaggle
Successfully installed nyaggle-0.1.6
[0mNote: you may need to restart the kernel to use updated packages.


In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm


from sklearn.model_selection import KFold
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import StratifiedKFold
from nyaggle.feature.category_encoder import TargetEncoder

from nyaggle.experiment import run_experiment

In [2]:
train_log_df = pd.read_csv('../train_log.csv')
test_log_df = pd.read_csv('../test_log.csv')
train_label_df = pd.read_csv('../train_label.csv')
test_session_df = pd.read_csv('../test_session.csv')
yado_df = pd.read_csv('../yado.csv')
# image_embeddings_df = pd.read_parquet('../image_embeddings.parquet')
sample_submission_df = pd.read_csv('../sample_submission.csv')

candidate_train_df = pd.read_csv('../data/candidate_ver7_train.csv')
candidate_test_df = pd.read_csv('../data/candidate_ver7_test.csv')

In [3]:
train_log_df

Unnamed: 0,session_id,seq_no,yad_no
0,000007603d533d30453cc45d0f3d119f,0,2395
1,0000ca043ed437a1472c9d1d154eb49b,0,13535
2,0000d4835cf113316fe447e2f80ba1c8,0,123
3,0000fcda1ae1b2f431e55a7075d1f500,0,8475
4,000104bdffaaad1a1e0a9ebacf585f33,0,96
...,...,...,...
419265,ffffcd5bc19d62cad5a3815c87818d83,0,12230
419266,ffffcd5bc19d62cad5a3815c87818d83,1,10619
419267,ffffcd5bc19d62cad5a3815c87818d83,2,12230
419268,fffffa7baf370083ebcdd98f26a7e31a,0,2439


In [4]:
test_session_df

Unnamed: 0,session_id
0,00001149e9c73985425197104712478c
1,0000e02747d749a52b7736dfa751e258
2,0000f17ae2628237d78d3a38b009d3be
3,000174a6f7a569b84c5575760d2e9664
4,00017e2a527901c9c41b1acef525d016
...,...
174695,fffee3199ef94b92283239cd5e3534fa
174696,ffff62c6bb49bc9c0fbcf08494a4869c
174697,ffff9a7dcc892875c7a8b821fa436228
174698,ffffb1d30300fe17f661941fd085b04b


In [5]:
# 学習用データ
# 教師データに正例と負例のフラグを付与
train_target = []

# 各セッションに対して候補の宿をチェック
for index, row in tqdm(train_label_df.iterrows(), total=train_label_df.shape[0]):
    session_id = row['session_id']
    yad_no = row['yad_no']
    
    # candidate_ver7_train.csv の該当行を取得（同じインデックスを仮定）
    candidate_row = candidate_train_df.iloc[index]

    # 50個の購入候補の宿に対してチェック
    for col in candidate_train_df.columns:
        yado_no = candidate_row[col]
        is_correct = (yado_no == yad_no)  # 一致するかどうかのチェック
        train_target.append([session_id, yado_no, is_correct])

# 結果をDataFrameに変換
train_df = pd.DataFrame(train_target, columns=['session_id', 'yado_no', 'target']) #区別するためにあえてyad_noではなくyado_noとしている
train_df['target'] = train_df['target'].astype(int)  # target列をint型に変換

# 最初の数行を表示して確認
print(train_df.head())

print('正例と負例の数を確認')
print(train_df['target'].value_counts())


100%|██████████| 288698/288698 [01:22<00:00, 3515.22it/s]


                         session_id  yado_no  target
0  000007603d533d30453cc45d0f3d119f    11882       0
1  000007603d533d30453cc45d0f3d119f     2808       0
2  000007603d533d30453cc45d0f3d119f     4101       1
3  000007603d533d30453cc45d0f3d119f     5289       0
4  000007603d533d30453cc45d0f3d119f     9187       0
正例と負例の数を確認
0    14200708
1      234192
Name: target, dtype: int64


In [17]:
train_df.head()

Unnamed: 0,session_id,yado_no,target
0,000007603d533d30453cc45d0f3d119f,11882,0
1,000007603d533d30453cc45d0f3d119f,2808,0
2,000007603d533d30453cc45d0f3d119f,4101,1
3,000007603d533d30453cc45d0f3d119f,5289,0
4,000007603d533d30453cc45d0f3d119f,9187,0


In [6]:
# 推論用データにダミーのyado_noを付与
test_session_df['yad_no'] = -1

test_target = []

# 各セッションに対して候補の宿をチェック
for index, row in tqdm(test_session_df.iterrows(), total=test_session_df.shape[0]):
    session_id = row['session_id']
    yad_no = row['yad_no']
    
    # candidate_ver7_train.csv の該当行を取得（同じインデックスを仮定）
    candidate_row = candidate_train_df.iloc[index]

    # 50個の購入候補の宿に対してチェック
    for col in candidate_train_df.columns:
        yado_no = candidate_row[col]
        is_correct = (yado_no == yad_no)  # 一致するかどうかのチェック
        test_target.append([session_id, yado_no, is_correct])

# 結果をDataFrameに変換
test_df = pd.DataFrame(test_target, columns=['session_id', 'yado_no', 'target']) #区別するためにあえてyad_noではなくyado_noとしている
test_df['target'] = test_df['target'].astype(int)  # target列をint型に変換

# 最初の数行を表示して確認
print(test_df.head())

print('正例と負例の数を確認')
print(test_df['target'].value_counts()) # すべて0になるはず

100%|██████████| 174700/174700 [00:49<00:00, 3541.31it/s]


                         session_id  yado_no  target
0  00001149e9c73985425197104712478c    11882       0
1  00001149e9c73985425197104712478c     2808       0
2  00001149e9c73985425197104712478c     4101       0
3  00001149e9c73985425197104712478c     5289       0
4  00001149e9c73985425197104712478c     9187       0
正例と負例の数を確認
0    8735000
Name: target, dtype: int64


In [7]:
# yado_dfをtrainとtestに結合する
merged_train_log_df = pd.merge(train_log_df, yado_df, on='yad_no', how='left')
merged_test_log_df = pd.merge(test_log_df, yado_df, on='yad_no', how='left')

In [8]:
merged_train_log_df

Unnamed: 0,session_id,seq_no,yad_no,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,wid_cd,ken_cd,lrg_cd,sml_cd
0,000007603d533d30453cc45d0f3d119f,0,2395,0,113.0,1.0,0,,,,,dc414a17890cfc17d011d5038b88ca93,d78f53d0856617bc782f02c3280dfef2,4fd631b15116098340cdb099c86a5a40,4044dac1931ddaa5a967e09506d76343
1,0000ca043ed437a1472c9d1d154eb49b,0,13535,0,40.0,1.0,0,1.0,,,1.0,b07b75d367ebece55a23ceecc939fff4,0a66f6ab9c0507059da6f22a0e1f1690,9ab5718fd88c6e5f9fec37a51827d428,7aff71bb47acb796d425c5ed5e6dfb3f
2,0000d4835cf113316fe447e2f80ba1c8,0,123,0,17.0,1.0,0,,,,,46e33861f921c3e38b81998fbf283f01,572d60f0f5212aacda515ebf81fb0a3a,dac434451fe9bd50068191f41fe792e3,b7c56c5d2855b39366b4ebe9a4eded93
3,0000fcda1ae1b2f431e55a7075d1f500,0,8475,0,65.0,1.0,0,1.0,,,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,3a6cd37aa9e38fd96d9dafc2615643d0,f2fcbd8e62872147efde0acef474e1f2
4,000104bdffaaad1a1e0a9ebacf585f33,0,96,0,228.0,1.0,0,,,,1.0,e9316013ee1b03f4525fe361c46ce9c5,84efa50e52f9b471c95bfc3b21b854ad,a1370d90ed3b80ee41311bbbab46aec9,d72674f02c5340d90f245e3177727650
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
419265,ffffcd5bc19d62cad5a3815c87818d83,0,12230,0,354.0,1.0,0,,,,1.0,321b69d5eec98fe6253e26b86058e6a9,a2b54b288d51bb19085ed1d99c428397,0c92ce61d0bf83edefee7eea279a15c8,de9c306d6999d60160eaf17cdb20fe47
419266,ffffcd5bc19d62cad5a3815c87818d83,1,10619,0,,1.0,0,,,,1.0,321b69d5eec98fe6253e26b86058e6a9,a2b54b288d51bb19085ed1d99c428397,0c92ce61d0bf83edefee7eea279a15c8,de9c306d6999d60160eaf17cdb20fe47
419267,ffffcd5bc19d62cad5a3815c87818d83,2,12230,0,354.0,1.0,0,,,,1.0,321b69d5eec98fe6253e26b86058e6a9,a2b54b288d51bb19085ed1d99c428397,0c92ce61d0bf83edefee7eea279a15c8,de9c306d6999d60160eaf17cdb20fe47
419268,fffffa7baf370083ebcdd98f26a7e31a,0,2439,0,81.0,1.0,0,1.0,,,1.0,46e33861f921c3e38b81998fbf283f01,572d60f0f5212aacda515ebf81fb0a3a,8a623b960557e87bd1f4edf71b6255be,ab9480fd72a44d51690ab16c4ad4d49c


In [9]:
# カテゴリ変数と数値変数の明確化
cat_cols = ['wid_cd', 'ken_cd', 'lrg_cd', 'sml_cd']
# num_cols =

# 欠損値を-1で埋める
merged_train_log_df.fillna(-1, inplace=True)
merged_test_log_df.fillna(-1, inplace=True)

In [10]:
# Label Encoding

# testにしか存在しないものはNullにするので、trainのみをencoderにする
for col in cat_cols:
    encoder = {c: i for i, c in enumerate(merged_train_log_df[col].unique())}
    merged_train_log_df[f'label_{col}'] = merged_train_log_df[col].map(encoder)
    merged_test_log_df[f'label_{col}'] = merged_test_log_df[col].map(encoder)

In [11]:
merged_train_log_df

Unnamed: 0,session_id,seq_no,yad_no,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,wid_cd,ken_cd,lrg_cd,sml_cd,label_wid_cd,label_ken_cd,label_lrg_cd,label_sml_cd
0,000007603d533d30453cc45d0f3d119f,0,2395,0,113.0,1.0,0,-1.0,-1.0,-1.0,-1.0,dc414a17890cfc17d011d5038b88ca93,d78f53d0856617bc782f02c3280dfef2,4fd631b15116098340cdb099c86a5a40,4044dac1931ddaa5a967e09506d76343,0,0,0,0
1,0000ca043ed437a1472c9d1d154eb49b,0,13535,0,40.0,1.0,0,1.0,-1.0,-1.0,1.0,b07b75d367ebece55a23ceecc939fff4,0a66f6ab9c0507059da6f22a0e1f1690,9ab5718fd88c6e5f9fec37a51827d428,7aff71bb47acb796d425c5ed5e6dfb3f,1,1,1,1
2,0000d4835cf113316fe447e2f80ba1c8,0,123,0,17.0,1.0,0,-1.0,-1.0,-1.0,-1.0,46e33861f921c3e38b81998fbf283f01,572d60f0f5212aacda515ebf81fb0a3a,dac434451fe9bd50068191f41fe792e3,b7c56c5d2855b39366b4ebe9a4eded93,2,2,2,2
3,0000fcda1ae1b2f431e55a7075d1f500,0,8475,0,65.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,3a6cd37aa9e38fd96d9dafc2615643d0,f2fcbd8e62872147efde0acef474e1f2,2,3,3,3
4,000104bdffaaad1a1e0a9ebacf585f33,0,96,0,228.0,1.0,0,-1.0,-1.0,-1.0,1.0,e9316013ee1b03f4525fe361c46ce9c5,84efa50e52f9b471c95bfc3b21b854ad,a1370d90ed3b80ee41311bbbab46aec9,d72674f02c5340d90f245e3177727650,3,4,4,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
419265,ffffcd5bc19d62cad5a3815c87818d83,0,12230,0,354.0,1.0,0,-1.0,-1.0,-1.0,1.0,321b69d5eec98fe6253e26b86058e6a9,a2b54b288d51bb19085ed1d99c428397,0c92ce61d0bf83edefee7eea279a15c8,de9c306d6999d60160eaf17cdb20fe47,6,18,26,29
419266,ffffcd5bc19d62cad5a3815c87818d83,1,10619,0,-1.0,1.0,0,-1.0,-1.0,-1.0,1.0,321b69d5eec98fe6253e26b86058e6a9,a2b54b288d51bb19085ed1d99c428397,0c92ce61d0bf83edefee7eea279a15c8,de9c306d6999d60160eaf17cdb20fe47,6,18,26,29
419267,ffffcd5bc19d62cad5a3815c87818d83,2,12230,0,354.0,1.0,0,-1.0,-1.0,-1.0,1.0,321b69d5eec98fe6253e26b86058e6a9,a2b54b288d51bb19085ed1d99c428397,0c92ce61d0bf83edefee7eea279a15c8,de9c306d6999d60160eaf17cdb20fe47,6,18,26,29
419268,fffffa7baf370083ebcdd98f26a7e31a,0,2439,0,81.0,1.0,0,1.0,-1.0,-1.0,1.0,46e33861f921c3e38b81998fbf283f01,572d60f0f5212aacda515ebf81fb0a3a,8a623b960557e87bd1f4edf71b6255be,ab9480fd72a44d51690ab16c4ad4d49c,2,2,13,44


In [16]:
# trainとtestを結合
# merged_log_df = pd.concat([merged_train_log_df, merged_test_log_df], axis=0).reset_index(drop=True)
# merged_log_df

In [12]:
# Count Encoding（trainとtestをマージした方が良い？）←　した
# 増やした方が良い？

# testにしか存在しないものはNullにするので、trainのみをencoderにする
for col in cat_cols:
    encoder = merged_train_log_df[col].value_counts()
    merged_train_log_df[f'count_{col}'] = merged_train_log_df[col].map(encoder)
    merged_test_log_df[f'count_{col}'] = merged_test_log_df[col].map(encoder)

In [13]:
merged_train_log_df

Unnamed: 0,session_id,seq_no,yad_no,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,...,lrg_cd,sml_cd,label_wid_cd,label_ken_cd,label_lrg_cd,label_sml_cd,count_wid_cd,count_ken_cd,count_lrg_cd,count_sml_cd
0,000007603d533d30453cc45d0f3d119f,0,2395,0,113.0,1.0,0,-1.0,-1.0,-1.0,...,4fd631b15116098340cdb099c86a5a40,4044dac1931ddaa5a967e09506d76343,0,0,0,0,46917,19295,1067,973
1,0000ca043ed437a1472c9d1d154eb49b,0,13535,0,40.0,1.0,0,1.0,-1.0,-1.0,...,9ab5718fd88c6e5f9fec37a51827d428,7aff71bb47acb796d425c5ed5e6dfb3f,1,1,1,1,31684,31684,11758,9909
2,0000d4835cf113316fe447e2f80ba1c8,0,123,0,17.0,1.0,0,-1.0,-1.0,-1.0,...,dac434451fe9bd50068191f41fe792e3,b7c56c5d2855b39366b4ebe9a4eded93,2,2,2,2,103964,22062,656,500
3,0000fcda1ae1b2f431e55a7075d1f500,0,8475,0,65.0,1.0,0,1.0,-1.0,-1.0,...,3a6cd37aa9e38fd96d9dafc2615643d0,f2fcbd8e62872147efde0acef474e1f2,2,3,3,3,103964,54159,8485,2028
4,000104bdffaaad1a1e0a9ebacf585f33,0,96,0,228.0,1.0,0,-1.0,-1.0,-1.0,...,a1370d90ed3b80ee41311bbbab46aec9,d72674f02c5340d90f245e3177727650,3,4,4,4,49133,5470,2146,2045
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
419265,ffffcd5bc19d62cad5a3815c87818d83,0,12230,0,354.0,1.0,0,-1.0,-1.0,-1.0,...,0c92ce61d0bf83edefee7eea279a15c8,de9c306d6999d60160eaf17cdb20fe47,6,18,26,29,11761,2559,1332,1281
419266,ffffcd5bc19d62cad5a3815c87818d83,1,10619,0,-1.0,1.0,0,-1.0,-1.0,-1.0,...,0c92ce61d0bf83edefee7eea279a15c8,de9c306d6999d60160eaf17cdb20fe47,6,18,26,29,11761,2559,1332,1281
419267,ffffcd5bc19d62cad5a3815c87818d83,2,12230,0,354.0,1.0,0,-1.0,-1.0,-1.0,...,0c92ce61d0bf83edefee7eea279a15c8,de9c306d6999d60160eaf17cdb20fe47,6,18,26,29,11761,2559,1332,1281
419268,fffffa7baf370083ebcdd98f26a7e31a,0,2439,0,81.0,1.0,0,1.0,-1.0,-1.0,...,8a623b960557e87bd1f4edf71b6255be,ab9480fd72a44d51690ab16c4ad4d49c,2,2,13,44,103964,22062,14908,1002


In [15]:
#　一旦パス
"""
# 集約特徴量
# 集約するような数値関数あるのか？

agg_cols = ['min', 'max', 'mean', 'std']
cat_cols = ['wid_cd', 'ken_cd', 'lrg_cd', 'sml_cd']
num_cols = []

# testにしか存在しないものはNullにするので、trainのみで集計する
for col in cat_cols + ['yad_no']:
    grp_df = merged_train_log_df.groupby(col)[num_cols].agg(agg_cols)
    grp_df.columns = [f'{col}_' + '_'.join(c) for c in grp_df.columns]
    merged_train_log_df = merged_train_log_df.merge(grp_df, on=col, how='left')
    merged_test_log_df = merged_test_log_df.merge(grp_df, on=col, how='left')
"""

NameError: name 'num_cols' is not defined

In [None]:
# sessionに関する特徴量
# sessionに現れたか否か？


（# もしsessionに現れていた場合、何番目の宿か？（複数あるから注意））

In [None]:
#　MultiHotEncoding
# session中に閲覧した宿の数（これはMultiHotEncodingを横軸で足し算）

In [48]:
train_log_df

Unnamed: 0,session_id,seq_no,yad_no
0,000007603d533d30453cc45d0f3d119f,0,2395
1,0000ca043ed437a1472c9d1d154eb49b,0,13535
2,0000d4835cf113316fe447e2f80ba1c8,0,123
3,0000fcda1ae1b2f431e55a7075d1f500,0,8475
4,000104bdffaaad1a1e0a9ebacf585f33,0,96
...,...,...,...
419265,ffffcd5bc19d62cad5a3815c87818d83,0,12230
419266,ffffcd5bc19d62cad5a3815c87818d83,1,10619
419267,ffffcd5bc19d62cad5a3815c87818d83,2,12230
419268,fffffa7baf370083ebcdd98f26a7e31a,0,2439


In [None]:
# Foldが絡むので一番最後に実行する

# Target Encoding
cat_cols = ['wid_cd', 'ken_cd', 'lrg_cd', 'sml_cd']

# kf = KFold(4)
kf = KFold(5, groups=train.categorical) #GroupKFold ???
#kf = StratifiedKFold(4, shuffle=True, random_state=42531) #StratifiedKFold

# Target encoding with K-fold
# te = TargetEncoder(kf.split(train))
te = TargetEncoder(kf.split(train)) #GroupKFold
# te = TargetEncoder(kf.split(train, train[target_col])) #StratifiedKFold

# use fit/fit_transform to train data, then apply transform to test data
train.loc[:, cat_cols] = te.fit_transform(train[cat_cols], train[target_col])
test.loc[:, cat_cols] = te.transform(test[cat_cols])

In [None]:
# 一旦飛ばす
# Co-Visitation Matrix
# session内の遷移確率
# 画像特徴量(一旦パス)
# エリアに関する特徴量
# Testにしか存在しないyado_noをNullにしたほうが良さそう？

In [None]:
params = {
    'objective': "binary", # binary, multiclass, regression
    'metric': "binary_logloss", # mae, mse, auc, binary_logloss, multi_logloss, rmse, average_precision
    'learning_rate':0.05, #0.05
    'n_estimators': 10000,
    'max_depth': -1,
    'random_state' : 42531
    'num_leaves' : 150, #カーディナリティが高い場合は大きくする
    #'max_bin' : 500,
}

result = run_experiment(params,
                        train[num_cols + cat_cols],
                        train[target_col],
                        test[num_cols + cat_cols],
                        # cv=StratifiedKFold(4, shuffle=True, random_state=42531), #cv=4,
                        groups=train.group
                        )