In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
from tqdm import tqdm
import lightgbm as lgb
from sklearn.model_selection import GroupKFold
from concurrent.futures import ProcessPoolExecutor, as_completed

In [13]:
train_df = pd.read_parquet('../data/feature_engineering_v8_train_df.parquet')
test_df = pd.read_parquet('../data/feature_engineering_v8_test_df.parquet')

train_label = pd.read_csv('../train_label.csv')
test_session_df = pd.read_csv('../test_session.csv')

train_log = pd.read_csv('../train_log.csv')
test_log = pd.read_csv('../test_log.csv')

In [3]:
# catboost
cat_ver1_train = pd.read_csv('catRanker_ver1_oof_pred.csv')
cat_ver2_train = pd.read_csv('catRanker_ver2_oof_pred.csv')
cat_ver3_train = pd.read_csv('catRanker_ver3_oof_pred.csv')

# lightgbm
lgb_ver8_train = pd.read_csv('lgbRanker_ver8_oof_pred.csv')
lgb_ver9_train = pd.read_csv('lgbRanker_ver9_oof_pred.csv')
lgb_ver10_train = pd.read_csv('lgbRanker_ver10_oof_pred.csv')
lgb_ver11_train = pd.read_csv('lgbRanker_ver11_oof_pred.csv')
lgb_ver12_train = pd.read_csv('lgbRanker_ver12_oof_pred.csv')

# xgboost
xgb_ver1_train = pd.read_csv('xgbRanker_ver1_oof_pred.csv')
xgb_ver2_train = pd.read_csv('xgbRanker_ver2_oof_pred.csv')


In [4]:
cat_ver1_train[['session_id', 'yad_no']]

Unnamed: 0,session_id,yad_no
0,000104bdffaaad1a1e0a9ebacf585f33,3894
1,000104bdffaaad1a1e0a9ebacf585f33,7749
2,000104bdffaaad1a1e0a9ebacf585f33,902
3,000104bdffaaad1a1e0a9ebacf585f33,11380
4,000104bdffaaad1a1e0a9ebacf585f33,5490
...,...,...
2777522,fffe8a472ae6a96c9da05a30ac3ed6c5,4772
2777523,fffe8a472ae6a96c9da05a30ac3ed6c5,1482
2777524,fffe8a472ae6a96c9da05a30ac3ed6c5,4116
2777525,fffe8a472ae6a96c9da05a30ac3ed6c5,10613


In [5]:
preds = pd.DataFrame((cat_ver1_train['oof_pred'] + cat_ver2_train['oof_pred'] + cat_ver3_train['oof_pred'] + \
    lgb_ver8_train['oof_pred'] + lgb_ver9_train['oof_pred'] + lgb_ver10_train['oof_pred'] + lgb_ver11_train['oof_pred'] + lgb_ver12_train['oof_pred'] + \
        xgb_ver1_train['oof_pred'] + xgb_ver2_train['oof_pred']) / 10)

preds

Unnamed: 0,oof_pred
0,0.551566
1,0.136736
2,0.028635
3,-0.229958
4,-0.447370
...,...
2777522,-3.806249
2777523,-3.634917
2777524,-3.256839
2777525,-4.376663


In [6]:
oof_pred_df = pd.concat([cat_ver1_train[['session_id', 'yad_no']], preds], axis=1)
oof_pred_df

Unnamed: 0,session_id,yad_no,oof_pred
0,000104bdffaaad1a1e0a9ebacf585f33,3894,0.551566
1,000104bdffaaad1a1e0a9ebacf585f33,7749,0.136736
2,000104bdffaaad1a1e0a9ebacf585f33,902,0.028635
3,000104bdffaaad1a1e0a9ebacf585f33,11380,-0.229958
4,000104bdffaaad1a1e0a9ebacf585f33,5490,-0.447370
...,...,...,...
2777522,fffe8a472ae6a96c9da05a30ac3ed6c5,4772,-3.806249
2777523,fffe8a472ae6a96c9da05a30ac3ed6c5,1482,-3.634917
2777524,fffe8a472ae6a96c9da05a30ac3ed6c5,4116,-3.256839
2777525,fffe8a472ae6a96c9da05a30ac3ed6c5,10613,-4.376663


### CVの計算

In [7]:
train_log_df = pd.read_csv('../train_log.csv')
train_label_df = pd.read_csv('../train_label.csv')

In [8]:
import pandas as pd

# oof_pred_dfを事前に処理しておきます。
# ここで'session_id'にインデックスを設定し、'oof_pred'でソートします。
# これにより、フィルタリングとソートのコストを削減できます。
oof_pred_df = oof_pred_df.sort_values(['session_id', 'oof_pred'], ascending=[True, False])
oof_pred_df.set_index('session_id', inplace=True)

In [15]:
%%time
def get_top_recommendations(session_id, df):
    """
    この関数は指定されたsession IDに基づいて、推薦確率に基づいた宿泊施設のリストを返します。

    パラメータ:
    session_id (str): 推薦をフィルタリングするためのセッションID。
    df (pd.DataFrame): 推薦が含まれているDataFrame。

    戻り値:
    list: 推薦確率に基づいてソートされた宿泊施設ID(`yad_no`)のリスト。
    """
    try:
        # インデックスを使ってデータを取得
        recommendations = df.loc[session_id, 'yad_no']
        # Seriesオブジェクトの値をリストに変換
        return recommendations.values.tolist()
    except KeyError:
        # 指定されたsession_idが存在しない場合は空のリストを返す
        return []


# 使用例:
session_id_to_check = '000104bdffaaad1a1e0a9ebacf585f33'
recommendations = get_top_recommendations(session_id_to_check, oof_pred_df)
print(recommendations)

[3894, 7749, 902, 11380, 5490, 1284, 254, 12491, 4072]
CPU times: user 599 µs, sys: 0 ns, total: 599 µs
Wall time: 608 µs


In [28]:
print(session_yadno_map.get('000104bdffaaad1a1e0a9ebacf585f33', []))

[96, 898]


In [43]:
# sessionの長さ3の場合、宿の順番の入れ替えを行う

# セッションIDごとに yad_no のリストを作成
session_yadno_map = train_log_df.groupby('session_id')['yad_no'].apply(list).to_dict() #ここをtrain, testで変える  train_log_df, test_log_df


def generate_recommendation(i):
    
    # 推薦する宿を格納するリスト
    pred_recommendations = []
    
    # 指定セッションの宿を抽出
    session_id = train_label_df.loc[i, 'session_id']  #ここをtrain, testで変える  train_label_df, test_session_df
    yado_no = session_yadno_map.get(session_id, [])
    
    # ここでルールベース用に記録
    session_length = len(yado_no) # セッションの長さ（重複排除の前に計算する）
    if session_length == 5:
        pred_4 = yado_no[-2]
        pred_2 = yado_no[-4]
        pred_1 = yado_no[-5]
        pred_3 = yado_no[-3]
    
    
    last_yado = yado_no[-1] if yado_no else None
    yado_no = pd.Series(yado_no).drop_duplicates().tolist()
    
    
    if session_length == 5:
        pred_recommendations.append(pred_4)  
        pred_recommendations.append(pred_2)
        pred_recommendations.append(pred_1)
        pred_recommendations.append(pred_3) 
        # last_yadoを除外 最後に訪問された宿を除外
        if last_yado in pred_recommendations:
            pred_recommendations.remove(last_yado)
    else:
        pred_recommendations.extend(yado_no) 
        # last_yadoを除外 最後に訪問された宿を除外
        if last_yado in pred_recommendations:
            pred_recommendations.remove(last_yado)
    
    # 各セッションのcandidateを並べ替えたもの
    sorted_covisitation = get_top_recommendations(session_id=session_id, df=oof_pred_df)
    pred_recommendations.extend(sorted_covisitation)

    return i, pred_recommendations[:10]


# マルチプロセス処理の実行
results = []
with ProcessPoolExecutor(max_workers=16) as executor: # ここのmax_workersは手持ちのCPUのスレッド数によって適宜変える
    future_to_session = {executor.submit(generate_recommendation, i): i for i in range(len(train_label_df))}
    for future in tqdm(as_completed(future_to_session), total=len(train_label_df), desc="Processing recommendations"):
        index = future_to_session[future]
        result = future.result()
        results.append((index, result))

results.sort(key=lambda x: x[0])
preds = [result for i, result in results]

Processing recommendations: 100%|██████████| 288698/288698 [00:48<00:00, 5942.27it/s]


In [29]:
"""
# セッションIDごとに yad_no のリストを作成
session_yadno_map = train_log_df.groupby('session_id')['yad_no'].apply(list).to_dict() #ここをtrain, testで変える  train_log_df, test_log_df


def generate_recommendation(i):
    
    # 推薦する宿を格納するリスト
    pred_recommendations = []
    
    # 指定セッションの宿を抽出
    session_id = train_label_df.loc[i, 'session_id']  #ここをtrain, testで変える  train_label_df, test_session_df
    yado_no = session_yadno_map.get(session_id, [])
    last_yado = yado_no[-1] if yado_no else None
    yado_no = pd.Series(yado_no).drop_duplicates().tolist()
    
    pred_recommendations.extend(yado_no) # 最後に訪問された宿を除外
    if last_yado in pred_recommendations:
        pred_recommendations.remove(last_yado)
    
    # 各セッションのcandidateを並べ替えたもの
    sorted_covisitation = get_top_recommendations(session_id=session_id, df=oof_pred_df)
    pred_recommendations.extend(sorted_covisitation)

    return i, pred_recommendations[:10]


# マルチプロセス処理の実行
results = []
with ProcessPoolExecutor(max_workers=16) as executor: # ここのmax_workersは手持ちのCPUのスレッド数によって適宜変える
    future_to_session = {executor.submit(generate_recommendation, i): i for i in range(len(train_label_df))}
    for future in tqdm(as_completed(future_to_session), total=len(train_label_df), desc="Processing recommendations"):
        index = future_to_session[future]
        result = future.result()
        results.append((index, result))

results.sort(key=lambda x: x[0])
preds = [result for i, result in results]
"""

Processing recommendations: 100%|██████████| 288698/288698 [00:44<00:00, 6541.14it/s]


In [44]:
submission_df = pd.DataFrame(preds)
submission_df
submission_df[1]

# 推薦リストを新しい列に展開する
result_df = submission_df[1].apply(pd.Series)
# 新しい列名を設定する
result_df.columns = [f'predict_{i}' for i in range(result_df.shape[1])]
# 最初の列にセッションIDを追加する
# result_df.insert(0, 'session_id', submission_df.index)
# 展開されたDataFrameを表示
result_df

Unnamed: 0,predict_0,predict_1,predict_2,predict_3,predict_4,predict_5,predict_6,predict_7,predict_8,predict_9
0,2808,4101,997,11882,12846,5289,3324,9208,9209,9207
1,8253,4488,8747,1586,3725,2259,11104,3564,2570,8225
2,9039,4355,4863,5238,1967,6722,7509,2957,13642,11724
3,626,13549,2272,1341,109,7812,755,11715,7872,13296
4,96,3894,7749,902,11380,5490,1284,254,12491,4072
...,...,...,...,...,...,...,...,...,...,...
288693,13079,10955,3725,13210,5719,10522,13717,1586,8677,2876
288694,963,4767,513,2900,8465,5299,6654,399,8703,3940
288695,7308,4040,3566,2087,7820,10364,9558,4398,12240,844
288696,10619,12500,570,11091,7551,3238,12781,12829,10616,11316


### 要素の集合にバグがないかを確認

In [45]:
candidate_ver15_train = pd.read_csv('../data/candidate_ver15_train.csv')

In [46]:
# 既存のDataFrameを読み込む
df1 = candidate_ver15_train
df2 = result_df#.drop(columns=['has_duplicates'])
# 各行を集合として保存するリストを初期化
sets_in_df1 = [set(row) for index, row in df1.iterrows()]
sets_in_df2 = [set(row) for index, row in df2.iterrows()]

# 互いに異なる行の件数をカウントする
unique_count = sum(1 for set1, set2 in zip(sets_in_df1, sets_in_df2) if set1 != set2)

# 結果を出力
print(unique_count)

805


### 順位変動のあったカラムの数を確認する

In [47]:
# 各要素が一致しているかどうかをチェック
diff = candidate_ver15_train != result_df

# 一致していない行のインデックスを抽出
mismatched_indices = diff.any(axis=1)
mismatched_rows = candidate_ver15_train[mismatched_indices]

print("一致していない行のインデックス：")
print(mismatched_indices)
print("一致していない行：")
print(len(mismatched_rows))

一致していない行のインデックス：
0         True
1         True
2         True
3         True
4         True
          ... 
288693    True
288694    True
288695    True
288696    True
288697    True
Length: 288698, dtype: bool
一致していない行：
287138


In [48]:
candidate_ver15_train[mismatched_indices].tail(20)

Unnamed: 0,predict_0,predict_1,predict_2,predict_3,predict_4,predict_5,predict_6,predict_7,predict_8,predict_9
288678,6808,237,13660,1601,13644,4612,828,6919,7035,9083
288679,7611,3077,8445,5070,10324,9722,5521,5532,589,9104
288680,2308,13369,1723,7246,13436,2145,5372,9020,13300,3844
288681,12785,385,10857,3811,11214,6217,109,6178,5066,3701
288682,11994,12767,10729,1891,11723,867,5623,3781,634,12620
288683,13302,726,7218,6051,1941,8127,1227,555,10946,2232
288684,1799,1913,6796,10037,9210,7631,5144,705,1701,9401
288685,552,5521,8655,925,8445,6223,9210,9202,9203,9204
288686,255,8156,11025,13423,8645,8689,12344,12599,10669,7860
288687,5372,9623,6655,5411,1868,462,12463,1448,1723,2180


In [49]:
result_df[mismatched_indices].tail(20)

Unnamed: 0,predict_0,predict_1,predict_2,predict_3,predict_4,predict_5,predict_6,predict_7,predict_8,predict_9
288678,13660,7035,6808,237,4612,828,9083,1601,6919,13644
288679,7611,3077,8445,9722,5521,10324,5532,589,9104,5070
288680,13300,2145,1723,7246,5372,3844,9020,2308,13436,13369
288681,12785,385,3811,10857,5066,109,11214,6217,6178,3701
288682,11994,12767,10729,867,11723,5623,634,1891,12620,3781
288683,7218,1941,13302,6051,726,8127,1227,10946,555,2232
288684,1701,10037,9401,1799,6796,1913,7631,705,5144,9210
288685,552,925,8445,8655,5521,6223,9210,9204,9202,9203
288686,11025,255,7860,13423,12344,8645,12599,8156,10669,8689
288687,1448,12463,6655,9623,462,1723,5372,5411,2180,1868


In [50]:
train_label.tail(20)

Unnamed: 0,session_id,yad_no
288678,fffc37cadd396eaa102e76d984b315de,237
288679,fffc41b2d7c3ebbb222d52af7aed9083,7611
288680,fffc46c700754a5930e83ce0a7df19c9,2145
288681,fffc996274862c136b754a5d591a3bb3,12785
288682,fffd17b181e26913f6c2ee48e7570a05,11994
288683,fffd2dd57d222b282bd98ec0b5773913,11243
288684,fffd3aa41954b0c90054dd58573109a1,6060
288685,fffd7802cfb01ed4fd26225816f3d794,925
288686,fffd8c4fa7b5be31f0bec4dee4ac6dc8,5209
288687,fffe2d9e5982f5267aacc0704e819bde,4885


In [51]:
y_true = train_label['yad_no'].values.reshape(-1, 1)
y_pred = result_df.values

In [52]:
# Recall(候補にあげたものが)のy_trueに含まれているものの割合

# マッチするかどうかをチェックするリストを初期化
matches = []

# y_pred の各サブアレイと y_true の値を比較
for pred, true in zip(y_pred, y_true):
    # pred はサブアレイ、true は値
    matches.append(int(true in pred))

# matches は、指定された条件に基づいて 0 と 1 のリストです
print(matches)

# matches は先のステップで生成した 0 と 1 のリストです。
# 1 の割合を計算
one_ratio = sum(matches) / len(matches)

# 割合をプリント
print("Recall is: {:.2%}".format(one_ratio))

[1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 

In [53]:
def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the AP at k between two lists of items.
    """
    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.
    This function computes the mean AP at k between two lists of list of items.
    """
    return np.mean([apk(a, p, k) for a, p in zip(actual, predicted)])

# y_true と y_pred をリストに変換
y_true_list = [list(x) for x in y_true]
y_pred_list = [list(x) for x in y_pred]

# MAP@10 を計算
map_at_10 = mapk(y_true_list, y_pred_list, k=10)
print(f'Map@10  {map_at_10}')

Map@10  0.4428552954826084


In [40]:
# 各行内で重複があるかどうかをチェック
def check_row_duplicates(row):
    # setを用いてユニークな要素のみを抽出し、その長さが元の行の長さと等しいかを比較
    return len(row) != len(set(row))
# check_row_duplicates関数を各行に適用し、結果を新しい列に格納
result_df['has_duplicates'] = result_df.apply(check_row_duplicates, axis=1)
# 重複がある行のみをフィルタリング
rows_with_duplicates = result_df[result_df['has_duplicates']]
# 結果を表示
print(len(rows_with_duplicates))

0


In [41]:
# 欠損がないかをチェック
result_df.isnull().sum(axis=1).sum()

0

In [67]:
result_df.head(20)

Unnamed: 0,predict_0,predict_1,predict_2,predict_3,predict_4,predict_5,predict_6,predict_7,predict_8,predict_9,has_duplicates
0,2808,4101,997,11882,12846,5289,3324,9208,9209,9207,False
1,8253,4488,8747,1586,3725,2259,11104,3564,2570,8225,False
2,9039,4355,4863,5238,1967,6722,7509,2957,13642,11724,False
3,626,13549,2272,1341,109,7812,755,11715,7872,13296,False
4,96,3894,7749,902,11380,5490,1284,254,12491,4072,False
5,4823,10510,12544,5369,9199,9207,9201,9202,9200,1,False
6,4574,7531,12774,2282,2480,7273,10378,441,13240,10442,False
7,10362,111,12962,12464,9508,7681,4744,12125,10544,1755,False
8,899,3644,1227,3802,4014,9723,13220,12432,2164,13702,False
9,3278,10478,6592,9773,3483,379,4303,2806,550,9430,False


In [70]:
train_label.head(20)

Unnamed: 0,session_id,yad_no
0,000007603d533d30453cc45d0f3d119f,4101
1,0000ca043ed437a1472c9d1d154eb49b,8253
2,0000d4835cf113316fe447e2f80ba1c8,4863
3,0000fcda1ae1b2f431e55a7075d1f500,1652
4,000104bdffaaad1a1e0a9ebacf585f33,96
5,00011afe25c343301ee961b3b0ce2e4d,4823
6,000125c737df1802b6e365f93c96d3c8,10378
7,0001763050a10b21062a1304fb743fd4,10362
8,000178c4d4d567d4715331dd0cdab76c,1227
9,0001e6a407a85dc50ac132a5c7016bab,175


### 推論用の候補テーブル作成

In [53]:
test_log_df = pd.read_csv('../test_log.csv')
test_session_df = pd.read_csv('../test_session.csv')

In [43]:
# catboost
cat_ver1_test = pd.read_csv('catRanker_ver1_test_pred.csv')
cat_ver2_test = pd.read_csv('catRanker_ver2_test_pred.csv')
cat_ver3_test = pd.read_csv('catRanker_ver3_test_pred.csv')

# lightgbm
lgb_ver8_test = pd.read_csv('lgbRanker_ver8_test_pred.csv')
lgb_ver9_test = pd.read_csv('lgbRanker_ver9_test_pred.csv')
lgb_ver10_test = pd.read_csv('lgbRanker_ver10_test_pred.csv')
lgb_ver11_test = pd.read_csv('lgbRanker_ver11_test_pred.csv')
lgb_ver12_test = pd.read_csv('lgbRanker_ver12_test_pred.csv')

# xgboost
xgb_ver1_test = pd.read_csv('xgbRanker_ver1_test_pred.csv')
xgb_ver2_test = pd.read_csv('xgbRanker_ver2_test_pred.csv')

In [None]:
preds = pd.DataFrame((cat_ver1_train['oof_pred'] + cat_ver2_train['oof_pred'] + cat_ver3_train['oof_pred'] + \
    lgb_ver8_train['oof_pred'] + lgb_ver9_train['oof_pred'] + lgb_ver10_train['oof_pred'] + lgb_ver11_train['oof_pred'] + lgb_ver12_train['oof_pred'] + \
        xgb_ver1_train['oof_pred'] + xgb_ver2_train['oof_pred']) / 10)

preds

In [45]:
cat_ver1_test

Unnamed: 0,session_id,yad_no,oof_pred
0,00001149e9c73985425197104712478c,11561,1.570790
1,00001149e9c73985425197104712478c,4714,0.677842
2,00001149e9c73985425197104712478c,2680,0.824470
3,00001149e9c73985425197104712478c,4420,-0.099888
4,00001149e9c73985425197104712478c,5466,0.028827
...,...,...,...
1683599,ffffe984aafd6127ce8e43e3ca40c79d,5623,0.565699
1683600,ffffe984aafd6127ce8e43e3ca40c79d,3781,0.544171
1683601,ffffe984aafd6127ce8e43e3ca40c79d,11994,0.449846
1683602,ffffe984aafd6127ce8e43e3ca40c79d,634,0.246095


In [46]:
preds_test = pd.DataFrame((cat_ver1_test['oof_pred'] + cat_ver2_test['oof_pred'] + cat_ver3_test['oof_pred'] + \
    lgb_ver8_test['oof_pred'] + lgb_ver9_test['oof_pred'] + lgb_ver10_test['oof_pred'] + lgb_ver11_test['oof_pred'] + lgb_ver12_test['oof_pred'] + \
        xgb_ver1_test['oof_pred'] + xgb_ver2_test['oof_pred']) / 10)

preds_test

Unnamed: 0,oof_pred
0,0.483822
1,-0.317892
2,0.211396
3,-0.616863
4,-0.367900
...,...
1683599,0.159987
1683600,0.132216
1683601,0.117556
1683602,0.095074


In [47]:
test_oof_pred_df = pd.concat([cat_ver1_test[['session_id', 'yad_no']], preds_test], axis=1)
test_oof_pred_df

Unnamed: 0,session_id,yad_no,oof_pred
0,00001149e9c73985425197104712478c,11561,0.483822
1,00001149e9c73985425197104712478c,4714,-0.317892
2,00001149e9c73985425197104712478c,2680,0.211396
3,00001149e9c73985425197104712478c,4420,-0.616863
4,00001149e9c73985425197104712478c,5466,-0.367900
...,...,...,...
1683599,ffffe984aafd6127ce8e43e3ca40c79d,5623,0.159987
1683600,ffffe984aafd6127ce8e43e3ca40c79d,3781,0.132216
1683601,ffffe984aafd6127ce8e43e3ca40c79d,11994,0.117556
1683602,ffffe984aafd6127ce8e43e3ca40c79d,634,0.095074


In [48]:
import pandas as pd

# oof_pred_dfを事前に処理しておきます。
# ここで'session_id'にインデックスを設定し、'oof_pred'でソートします。
# これにより、フィルタリングとソートのコストを削減できます。
test_oof_pred_df = test_oof_pred_df.sort_values(['session_id', 'oof_pred'], ascending=[True, False])
test_oof_pred_df.set_index('session_id', inplace=True)

In [49]:
%%time
def get_top_recommendations(session_id, df):
    """
    この関数は指定されたsession IDに基づいて、推薦確率に基づいた宿泊施設のリストを返します。

    パラメータ:
    session_id (str): 推薦をフィルタリングするためのセッションID。
    df (pd.DataFrame): 推薦が含まれているDataFrame。

    戻り値:
    list: 推薦確率に基づいてソートされた宿泊施設ID(`yad_no`)のリスト。
    """
    try:
        # インデックスを使ってデータを取得
        recommendations = df.loc[session_id, 'yad_no']
        # Seriesオブジェクトの値をリストに変換
        return recommendations.values.tolist()
    except KeyError:
        # 指定されたsession_idが存在しない場合は空のリストを返す
        return []


# 使用例:
session_id_to_check = 'ffffe984aafd6127ce8e43e3ca40c79d'
recommendations = get_top_recommendations(session_id_to_check, test_oof_pred_df)
print(recommendations)

[10729, 12767, 11723, 1891, 5623, 3781, 12620, 867, 11994, 634]
CPU times: user 92.1 ms, sys: 0 ns, total: 92.1 ms
Wall time: 91.8 ms


In [54]:
# セッションIDごとに yad_no のリストを作成
session_yadno_map = test_log_df.groupby('session_id')['yad_no'].apply(list).to_dict() #ここをtrain, testで変える  train_log_df, test_log_df


def generate_recommendation(i):
    
    # 推薦する宿を格納するリスト
    pred_recommendations = []
    
    # 指定セッションの宿を抽出
    session_id = test_session_df.loc[i, 'session_id']  #ここをtrain, testで変える  train_label_df, test_session_df
    yado_no = session_yadno_map.get(session_id, [])
    last_yado = yado_no[-1] if yado_no else None
    yado_no = pd.Series(yado_no).drop_duplicates().tolist()
    
    pred_recommendations.extend(yado_no) # 最後に訪問された宿を除外
    if last_yado in pred_recommendations:
        pred_recommendations.remove(last_yado)
    
    # 各セッションのcandidateを並べ替えたもの
    sorted_covisitation = get_top_recommendations(session_id=session_id, df=test_oof_pred_df)
    pred_recommendations.extend(sorted_covisitation)

    return i, pred_recommendations[:10]


# マルチプロセス処理の実行
results = []
with ProcessPoolExecutor(max_workers=16) as executor: # ここのmax_workersは手持ちのCPUのスレッド数によって適宜変える
    future_to_session = {executor.submit(generate_recommendation, i): i for i in range(len(test_session_df))}
    for future in tqdm(as_completed(future_to_session), total=len(test_session_df), desc="Processing recommendations"):
        index = future_to_session[future]
        result = future.result()
        results.append((index, result))

results.sort(key=lambda x: x[0])
preds = [result for i, result in results]

Processing recommendations: 100%|██████████| 174700/174700 [00:26<00:00, 6504.94it/s]


In [55]:
submission_df = pd.DataFrame(preds)
submission_df
submission_df[1]

# 推薦リストを新しい列に展開する
test_expanded_df = submission_df[1].apply(pd.Series)
# 新しい列名を設定する
test_expanded_df.columns = [f'predict_{i}' for i in range(test_expanded_df.shape[1])]
# 最初の列にセッションIDを追加する
# expanded_df.insert(0, 'session_id', submission_df.index)
# 展開されたDataFrameを表示
test_expanded_df

Unnamed: 0,predict_0,predict_1,predict_2,predict_3,predict_4,predict_5,predict_6,predict_7,predict_8,predict_9
0,3560,11561,2680,9830,4714,5466,4420,9534,5785,6563
1,8108,7014,11923,613,143,4066,6555,6129,11237,12862
2,757,9190,7710,9910,1774,410,6721,10485,6730,13570
3,12341,3359,6991,10861,13521,1542,5080,4180,6489,10746
4,10826,3476,9623,9020,2862,3854,12029,3844,5372,6161
...,...,...,...,...,...,...,...,...,...,...
174695,1997,7888,11123,5744,7062,2278,10997,10042,3440,9743
174696,899,3802,4014,1227,3644,9723,13220,12432,13702,2164
174697,12939,7308,13719,2087,13797,11796,11037,13241,8143,5810
174698,2373,2692,10287,3002,3100,4976,1687,5513,12281,13672


In [56]:
# 各行内で重複があるかどうかをチェック

def check_row_duplicates(row):
    # setを用いてユニークな要素のみを抽出し、その長さが元の行の長さと等しいかを比較
    return len(row) != len(set(row))
# check_row_duplicates関数を各行に適用し、結果を新しい列に格納
test_expanded_df['has_duplicates'] = test_expanded_df.apply(check_row_duplicates, axis=1)
# 重複がある行のみをフィルタリング
rows_with_duplicates = test_expanded_df[test_expanded_df['has_duplicates']]
# 結果を表示
print(len(rows_with_duplicates))

0


In [57]:
# 欠損がないかをチェック
test_expanded_df.isnull().sum(axis=1).sum()

0

In [58]:
candidate_ver15_test = pd.read_csv('../data/candidate_ver15_test.csv')
candidate_ver15_test

Unnamed: 0,predict_0,predict_1,predict_2,predict_3,predict_4,predict_5,predict_6,predict_7,predict_8,predict_9
0,3560,11561,4714,2680,4420,5466,9830,9534,6563,5785
1,143,4066,6555,7014,11923,8108,6129,613,11237,12862
2,757,9190,7710,1774,9910,10485,6721,410,13570,6730
3,12341,3359,6991,1542,13521,5080,10861,4180,10746,6489
4,9020,2862,10826,12029,3854,3476,9623,3844,5372,6161
...,...,...,...,...,...,...,...,...,...,...
174695,1997,7888,5744,11123,10997,7062,9743,2278,10042,3440
174696,13220,12432,899,4014,3802,1227,3644,2164,13702,9723
174697,13241,13797,13719,12939,7308,2087,8143,11796,11037,5810
174698,3100,10287,3002,2373,12281,4976,13672,2692,5513,1687


In [59]:
# 行ごとに要素の集合が異なっていないかをチェック
import pandas as pd

# 既存のDataFrameを読み込む
df1 = candidate_ver15_test
df2 = test_expanded_df.drop('has_duplicates', axis=1)
# 各行を集合として保存するリストを初期化
sets_in_df1 = [set(row) for index, row in df1.iterrows()]
sets_in_df2 = [set(row) for index, row in df2.iterrows()]

# 互いに異なる行の件数をカウントする
unique_count = sum(1 for set1, set2 in zip(sets_in_df1, sets_in_df2) if set1 != set2)

# 結果を出力
print(unique_count)

0


In [60]:
candidate_ver15_test

Unnamed: 0,predict_0,predict_1,predict_2,predict_3,predict_4,predict_5,predict_6,predict_7,predict_8,predict_9
0,3560,11561,4714,2680,4420,5466,9830,9534,6563,5785
1,143,4066,6555,7014,11923,8108,6129,613,11237,12862
2,757,9190,7710,1774,9910,10485,6721,410,13570,6730
3,12341,3359,6991,1542,13521,5080,10861,4180,10746,6489
4,9020,2862,10826,12029,3854,3476,9623,3844,5372,6161
...,...,...,...,...,...,...,...,...,...,...
174695,1997,7888,5744,11123,10997,7062,9743,2278,10042,3440
174696,13220,12432,899,4014,3802,1227,3644,2164,13702,9723
174697,13241,13797,13719,12939,7308,2087,8143,11796,11037,5810
174698,3100,10287,3002,2373,12281,4976,13672,2692,5513,1687


In [61]:
test_expanded_df

Unnamed: 0,predict_0,predict_1,predict_2,predict_3,predict_4,predict_5,predict_6,predict_7,predict_8,predict_9,has_duplicates
0,3560,11561,2680,9830,4714,5466,4420,9534,5785,6563,False
1,8108,7014,11923,613,143,4066,6555,6129,11237,12862,False
2,757,9190,7710,9910,1774,410,6721,10485,6730,13570,False
3,12341,3359,6991,10861,13521,1542,5080,4180,6489,10746,False
4,10826,3476,9623,9020,2862,3854,12029,3844,5372,6161,False
...,...,...,...,...,...,...,...,...,...,...,...
174695,1997,7888,11123,5744,7062,2278,10997,10042,3440,9743,False
174696,899,3802,4014,1227,3644,9723,13220,12432,13702,2164,False
174697,12939,7308,13719,2087,13797,11796,11037,13241,8143,5810,False
174698,2373,2692,10287,3002,3100,4976,1687,5513,12281,13672,False


In [62]:
test_expanded_df.drop(columns=['has_duplicates']).to_csv('../submissions/ensemble_ver1.csv', index=False)