In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import combinations
from tqdm import tqdm

In [36]:
# Load every input table from the parent directory.
# Session logs: the displayed frames show one row per (session_id, seq_no, yad_no).
train_log_df = pd.read_csv('../train_log.csv')
test_log_df = pd.read_csv('../test_log.csv')
# Labels for the training sessions (session_id -> yad_no).
train_label_df = pd.read_csv('../train_label.csv')
# Session ids to predict for; row order is reused later as the prediction order.
test_session_df = pd.read_csv('../test_session.csv')
# Per-accommodation attribute table.
yado_df = pd.read_csv('../yado.csv')
# Image embeddings (loaded here; not used by the cells shown in this notebook section).
image_embeddings_df = pd.read_parquet('../image_embeddings.parquet')
# Submission template with columns predict_0 .. predict_9.
sample_submission_df = pd.read_csv('../sample_submission.csv')

In [37]:
train_log_df

Unnamed: 0,session_id,seq_no,yad_no
0,000007603d533d30453cc45d0f3d119f,0,2395
1,0000ca043ed437a1472c9d1d154eb49b,0,13535
2,0000d4835cf113316fe447e2f80ba1c8,0,123
3,0000fcda1ae1b2f431e55a7075d1f500,0,8475
4,000104bdffaaad1a1e0a9ebacf585f33,0,96
...,...,...,...
419265,ffffcd5bc19d62cad5a3815c87818d83,0,12230
419266,ffffcd5bc19d62cad5a3815c87818d83,1,10619
419267,ffffcd5bc19d62cad5a3815c87818d83,2,12230
419268,fffffa7baf370083ebcdd98f26a7e31a,0,2439


In [38]:
train_label_df

Unnamed: 0,session_id,yad_no
0,000007603d533d30453cc45d0f3d119f,4101
1,0000ca043ed437a1472c9d1d154eb49b,8253
2,0000d4835cf113316fe447e2f80ba1c8,4863
3,0000fcda1ae1b2f431e55a7075d1f500,1652
4,000104bdffaaad1a1e0a9ebacf585f33,96
...,...,...
288693,ffff2262d38abdeb247ebd591835dcc9,2259
288694,ffff2360540745117193ecadcdc06538,963
288695,ffff7fb4617164b2604aaf51c40bf82d,13719
288696,ffffcd5bc19d62cad5a3815c87818d83,10619


In [39]:
train_label_df['session_id'].nunique()

288698

In [40]:
test_log_df

Unnamed: 0,session_id,seq_no,yad_no
0,00001149e9c73985425197104712478c,0,3560
1,00001149e9c73985425197104712478c,1,1959
2,0000e02747d749a52b7736dfa751e258,0,11984
3,0000f17ae2628237d78d3a38b009d3be,0,757
4,0000f17ae2628237d78d3a38b009d3be,1,8922
...,...,...,...
250300,fffee3199ef94b92283239cd5e3534fa,1,8336
250301,ffff62c6bb49bc9c0fbcf08494a4869c,0,12062
250302,ffff9a7dcc892875c7a8b821fa436228,0,8989
250303,ffffb1d30300fe17f661941fd085b04b,0,6030


In [41]:
pd.concat([train_log_df, test_log_df], axis=0, ignore_index=True)

Unnamed: 0,session_id,seq_no,yad_no
0,000007603d533d30453cc45d0f3d119f,0,2395
1,0000ca043ed437a1472c9d1d154eb49b,0,13535
2,0000d4835cf113316fe447e2f80ba1c8,0,123
3,0000fcda1ae1b2f431e55a7075d1f500,0,8475
4,000104bdffaaad1a1e0a9ebacf585f33,0,96
...,...,...,...
669570,fffee3199ef94b92283239cd5e3534fa,1,8336
669571,ffff62c6bb49bc9c0fbcf08494a4869c,0,12062
669572,ffff9a7dcc892875c7a8b821fa436228,0,8989
669573,ffffb1d30300fe17f661941fd085b04b,0,6030


In [42]:
# Build the co-visitation matrix from the concatenated train + test logs
def create_covisitation_matrix(combined_log_df, symmetric=False, include_self_pairs=True):
    """Count how often two accommodations appear in the same session.

    For each session the ``yad_no`` values are read in row order and every
    2-combination of that list is counted. ``itertools.combinations`` works on
    positions, not values, which has two consequences with the default
    arguments:

    * pairs are directional -- ``(a, b)`` is only counted when ``a`` occurs
      before ``b`` in the session, so the matrix is generally not symmetric
      and an ID that only ever occurs last in a session has no row;
    * an ID that is visited more than once in a session produces a self-pair
      ``(a, a)``, which shows up on the diagonal.

    Args:
        combined_log_df: Log frame with ``session_id`` and ``yad_no`` columns.
        symmetric: When True, every pair of two different IDs is also counted
            in the reverse direction, giving a symmetric matrix. Defaults to
            False (directional counts).
        include_self_pairs: When False, ``(a, a)`` pairs are dropped before
            counting so the diagonal stays empty. Defaults to True.

    Returns:
        DataFrame indexed by ``yad_no1`` with one column per ``yad_no2``.
        Cells hold the pair count; pairs that were never observed are 0.
    """
    # Collect every within-session pair (row order inside a session is kept)
    pairs = []
    for _, session_rows in tqdm(combined_log_df.groupby('session_id'), desc="Processing sessions"):
        visited = session_rows['yad_no'].tolist()
        pairs.extend(combinations(visited, 2))

    pairs_df = pd.DataFrame(pairs, columns=['yad_no1', 'yad_no2'])

    if not include_self_pairs:
        pairs_df = pairs_df[pairs_df['yad_no1'] != pairs_df['yad_no2']]

    if symmetric:
        # Mirror only pairs of distinct IDs; mirroring (a, a) would double-count it.
        distinct_pairs = pairs_df[pairs_df['yad_no1'] != pairs_df['yad_no2']]
        mirrored = distinct_pairs.rename(columns={'yad_no1': 'yad_no2', 'yad_no2': 'yad_no1'})
        # concat aligns on column names, so the swapped frame lands in the right columns
        pairs_df = pd.concat([pairs_df, mirrored], ignore_index=True)

    # Count occurrences of each (yad_no1, yad_no2) pair
    covisitation_counts = pairs_df.groupby(['yad_no1', 'yad_no2']).size().reset_index(name='count')

    # Reshape to a yad_no1 x yad_no2 table, filling unobserved pairs with 0
    covisitation_matrix = covisitation_counts.pivot(index='yad_no1', columns='yad_no2', values='count').fillna(0)

    return covisitation_matrix

# Concatenate train_log_df and test_log_df into a single log frame
combined_log_df = pd.concat([train_log_df, test_log_df], axis=0, ignore_index=True)

# Build the co-visitation matrix from the combined log
covisitation_matrix = create_covisitation_matrix(combined_log_df)
covisitation_matrix

Processing sessions: 100%|██████████| 463398/463398 [00:28<00:00, 16380.75it/s]


yad_no2,1,2,3,4,5,7,9,10,12,15,...,13796,13797,13798,13799,13800,13801,13803,13804,13805,13806
yad_no1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,19.0,0.0,0.0,0.0,0.0,0.0
13801,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0
13803,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
13804,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0


In [43]:
covisitation_matrix

yad_no2,1,2,3,4,5,7,9,10,12,15,...,13796,13797,13798,13799,13800,13801,13803,13804,13805,13806
yad_no1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,19.0,0.0,0.0,0.0,0.0,0.0
13801,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0
13803,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
13804,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0


In [44]:
yado_df

Unnamed: 0,yad_no,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,wid_cd,ken_cd,lrg_cd,sml_cd
0,1,0,129.0,1.0,0,1.0,,,1.0,f0112abf369fb03cdc5f5309300913da,072c85e1653e10c9c7dd065ad007125a,449c52ef581d5f9ef311189469a0520e,677a32689cd1ad74e867f1fbe43a3e1c
1,2,0,23.0,1.0,0,,,,,d86102dd9c232bade9a97dccad40df48,b4d2fb4e51ea7bca80eb1270aa474a54,5c9a8f48e9df0234da012747a02d4b29,4ee16ee838dd2703cc9a1d5a535f0ced
2,3,0,167.0,1.0,1,1.0,,,1.0,46e33861f921c3e38b81998fbf283f01,572d60f0f5212aacda515ebf81fb0a3a,8a623b960557e87bd1f4edf71b6255be,ab9480fd72a44d51690ab16c4ad4d49c
3,4,0,144.0,1.0,0,1.0,,,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,52c9ea83f2cfe92be54cb6bc961edf21,1cc3e1838bb0fd0fde0396130b1f82b9
4,5,0,41.0,1.0,1,,,,,43875109d1dab93592812c50d18270a7,75617bb07a2785a948ab1958909211f1,9ea5a911019b66ccd42f556c42a2fe2f,be1b876af18afc4deeb3081591d2a910
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13801,13802,0,10.0,1.0,1,,,,,c312e07b7a5d456d53a5b00910a336e1,558ac1909f0318b82c621ab250329d6d,80fb3c5ad0c89931d0923e9f80885218,5eb30820716082c720836733d73c605e
13802,13803,0,,,0,1.0,,,1.0,dc414a17890cfc17d011d5038b88ca93,d78f53d0856617bc782f02c3280dfef2,e5cfcc0a43c82072aca11628ff0add53,20ad8785a30f125bee5a8a325782ab06
13803,13804,0,80.0,1.0,1,,1.0,,1.0,d86102dd9c232bade9a97dccad40df48,7d76599bd27ff9e7823b2b1323ca763e,c5fe8848b6ab39b040cdb3668aea9433,b3eab50ccf6ffb51c37d36ee384abfbf
13804,13805,0,8.0,1.0,1,,,,1.0,3300cf6f774b7c6a5807110f244cbc21,689cf8289e7ea0b2eef1b017dcdfe8de,8b712435430a6875839a6c3b5a40b008,2b4165444a777465576b25f65697d739


In [45]:
# Step 1: look up the accommodations viewed in a given session
def get_session_accommodations(session_id, train_log_df):
    """Return the distinct ``yad_no`` values logged for ``session_id``.

    Args:
        session_id: Value to match against the ``session_id`` column.
        train_log_df: Log frame with ``session_id`` and ``yad_no`` columns.

    Returns:
        Array of unique ``yad_no`` values in order of first appearance;
        empty when the session does not occur in the frame.
    """
    in_session = train_log_df['session_id'] == session_id
    return train_log_df.loc[in_session, 'yad_no'].unique()

# Sanity check: list the distinct accommodations logged for one training session
get_session_accommodation = get_session_accommodations(session_id='000007603d533d30453cc45d0f3d119f', train_log_df=train_log_df)
print(get_session_accommodation)

[2395]


In [46]:
"""
# 重複が生じるやり方
# ステップ 2: 訪問行列に基づく推薦
def get_recommendations_from_covisitation(yad_no, covisitation_matrix, num_recommendations=10):
    # 特定の宿に関連する宿の共訪問回数を取得
    related_accommodations = covisitation_matrix.loc[yad_no].sort_values(ascending=False)

    # トップの推薦を選出
    top_recommendations = related_accommodations.head(num_recommendations).index.tolist()
    return top_recommendations
"""

def get_recommendations_from_covisitation(yad_no, covisitation_matrix, exclude_list=None, num_recommendations=10):
    """Return the accommodations most frequently co-visited with ``yad_no``.

    Args:
        yad_no: Row label to look up in ``covisitation_matrix``.
        covisitation_matrix: DataFrame of pair counts whose rows and columns
            are labelled with accommodation IDs.
        exclude_list: Optional iterable of IDs that must not be returned
            (e.g. candidates that were already selected). ``None`` excludes
            nothing.
        num_recommendations: Maximum number of IDs to return. Values <= 0
            yield an empty list.

    Returns:
        List of column labels ordered by descending count, skipping excluded
        IDs. Columns with a count of 0 are not filtered out, so when fewer
        than ``num_recommendations`` real co-visits exist the tail of the
        list is zero-count padding.

    Raises:
        KeyError: If ``yad_no`` is not a row label of ``covisitation_matrix``.
    """
    if num_recommendations <= 0:
        return []

    # Snapshot into a set: tolerates None and gives O(1) membership checks
    excluded = set(exclude_list) if exclude_list is not None else set()

    # Co-visit counts for this accommodation, most frequent first
    related_accommodations = covisitation_matrix.loc[yad_no].sort_values(ascending=False)

    # Walk the ranking, skipping excluded IDs, until enough candidates are found
    top_recommendations = []
    for accommodation in related_accommodations.index:
        if accommodation in excluded:
            continue
        top_recommendations.append(accommodation)
        if len(top_recommendations) >= num_recommendations:
            break

    return top_recommendations

# Example: recommendations for accommodation ID 3338 (covisitation_matrix was built in an earlier cell)
recommendations = get_recommendations_from_covisitation(3338, covisitation_matrix, exclude_list=[])
print(recommendations)

[12350, 3338, 915, 13468, 532, 8553, 3848, 10118, 4913, 1091]


In [47]:
# Step 3: fill remaining slots using overall popularity
def get_popular_accommodations(train_log_df, exclude_list, num_recommendations=10):
    """Return the most frequently logged accommodations, skipping exclusions.

    Args:
        train_log_df: Log frame with a ``yad_no`` column.
        exclude_list: Container of IDs that must not be returned.
        num_recommendations: Number of IDs to keep from the front of the
            filtered ranking.

    Returns:
        List of ``yad_no`` values ordered by descending occurrence count.
    """
    # value_counts() already orders IDs from most to least frequent
    ranked_ids = train_log_df['yad_no'].value_counts().index.tolist()

    # Drop anything the caller has already selected
    eligible = list(filter(lambda candidate: candidate not in exclude_list, ranked_ids))

    return eligible[:num_recommendations]

# Example: fetch fallback recommendations given an exclusion list (candidates already chosen)
# exclude_list = [100, 101, 102]  # IDs of accommodations that were already selected
additional_recommendations = get_popular_accommodations(train_log_df, exclude_list=[])
print(additional_recommendations)

[3338, 12350, 10095, 13468, 8567, 719, 532, 8553, 915, 3848]


In [48]:
test_session_df

Unnamed: 0,session_id
0,00001149e9c73985425197104712478c
1,0000e02747d749a52b7736dfa751e258
2,0000f17ae2628237d78d3a38b009d3be
3,000174a6f7a569b84c5575760d2e9664
4,00017e2a527901c9c41b1acef525d016
...,...
174695,fffee3199ef94b92283239cd5e3534fa
174696,ffff62c6bb49bc9c0fbcf08494a4869c
174697,ffff9a7dcc892875c7a8b821fa436228
174698,ffffb1d30300fe17f661941fd085b04b


In [49]:
test_log_df

Unnamed: 0,session_id,seq_no,yad_no
0,00001149e9c73985425197104712478c,0,3560
1,00001149e9c73985425197104712478c,1,1959
2,0000e02747d749a52b7736dfa751e258,0,11984
3,0000f17ae2628237d78d3a38b009d3be,0,757
4,0000f17ae2628237d78d3a38b009d3be,1,8922
...,...,...,...
250300,fffee3199ef94b92283239cd5e3534fa,1,8336
250301,ffff62c6bb49bc9c0fbcf08494a4869c,0,12062
250302,ffff9a7dcc892875c7a8b821fa436228,0,8989
250303,ffffb1d30300fe17f661941fd085b04b,0,6030


In [50]:
# 注意！！！テスト用
# マルチスレッドでの処理(チェック用の1万件)
"""
import concurrent.futures
from tqdm import tqdm

def process_recommendation(i):
    pred_recommendations = []

    session_id = test_session_df.loc[i, 'session_id']
    yado_no = test_log_df[test_log_df['session_id'] == session_id]['yad_no'].unique()
    pred_recommendations.extend(yado_no)  # 既に訪問した宿を推薦候補に追加

    for y in yado_no:
        try:
            yado_covisit = get_recommendations_from_covisitation(y, covisitation_matrix)
            pred_recommendations.extend(yado_covisit)  # 共訪問行列に基づく推薦を追加
        except:  # 共訪問行列に存在しない宿はスキップ
            pass

    additional_recommendations = get_popular_accommodations(train_log_df, exclude_list=pred_recommendations)  # 全体の人気順に基づく補完
    pred_recommendations.extend(additional_recommendations)  # 補完した推薦を追加

    return pred_recommendations[:10]  # トップ10の推薦を選出

# マルチスレッドでの処理
preds = []
with concurrent.futures.ThreadPoolExecutor() as executor:
    # tqdmを使って進捗を表示
    results = list(tqdm(executor.map(process_recommendation, range(10000)), total=10000))
    preds.extend(results)
"""

"\nimport concurrent.futures\nfrom tqdm import tqdm\n\ndef process_recommendation(i):\n    pred_recommendations = []\n\n    session_id = test_session_df.loc[i, 'session_id']\n    yado_no = test_log_df[test_log_df['session_id'] == session_id]['yad_no'].unique()\n    pred_recommendations.extend(yado_no)  # 既に訪問した宿を推薦候補に追加\n\n    for y in yado_no:\n        try:\n            yado_covisit = get_recommendations_from_covisitation(y, covisitation_matrix)\n            pred_recommendations.extend(yado_covisit)  # 共訪問行列に基づく推薦を追加\n        except:  # 共訪問行列に存在しない宿はスキップ\n            pass\n\n    additional_recommendations = get_popular_accommodations(train_log_df, exclude_list=pred_recommendations)  # 全体の人気順に基づく補完\n    pred_recommendations.extend(additional_recommendations)  # 補完した推薦を追加\n\n    return pred_recommendations[:10]  # トップ10の推薦を選出\n\n# マルチスレッドでの処理\npreds = []\nwith concurrent.futures.ThreadPoolExecutor() as executor:\n    # tqdmを使って進捗を表示\n    results = list(tqdm(executor.map(process_recommen

In [51]:
"""
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

def generate_recommendation(i):
    pred_recommendations = []
    
    session_id = test_session_df.loc[i, 'session_id']
    yado_no = test_log_df[test_log_df['session_id'] == session_id]['yad_no'].unique()
    pred_recommendations.extend(yado_no)  # 既に訪問した宿を推薦候補に追加
    
    for y in yado_no:
        try:
            yado_covisit = get_recommendations_from_covisitation(y, covisitation_matrix)
            pred_recommendations.extend(yado_covisit)  # 共訪問行列に基づく推薦を追加
        except:  # 共訪問行列に存在しない宿はスキップ
            pass
    
    additional_recommendations = get_popular_accommodations(train_log_df, exclude_list=pred_recommendations)  # 全体の人気順に基づく補完
    pred_recommendations.extend(additional_recommendations)  # 補完した推薦を追加
    
    return pred_recommendations[:10]  # トップ10の推薦を選出


# マルチスレッドでの推薦処理
preds = []
with ThreadPoolExecutor(max_workers=16) as executor:
    # すべてのセッションに対するFutureを辞書として保持
    future_to_session = {executor.submit(generate_recommendation, i): i for i in range(len(test_session_df))}
    # as_completed を使って完了したタスクの結果を取得し、tqdm で進捗を表示
    for future in tqdm(as_completed(future_to_session), total=len(test_session_df), desc="Processing recommendations"):
        preds.append(future.result())
"""

'\nfrom concurrent.futures import ThreadPoolExecutor, as_completed\nfrom tqdm import tqdm\n\ndef generate_recommendation(i):\n    pred_recommendations = []\n    \n    session_id = test_session_df.loc[i, \'session_id\']\n    yado_no = test_log_df[test_log_df[\'session_id\'] == session_id][\'yad_no\'].unique()\n    pred_recommendations.extend(yado_no)  # 既に訪問した宿を推薦候補に追加\n    \n    for y in yado_no:\n        try:\n            yado_covisit = get_recommendations_from_covisitation(y, covisitation_matrix)\n            pred_recommendations.extend(yado_covisit)  # 共訪問行列に基づく推薦を追加\n        except:  # 共訪問行列に存在しない宿はスキップ\n            pass\n    \n    additional_recommendations = get_popular_accommodations(train_log_df, exclude_list=pred_recommendations)  # 全体の人気順に基づく補完\n    pred_recommendations.extend(additional_recommendations)  # 補完した推薦を追加\n    \n    return pred_recommendations[:10]  # トップ10の推薦を選出\n\n\n# マルチスレッドでの推薦処理\npreds = []\nwith ThreadPoolExecutor(max_workers=16) as executor:\n    # すべてのセッショ

In [52]:
"""
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

def generate_recommendation(i):
    pred_recommendations = []
    
    session_id = test_session_df.loc[i, 'session_id']
    yado_no = test_log_df[test_log_df['session_id'] == session_id]['yad_no'].unique()
    pred_recommendations.extend(yado_no)  # 既に訪問した宿を推薦候補に追加
    
    for y in yado_no:
        try:
            yado_covisit = get_recommendations_from_covisitation(y, covisitation_matrix)
            pred_recommendations.extend(yado_covisit)  # 共訪問行列に基づく推薦を追加
        except:  # 共訪問行列に存在しない宿はスキップ
            pass
    
    additional_recommendations = get_popular_accommodations(train_log_df, exclude_list=pred_recommendations)  # 全体の人気順に基づく補完
    pred_recommendations.extend(additional_recommendations)  # 補完した推薦を追加
    
    return i, pred_recommendations[:10]  # トップ10の推薦を選出


results = []
with ThreadPoolExecutor(max_workers=16) as executor:
    future_to_session = {executor.submit(generate_recommendation, i): i for i in range(len(test_session_df[:1000]))}
    for future in tqdm(as_completed(future_to_session), total=len(test_session_df[:1000]), desc="Processing recommendations"):
        index = future_to_session[future]
        result = future.result()
        results.append((index, result))

# 結果をインデックスに基づいて並び替え
results.sort(key=lambda x: x[0])
preds = [result for i, result in results]
"""

'\nfrom concurrent.futures import ThreadPoolExecutor, as_completed\nfrom tqdm import tqdm\n\ndef generate_recommendation(i):\n    pred_recommendations = []\n    \n    session_id = test_session_df.loc[i, \'session_id\']\n    yado_no = test_log_df[test_log_df[\'session_id\'] == session_id][\'yad_no\'].unique()\n    pred_recommendations.extend(yado_no)  # 既に訪問した宿を推薦候補に追加\n    \n    for y in yado_no:\n        try:\n            yado_covisit = get_recommendations_from_covisitation(y, covisitation_matrix)\n            pred_recommendations.extend(yado_covisit)  # 共訪問行列に基づく推薦を追加\n        except:  # 共訪問行列に存在しない宿はスキップ\n            pass\n    \n    additional_recommendations = get_popular_accommodations(train_log_df, exclude_list=pred_recommendations)  # 全体の人気順に基づく補完\n    pred_recommendations.extend(additional_recommendations)  # 補完した推薦を追加\n    \n    return i, pred_recommendations[:10]  # トップ10の推薦を選出\n\n\nresults = []\nwith ThreadPoolExecutor(max_workers=16) as executor:\n    future_to_session = {e

In [53]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

def generate_recommendation(i):
    """Build the top-10 candidate list for the ``i``-th row of ``test_session_df``.

    Candidates are collected in this order, without repeats:
      1. accommodations already viewed in the session (in order of first view),
      2. co-visitation recommendations for each viewed accommodation,
      3. globally popular accommodations from ``train_log_df`` as a fallback.

    Reads the notebook-level ``test_session_df``, ``test_log_df``,
    ``covisitation_matrix`` and ``train_log_df``.

    Args:
        i: Row label in ``test_session_df``.

    Returns:
        Tuple ``(i, recommendations)`` where ``recommendations`` is a list of
        at most 10 accommodation IDs.
    """
    session_id = test_session_df.loc[i, 'session_id']
    yado_no = test_log_df[test_log_df['session_id'] == session_id]['yad_no'].unique()

    # Start with the accommodations the session has already viewed
    pred_recommendations = list(yado_no)

    for y in yado_no:
        try:
            # Pass the running list so nothing already selected is suggested again
            yado_covisit = get_recommendations_from_covisitation(y, covisitation_matrix, exclude_list=pred_recommendations)
        except KeyError:
            # Accommodation has no row in the co-visitation matrix: skip it.
            # Only KeyError is ignored so that genuine bugs still surface.
            continue
        pred_recommendations.extend(yado_covisit)

    # Top up with overall popularity, again excluding what is already selected
    additional_recommendations = get_popular_accommodations(train_log_df, exclude_list=pred_recommendations)
    pred_recommendations.extend(additional_recommendations)

    return i, pred_recommendations[:10]  # keep the first 10 candidates


# Fan the per-session work out over a thread pool and gather results as they finish.
n_sessions = len(test_session_df)
results = []
with ThreadPoolExecutor(max_workers=16) as pool:
    # Map each submitted future back to the session position it was created for
    pending = {}
    for position in range(n_sessions):
        pending[pool.submit(generate_recommendation, position)] = position

    progress = tqdm(as_completed(pending), total=n_sessions, desc="Processing recommendations")
    for finished in progress:
        results.append((pending[finished], finished.result()))

# Futures complete in arbitrary order, so restore the original session order
results.sort(key=lambda entry: entry[0])
# NOTE: generate_recommendation itself returns (i, recommendations), so every
# element of `preds` is that tuple rather than a bare list of IDs.
preds = [payload for _, payload in results]

In [None]:
"""
# テスト用のデータフレームを作成
submission_df = pd.DataFrame(preds)
submission_df.columns =['predict_0', 'predict_1', 'predict_2', 'predict_3', 'predict_4', 'predict_5', 'predict_6', 'predict_7', 'predict_8', 'predict_9']
submission_df
"""

Unnamed: 0,predict_0,predict_1,predict_2,predict_3,predict_4,predict_5,predict_6,predict_7,predict_8,predict_9
0,3560,1959,4545,9534,3560,1959,6563,4420,5785,5466
1,11984,143,11984,4066,6555,8108,7014,613,11237,6129
2,757,8922,757,7710,9190,410,8922,9910,1774,10485
3,13610,12341,12341,3359,1542,13610,6991,10861,4180,13521
4,4621,9020,2862,10826,4621,1448,6161,6126,9623,11480
...,...,...,...,...,...,...,...,...,...,...
9995,2914,7490,7490,2914,785,11893,5358,9429,5350,4440
9996,4759,12852,11064,2071,13213,4157,4443,4759,12852,9185
9997,12962,6074,12962,13106,4065,3329,7036,7681,9508,12464
9998,2658,9347,3186,6395,7536,4940,8286,6133,9126,7692


In [None]:
"""
submission_df = pd.DataFrame(preds)
submission_df.columns =['predict_0', 'predict_1', 'predict_2', 'predict_3', 'predict_4', 'predict_5', 'predict_6', 'predict_7', 'predict_8', 'predict_9']
submission_df
"""


Unnamed: 0,predict_0,predict_1,predict_2,predict_3,predict_4,predict_5,predict_6,predict_7,predict_8,predict_9
0,8271,3662,9026,1465,5228,5849,3833,1696,8197,5031
1,6290,7197,12470,7197,12771,4389,8816,11264,6342,5099
2,6960,10196,6960,10586,1553,1547,12980,5952,9598,12249
3,4155,1705,7445,4155,994,8406,10496,3504,12790,9185
4,576,10209,1,9176,9177,9178,9180,9181,9182,9183
...,...,...,...,...,...,...,...,...,...,...
174695,8250,12767,10729,8250,1891,11723,867,3781,8648,5623
174696,6030,3100,6030,13672,2373,3002,5513,1687,4976,12281
174697,3720,10774,7222,10412,3720,10621,4507,13736,5393,6178
174698,11561,2911,2680,11561,1959,5785,9534,6563,5466,6058


In [None]:
"""
submission_df = pd.DataFrame(preds)
submission_df
"""

Unnamed: 0,0,1
0,0,"[3560, 1959, 4545, 9534, 3560, 1959, 6563, 442..."
1,1,"[11984, 143, 11984, 4066, 6555, 8108, 7014, 61..."
2,2,"[757, 8922, 757, 7710, 9190, 410, 8922, 9910, ..."
3,3,"[13610, 12341, 12341, 3359, 1542, 13610, 6991,..."
4,4,"[4621, 9020, 2862, 10826, 4621, 1448, 6161, 61..."
...,...,...
995,995,"[2355, 9623, 1448, 2179, 854, 13235, 5411, 121..."
996,996,"[1405, 3954, 7893, 3954, 7893, 3754, 1405, 761..."
997,997,"[13116, 423, 13116, 8809, 364, 9938, 1085, 102..."
998,998,"[7861, 13198, 523, 7861, 13058, 8335, 3477, 62..."


In [None]:
# submission_df[1]

0      [3560, 1959, 4545, 9534, 3560, 1959, 6563, 442...
1      [11984, 143, 11984, 4066, 6555, 8108, 7014, 61...
2      [757, 8922, 757, 7710, 9190, 410, 8922, 9910, ...
3      [13610, 12341, 12341, 3359, 1542, 13610, 6991,...
4      [4621, 9020, 2862, 10826, 4621, 1448, 6161, 61...
                             ...                        
995    [2355, 9623, 1448, 2179, 854, 13235, 5411, 121...
996    [1405, 3954, 7893, 3954, 7893, 3754, 1405, 761...
997    [13116, 423, 13116, 8809, 364, 9938, 1085, 102...
998    [7861, 13198, 523, 7861, 13058, 8335, 3477, 62...
999    [3347, 5898, 1, 9185, 9175, 9176, 9177, 9178, ...
Name: 1, Length: 1000, dtype: object

In [None]:
"""
# 推薦リストを新しい列に展開する
expanded_df = submission_df[1].apply(pd.Series)
# 新しい列名を設定する
expanded_df.columns = [f'predict_{i}' for i in range(expanded_df.shape[1])]
# 最初の列にセッションIDを追加する
# expanded_df.insert(0, 'session_id', submission_df.index)
# 展開されたDataFrameを表示
expanded_df.head()
"""

Unnamed: 0,predict_0,predict_1,predict_2,predict_3,predict_4,predict_5,predict_6,predict_7,predict_8,predict_9
0,3560,1959,4545,9534,3560,1959,6563,4420,5785,5466
1,11984,143,11984,4066,6555,8108,7014,613,11237,6129
2,757,8922,757,7710,9190,410,8922,9910,1774,10485
3,13610,12341,12341,3359,1542,13610,6991,10861,4180,13521
4,4621,9020,2862,10826,4621,1448,6161,6126,9623,11480


In [None]:
"""
submission_df = pd.DataFrame(preds)
submission_df
submission_df[1]

# 推薦リストを新しい列に展開する
expanded_df = submission_df[1].apply(pd.Series)
# 新しい列名を設定する
expanded_df.columns = [f'predict_{i}' for i in range(expanded_df.shape[1])]
# 最初の列にセッションIDを追加する
# expanded_df.insert(0, 'session_id', submission_df.index)
# 展開されたDataFrameを表示
expanded_df.head()
"""

Unnamed: 0,predict_0,predict_1,predict_2,predict_3,predict_4,predict_5,predict_6,predict_7,predict_8,predict_9
0,3560,1959,4545,9534,6563,4420,5785,5466,6488,7461
1,11984,143,4066,6555,8108,7014,613,11237,6129,12862
2,757,8922,7710,9190,410,9910,1774,10485,3400,10104
3,13610,12341,3359,1542,6991,10861,4180,13521,6489,9319
4,4621,9020,2862,10826,1448,6161,6126,9623,11480,3854


In [None]:
# Every element of `preds` is an (index, recommendations) pair, so column 0 is
# the session position and column 1 holds the recommendation lists.
submission_df = pd.DataFrame(preds)

# Expand each recommendation list into one column per rank in a single
# construction (row-wise .apply(pd.Series) builds one Series per row and is slow).
expanded_df = pd.DataFrame(submission_df[1].tolist(), index=submission_df.index)
# Name the columns predict_0, predict_1, ...
expanded_df.columns = [f'predict_{i}' for i in range(expanded_df.shape[1])]
# Optionally put the session position in the first column
# expanded_df.insert(0, 'session_id', submission_df.index)
# Preview the expanded frame
expanded_df.head()

Unnamed: 0,predict_0,predict_1,predict_2,predict_3,predict_4,predict_5,predict_6,predict_7,predict_8,predict_9
0,3560,1959,4545,9534,6563,4420,5785,5466,6488,7461
1,11984,143,4066,6555,8108,7014,613,11237,6129,12862
2,757,8922,7710,9190,410,9910,1774,10485,3400,10104
3,13610,12341,3359,1542,6991,10861,4180,13521,6489,9319
4,4621,9020,2862,10826,1448,6161,6126,9623,11480,3854


In [None]:
expanded_df

Unnamed: 0,predict_0,predict_1,predict_2,predict_3,predict_4,predict_5,predict_6,predict_7,predict_8,predict_9
0,3560,1959,4545,9534,6563,4420,5785,5466,6488,7461
1,11984,143,4066,6555,8108,7014,613,11237,6129,12862
2,757,8922,7710,9190,410,9910,1774,10485,3400,10104
3,13610,12341,3359,1542,6991,10861,4180,13521,6489,9319
4,4621,9020,2862,10826,1448,6161,6126,9623,11480,3854
...,...,...,...,...,...,...,...,...,...,...
174695,1997,8336,1885,7888,11123,8771,7641,831,2278,6282
174696,12062,12432,899,13220,4014,3802,1227,2164,448,9723
174697,8989,13241,13797,1,9186,9176,9177,9178,9180,9181
174698,6030,3100,13672,2373,3002,5513,1687,4976,12281,10287


In [None]:
test_log_df # 参考

Unnamed: 0,session_id,seq_no,yad_no
0,00001149e9c73985425197104712478c,0,3560
1,00001149e9c73985425197104712478c,1,1959
2,0000e02747d749a52b7736dfa751e258,0,11984
3,0000f17ae2628237d78d3a38b009d3be,0,757
4,0000f17ae2628237d78d3a38b009d3be,1,8922
...,...,...,...
250300,fffee3199ef94b92283239cd5e3534fa,1,8336
250301,ffff62c6bb49bc9c0fbcf08494a4869c,0,12062
250302,ffff9a7dcc892875c7a8b821fa436228,0,8989
250303,ffffb1d30300fe17f661941fd085b04b,0,6030


In [None]:
# Check how often accommodation 3338 is recommended:
# count the rows of the frame that contain that value in any column.
row_has_3338 = expanded_df.eq(3338).any(axis=1)
count = row_has_3338.sum()

print(f"3338を含む行の数: {count}")

3338を含む行の数: 6851


In [None]:
# Helper that detects repeated values within a single row
def check_row_duplicates(row):
    """Return True when ``row`` contains at least one repeated value."""
    # A set keeps one copy of each value, so it is shorter than the row
    # exactly when something is repeated.
    distinct_values = set(row)
    return len(distinct_values) < len(row)

# Apply check_row_duplicates to every row and store the result in a new column.
# Only the prediction columns are checked: if this cell is re-run, the
# 'has_duplicates' flag written by the previous run must not be treated as one
# of the row's values.
prediction_columns = expanded_df.drop(columns=['has_duplicates'], errors='ignore')
expanded_df['has_duplicates'] = prediction_columns.apply(check_row_duplicates, axis=1)

# Keep only the rows that contain a repeated recommendation
rows_with_duplicates = expanded_df[expanded_df['has_duplicates']]

# Show how many such rows exist
print(len(rows_with_duplicates))

0


In [3]:
# Write the prediction columns to disk: the helper flag column is dropped and the row index is not written.
expanded_df.drop(columns=['has_duplicates']).to_csv('../submissions/candidate_ver1_1.csv', index=False)

In [2]:
"""
import pandas as pd
expanded_df = pd.read_csv('../submissions/candidate_ver1.csv')
expanded_df
"""

Unnamed: 0,predict_0,predict_1,predict_2,predict_3,predict_4,predict_5,predict_6,predict_7,predict_8,predict_9,has_duplicates
0,3560,1959,4545,9534,6563,4420,5785,5466,6488,7461,False
1,11984,143,4066,6555,8108,7014,613,11237,6129,12862,False
2,757,8922,7710,9190,410,9910,1774,10485,3400,10104,False
3,13610,12341,3359,1542,6991,10861,4180,13521,6489,9319,False
4,4621,9020,2862,10826,1448,6161,6126,9623,11480,3854,False
...,...,...,...,...,...,...,...,...,...,...,...
174695,1997,8336,1885,7888,11123,8771,7641,831,2278,6282,False
174696,12062,12432,899,13220,4014,3802,1227,2164,448,9723,False
174697,8989,13241,13797,1,9186,9176,9177,9178,9180,9181,False
174698,6030,3100,13672,2373,3002,5513,1687,4976,12281,10287,False


In [27]:
expanded_df

Unnamed: 0,predict_0,predict_1,predict_2,predict_3,predict_4,predict_5,predict_6,predict_7,predict_8,predict_9,has_duplicates
0,3560,1959,4545,9534,6563,4420,5785,5466,6488,7461,False
1,11984,143,4066,6555,8108,7014,613,11237,6129,12862,False
2,757,8922,7710,9190,410,9910,1774,10485,3400,10104,False
3,13610,12341,3359,1542,6991,10861,4180,13521,6489,9319,False
4,4621,9020,2862,10826,1448,6161,6126,9623,11480,3854,False
...,...,...,...,...,...,...,...,...,...,...,...
174695,1997,8336,1885,7888,11123,8771,7641,831,2278,6282,False
174696,12062,12432,899,13220,4014,3802,1227,2164,448,9723,False
174697,8989,13241,13797,1,9186,9176,9177,9178,9180,9181,False
174698,6030,3100,13672,2373,3002,5513,1687,4976,12281,10287,False


In [26]:
sample_submission_df 

Unnamed: 0,predict_0,predict_1,predict_2,predict_3,predict_4,predict_5,predict_6,predict_7,predict_8,predict_9
0,12217,5055,4744,3410,11442,3711,8190,9998,13023,9297
1,12340,13720,1378,681,2020,11125,10129,4159,8184,7966
2,196,8757,6241,7376,13608,1693,13745,8065,13022,303
3,6448,3482,5687,2335,911,9008,8209,595,8171,10040
4,6327,1606,12898,3785,8212,910,10430,4260,9767,383
...,...,...,...,...,...,...,...,...,...,...
174695,10343,11384,5571,2011,3762,12692,6599,8738,8343,6440
174696,13609,2644,11510,5711,13054,9903,13750,8638,1892,5086
174697,11560,1862,11248,63,12446,9975,13567,7793,5306,9678
174698,7001,4154,11586,3307,13219,12310,5815,5103,510,1351


In [37]:
preds

[[3560, 1959, 4545, 9534, 3560, 1959, 6563, 4420, 5785, 5466]]