In [1]:
import numpy as np
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt
from itertools import combinations
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor, as_completed

In [2]:
# Load every input table from the parent directory (six CSV files and one parquet file).
train_log_df = pd.read_csv('../train_log.csv')
test_log_df = pd.read_csv('../test_log.csv')
train_label_df = pd.read_csv('../train_label.csv')
test_session_df = pd.read_csv('../test_session.csv')
yado_df = pd.read_csv('../yado.csv')
image_embeddings_df = pd.read_parquet('../image_embeddings.parquet')
sample_submission_df = pd.read_csv('../sample_submission.csv')

In [3]:
train_log_df

Unnamed: 0,session_id,seq_no,yad_no
0,000007603d533d30453cc45d0f3d119f,0,2395
1,0000ca043ed437a1472c9d1d154eb49b,0,13535
2,0000d4835cf113316fe447e2f80ba1c8,0,123
3,0000fcda1ae1b2f431e55a7075d1f500,0,8475
4,000104bdffaaad1a1e0a9ebacf585f33,0,96
...,...,...,...
419265,ffffcd5bc19d62cad5a3815c87818d83,0,12230
419266,ffffcd5bc19d62cad5a3815c87818d83,1,10619
419267,ffffcd5bc19d62cad5a3815c87818d83,2,12230
419268,fffffa7baf370083ebcdd98f26a7e31a,0,2439


In [4]:
train_label_df

Unnamed: 0,session_id,yad_no
0,000007603d533d30453cc45d0f3d119f,4101
1,0000ca043ed437a1472c9d1d154eb49b,8253
2,0000d4835cf113316fe447e2f80ba1c8,4863
3,0000fcda1ae1b2f431e55a7075d1f500,1652
4,000104bdffaaad1a1e0a9ebacf585f33,96
...,...,...
288693,ffff2262d38abdeb247ebd591835dcc9,2259
288694,ffff2360540745117193ecadcdc06538,963
288695,ffff7fb4617164b2604aaf51c40bf82d,13719
288696,ffffcd5bc19d62cad5a3815c87818d83,10619


In [5]:
test_log_df

Unnamed: 0,session_id,seq_no,yad_no
0,00001149e9c73985425197104712478c,0,3560
1,00001149e9c73985425197104712478c,1,1959
2,0000e02747d749a52b7736dfa751e258,0,11984
3,0000f17ae2628237d78d3a38b009d3be,0,757
4,0000f17ae2628237d78d3a38b009d3be,1,8922
...,...,...,...
250300,fffee3199ef94b92283239cd5e3534fa,1,8336
250301,ffff62c6bb49bc9c0fbcf08494a4869c,0,12062
250302,ffff9a7dcc892875c7a8b821fa436228,0,8989
250303,ffffb1d30300fe17f661941fd085b04b,0,6030


In [6]:
# Count how often each yad_no occurs across the train log, the train labels and the test log combined.
pd.concat([train_log_df, train_label_df, test_log_df], axis=0, ignore_index=True)['yad_no'].value_counts()

12350    2139
3338     2068
10095    1918
719      1819
13468    1657
         ... 
8761        1
11678       1
5458        1
13390       1
5382        1
Name: yad_no, Length: 13806, dtype: int64

In [7]:
# Same count restricted to the two logs (train labels left out).
pd.concat([train_log_df, test_log_df], axis=0, ignore_index=True)['yad_no'].value_counts()

12350    1606
719      1520
3338     1492
13468    1373
10095    1313
         ... 
9487        1
6437        1
9642        1
9976        1
9348        1
Name: yad_no, Length: 13562, dtype: int64

In [8]:
import pandas as pd
from itertools import combinations, chain
from tqdm import tqdm

def create_extended_covisitation_matrix(combined_log_df):
    """Build a dense co-visitation count matrix from session logs.

    For every session (rows sharing a ``session_id``, taken in row order)
    two kinds of ordered pairs are counted:

    * every positional pair ``(earlier, later)`` produced by
      ``itertools.combinations`` -- repeat visits therefore also yield
      ``(x, x)`` self-pairs;
    * for every row of the session, one pair ``(yad_no, other)`` for each
      *distinct* other ``yad_no`` that occurs in the same session.

    Parameters
    ----------
    combined_log_df : pandas.DataFrame
        Needs the columns ``session_id`` and ``yad_no``.

    Returns
    -------
    pandas.DataFrame
        Pair counts pivoted to rows ``yad_no1`` x columns ``yad_no2``;
        pairs that never occur are filled with 0.

    Notes
    -----
    The returned frame is dense, so its size grows with the product of the
    number of distinct ``yad_no1`` and ``yad_no2`` values.
    """
    extended_pairs = []
    sessions = combined_log_df.groupby('session_id')['yad_no']
    for _, visited in tqdm(sessions, desc="Processing sessions"):
        accommodations = visited.tolist()

        # Positional pairs in visit order (includes (x, x) for repeat visits).
        extended_pairs.extend(combinations(accommodations, 2))

        # Per-row pairs with every distinct other accommodation of the session.
        # This yields exactly the pairs the previous implementation derived by
        # rescanning all direct pairs for each row, without the repeated scans.
        distinct = set(accommodations)
        for acc in accommodations:
            extended_pairs.extend((acc, other) for other in distinct if other != acc)

    # Count how often each ordered pair occurs.
    pairs_df = pd.DataFrame(extended_pairs, columns=['yad_no1', 'yad_no2'])
    covisitation_counts = pairs_df.groupby(['yad_no1', 'yad_no2']).size().reset_index(name='count')

    # Pivot the counts into a matrix.
    covisitation_matrix = covisitation_counts.pivot(index='yad_no1', columns='yad_no2', values='count').fillna(0)

    return covisitation_matrix


# Concatenate train_log_df, train_label_df and test_log_df into a single log.
# NOTE(review): the rows of train_label_df (the training targets) are part of this log, so the
# matrix built below has already seen every training label; scores computed later against
# train_label_df from candidates drawn out of this matrix are therefore optimistic.
combined_log_df = pd.concat([train_log_df, train_label_df, test_log_df], axis=0, ignore_index=True)
# Build the co-visitation matrix.
covisitation_matrix = create_extended_covisitation_matrix(combined_log_df)
covisitation_matrix

Processing sessions: 100%|██████████| 463398/463398 [00:30<00:00, 15005.65it/s]


yad_no2,1,2,3,4,5,6,7,8,9,10,...,13797,13798,13799,13800,13801,13802,13803,13804,13805,13806
yad_no1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,53.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13802,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13803,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0
13804,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,0.0,0.0
13805,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
"""
# trainとtestをconcatした共訪問行列の作成
def create_covisitation_matrix(combined_log_df):
    # セッションごとに訪問された宿の組み合わせを生成
    pairs = []
    for session_id, group in tqdm(combined_log_df.groupby('session_id'), desc="Processing sessions"):
        accommodations = group['yad_no'].tolist()
        for pair in combinations(accommodations, 2):
            pairs.append(pair)

    # 共訪問のカウント
    pairs_df = pd.DataFrame(pairs, columns=['yad_no1', 'yad_no2'])
    covisitation_counts = pairs_df.groupby(['yad_no1', 'yad_no2']).size().reset_index(name='count')

    # 行列の作成
    covisitation_matrix = covisitation_counts.pivot(index='yad_no1', columns='yad_no2', values='count').fillna(0)

    return covisitation_matrix

# train_log_df と train_label_dfとtest_log_df を結合
combined_log_df = pd.concat([train_log_df, train_label_df, test_log_df], axis=0, ignore_index=True)

# 共訪問行列の作成
covisitation_matrix = create_covisitation_matrix(combined_log_df)
covisitation_matrix
"""

Processing sessions: 100%|██████████| 463398/463398 [00:28<00:00, 16232.12it/s]


yad_no2,1,2,3,4,5,6,7,8,9,10,...,13797,13798,13799,13800,13801,13802,13803,13804,13805,13806
yad_no1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,53.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13801,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,53.0,0.0,0.0,0.0,0.0,0.0
13803,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0
13804,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,0.0,0.0
13805,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Step 1: extract the accommodations visited in a given session
def get_session_accommodations(session_id, log_df):
    """Return ``(last_yado, unique_yado)`` for one session of ``log_df``.

    ``last_yado`` is the ``yad_no`` of the session's last row (``None`` when
    the session has no rows). ``unique_yado`` lists the session's ``yad_no``
    values in first-seen order with repeats removed.
    """
    # yad_no values of the rows that belong to the requested session, in row order.
    visits = log_df.loc[log_df['session_id'] == session_id, 'yad_no'].tolist()

    # Remember the final visit before repeats are dropped.
    if visits:
        final_visit = visits[-1]
    else:
        final_visit = None

    # drop_duplicates keeps the first occurrence of each value.
    first_seen_order = pd.Series(visits).drop_duplicates().tolist()

    return final_visit, first_seen_order

# Example: the de-duplicated accommodations of one session_id and the accommodation seen last in it
last_yado, unique_session_yado = get_session_accommodations(session_id='ffffcd5bc19d62cad5a3815c87818d83', log_df=train_log_df)
print(last_yado)
print(unique_session_yado)

12230
[12230, 10619]


In [10]:
# Step 2: recommendations from the co-visitation matrix
def get_recommendations_from_covisitation(yad_no, covisitation_matrix, exclude_list=None, num_recommendations=50, min_count=0):
    """Return the accommodations most often co-visited with ``yad_no``.

    Parameters
    ----------
    yad_no
        Row label to look up in ``covisitation_matrix``.
    covisitation_matrix : pandas.DataFrame
        Co-visitation counts; row ``yad_no`` is ranked by descending count.
    exclude_list : iterable, optional
        Column labels that must not be returned. ``None`` (the default)
        excludes nothing.
    num_recommendations : int
        Maximum number of labels to return; values <= 0 return ``[]``.
    min_count : number
        When greater than 0, labels whose count is below this value are
        dropped. With the default of 0 no filtering happens, so labels with
        a count of 0 can pad the result once the co-visited ones run out.

    Returns
    -------
    list
        Column labels in descending count order, at most
        ``num_recommendations`` long.

    Raises
    ------
    KeyError
        If ``yad_no`` is not a row label of ``covisitation_matrix``.
    """
    # A set makes each exclusion check O(1); None means "exclude nothing".
    excluded = set(exclude_list) if exclude_list is not None else set()

    # Co-visit counts of the requested accommodation, highest first.
    related_accommodations = covisitation_matrix.loc[yad_no].sort_values(ascending=False)
    if min_count > 0:
        related_accommodations = related_accommodations[related_accommodations >= min_count]

    # Walk the ranking and collect labels until enough have been gathered.
    top_recommendations = []
    for accommodation in related_accommodations.index:
        if len(top_recommendations) >= num_recommendations:
            break
        if accommodation not in excluded:
            top_recommendations.append(accommodation)

    return top_recommendations

# Example: recommendations for yad_no 333 (covisitation_matrix is the one built in the earlier step)
recommendations = get_recommendations_from_covisitation(333, covisitation_matrix, exclude_list=[], num_recommendations=50)
print(recommendations)
print(len(recommendations))

[3370, 969, 5454, 4029, 6927, 333, 1, 9208, 9200, 9201, 9202, 9203, 9204, 9205, 9206, 9207, 9210, 9209, 9198, 9211, 9212, 9213, 9214, 9215, 9216, 9217, 9218, 9219, 9220, 9199, 9196, 9197, 9184, 9174, 9175, 9176, 9177, 9178, 9179, 9180, 9181, 9182, 9183, 9185, 9222, 9186, 9187, 9188, 9189, 9190]
50


In [11]:
# Step 3: fallback based on popularity within each session's most frequent sml_cd

# Concatenate train_log_df and test_log_df.
combined_log_df = pd.concat([train_log_df, test_log_df], axis=0, ignore_index=True)

# Attach the yado_df columns to every log row, joining on yad_no. The merge is computed once and
# shared by both aggregations below; `combined_logs_extended` stays available as a second name
# for the same frame because this cell has always defined both names.
combined_log_extended = pd.merge(combined_log_df, yado_df, on='yad_no', how='left')
combined_logs_extended = combined_log_extended

# Count the rows of each (session_id, sml_cd) combination.
sml_cd_count = combined_log_extended.groupby(['session_id', 'sml_cd']).size().reset_index(name='count')
# Per session_id keep the sml_cd with the highest count (first row after sorting by count, descending).
most_frequent_sml_cd = (
    sml_cd_count.sort_values(['session_id', 'count'], ascending=[True, False])
                .drop_duplicates('session_id')
)

# Count the rows of each (sml_cd, yad_no) combination and rank them inside every sml_cd.
yad_count = combined_log_extended.groupby(['sml_cd', 'yad_no']).size().reset_index(name='count')
yad_count_ranked = yad_count.sort_values(['sml_cd', 'count'], ascending=[True, False])

# Up to 100 most frequent accommodations per sml_cd (read by get_popular_accommodations_by_sml_cd).
popular_yads_by_sml_cd = yad_count_ranked.groupby('sml_cd').head(100)

# Number of popular accommodations kept per sml_cd for the per-session table below.
n = 50
top_yads_by_sml_cd = yad_count_ranked.groupby('sml_cd').head(n)
# Join each session's most frequent sml_cd with that sml_cd's top-n accommodations.
recommendations = pd.merge(most_frequent_sml_cd[['session_id', 'sml_cd']],
                           top_yads_by_sml_cd,
                           on='sml_cd',
                           how='left')
# Collect the popular accommodations of every session_id into one list.
recommendations_grouped = recommendations.groupby('session_id')['yad_no'].apply(list).reset_index()



def get_popular_accommodations_by_sml_cd(session_id, exclude_list=None, num_recommendations=30):
    """Return popular accommodations from the session's most frequent ``sml_cd``.

    Reads the module-level frames ``most_frequent_sml_cd`` (``session_id`` ->
    ``sml_cd``) and ``popular_yads_by_sml_cd`` (ranked ``yad_no`` per
    ``sml_cd``).

    Parameters
    ----------
    session_id
        Session to look up in ``most_frequent_sml_cd``.
    exclude_list : iterable, optional
        ``yad_no`` values that must not be returned. ``None`` (the default)
        excludes nothing.
    num_recommendations : int
        Slice bound applied to the filtered ranking.

    Returns
    -------
    list
        ``yad_no`` values in the order stored in ``popular_yads_by_sml_cd``;
        an empty list when the session has no row in ``most_frequent_sml_cd``.
    """
    # A set makes each exclusion check O(1); None means "exclude nothing".
    excluded = set(exclude_list) if exclude_list is not None else set()

    # sml_cd recorded for this session; without one there is nothing to recommend from.
    session_sml_cd = most_frequent_sml_cd.loc[most_frequent_sml_cd['session_id'] == session_id, 'sml_cd']
    if session_sml_cd.empty:
        return []
    sml_cd = session_sml_cd.values[0]

    # Ranked accommodations of that sml_cd.
    popular_accommodations = popular_yads_by_sml_cd.loc[
        popular_yads_by_sml_cd['sml_cd'] == sml_cd, 'yad_no'
    ].values.tolist()

    # Drop excluded accommodations, then keep the head of the ranking.
    recommendations = [yad_no for yad_no in popular_accommodations if yad_no not in excluded]
    return recommendations[:num_recommendations]


# Example
exclude_list = [11101, 11134, 8927, 4391, 7157]  # IDs of accommodations that were already selected
A = get_popular_accommodations_by_sml_cd(session_id='000007603d533d30453cc45d0f3d119f', exclude_list=exclude_list, num_recommendations=30)
print(A)
print(len(A))

[997, 5821, 12837, 12846, 1648, 13544, 7281, 2995, 3922, 1168, 2528, 2808, 2395, 4101, 11368, 7448, 11882, 11996, 2008, 9056, 10747, 11155, 7374, 10994, 13697, 3324, 12342, 2134, 5289, 3073]
30


In [12]:
recommendations_grouped

Unnamed: 0,session_id,yad_no
0,000007603d533d30453cc45d0f3d119f,"[11101, 11134, 8927, 4391, 7157, 997, 5821, 12..."
1,00001149e9c73985425197104712478c,"[11561, 9534, 1959, 2680, 5466, 6563, 10233, 9..."
2,0000ca043ed437a1472c9d1d154eb49b,"[3184, 10818, 8140, 12812, 13079, 8677, 10955,..."
3,0000d4835cf113316fe447e2f80ba1c8,"[918, 13642, 9039, 10170, 2957, 7257, 9854, 11..."
4,0000e02747d749a52b7736dfa751e258,"[143, 4066, 6129, 11984, 4825, 11237, 11923, 6..."
...,...,...
463393,ffff9a7dcc892875c7a8b821fa436228,"[11037, 6199, 2927, 12089, 12425, 12132, 12986..."
463394,ffffb1d30300fe17f661941fd085b04b,"[3100, 11496, 10287, 3002, 2305, 6378, 2692, 2..."
463395,ffffcd5bc19d62cad5a3815c87818d83,"[12230, 570, 11316, 12500, 7551, 3238, 12829, ..."
463396,ffffe984aafd6127ce8e43e3ca40c79d,"[12767, 12620, 8250, 8648, 7690, 867, 7017, 75..."


In [13]:
# Step 4: fallback based on overall popularity

# Concatenate train_log_df and test_log_df.
combined_log_df = pd.concat([train_log_df, test_log_df], axis=0, ignore_index=True)
# Count the log rows of each accommodation.
popularity_counts = combined_log_df['yad_no'].value_counts()
# yad_no values in the order value_counts returns them (most frequent first by default).
# No de-duplication happens here: the index of value_counts already holds each value once.
popular_accommodations = popularity_counts.index.tolist()

def get_popular_accommodations(exclude_list=None, num_recommendations=50):
    """Return the head of the global popularity ranking, skipping excluded IDs.

    Reads the module-level list ``popular_accommodations``.

    Parameters
    ----------
    exclude_list : iterable, optional
        ``yad_no`` values that must not be returned. ``None`` (the default)
        excludes nothing.
    num_recommendations : int
        Slice bound applied to the filtered ranking.

    Returns
    -------
    list
        ``yad_no`` values in ranking order.
    """
    # A set makes each exclusion check O(1); None means "exclude nothing".
    excluded = set(exclude_list) if exclude_list is not None else set()
    recommendations = [yad_no for yad_no in popular_accommodations if yad_no not in excluded]

    # Keep the head of the filtered ranking.
    return recommendations[:num_recommendations]

# Example: skip the five listed IDs and take the next 50 entries of the popularity ranking.
exclude_list = [12350, 719, 3338, 13468, 10095]
additional_recommendations = get_popular_accommodations(exclude_list=exclude_list, num_recommendations=50)
print(additional_recommendations)
print(len(additional_recommendations))

[8567, 532, 8553, 2201, 915, 3848, 11037, 12017, 385, 1818, 4913, 2797, 6199, 6470, 5116, 2927, 693, 10118, 9020, 9104, 8445, 11398, 12089, 13402, 10175, 10418, 10827, 3077, 5607, 755, 1091, 7093, 11499, 9248, 13017, 12425, 6407, 9830, 5754, 13292, 2087, 2445, 11407, 5445, 6178, 1390, 10350, 5135, 307, 11850]
50


### 学習用の候補テーブル作成

In [14]:
# Concatenate train_log_df and test_log_df (currently disabled)
# combined_log_df = pd.concat([train_log_df, test_log_df], axis=0, ignore_index=True)


# Build a dict that maps every session_id to the list of its yad_no values
session_yadno_map = train_log_df.groupby('session_id')['yad_no'].apply(list).to_dict() # switch between train and test here: train_log_df, test_log_df


def generate_recommendation(i):
    """Build the candidate list for row ``i`` of ``train_label_df``.

    Reads the module-level objects ``train_label_df``, ``session_yadno_map``
    and ``covisitation_matrix``. Candidates are, in order: the session's own
    accommodations (first-seen order, without the one viewed last), followed
    by co-visitation recommendations for each accommodation of the session.

    Parameters
    ----------
    i
        Row label of ``train_label_df``.

    Returns
    -------
    tuple
        ``(i, candidates)`` where ``candidates`` holds at most 10 values.
    """
    # Look up the session and the accommodations it visited.
    session_id = train_label_df.loc[i, 'session_id']  # switch between train and test here: train_label_df, test_session_df
    visited = session_yadno_map.get(session_id, [])
    last_yado = visited[-1] if visited else None
    # De-duplicate while keeping first-seen order (dicts preserve insertion order).
    yado_no = list(dict.fromkeys(visited))

    # Seed with the session's own accommodations, leaving out the one viewed last.
    pred_recommendations = [y for y in yado_no if y != last_yado]

    # Co-visitation recommendations for every accommodation of the session.
    for y in yado_no:
        try:
            yado_covisit = get_recommendations_from_covisitation(
                y,
                covisitation_matrix,
                exclude_list=pred_recommendations + [last_yado],
                num_recommendations=30,
            )
        except KeyError:
            # y has no row in covisitation_matrix; skip it. Any other error is
            # a real failure and is no longer swallowed.
            continue
        pred_recommendations.extend(yado_covisit)

    # Fallback by popularity within the session's most frequent sml_cd (disabled; 30 items by default)
    # sml_cd_recommendations = get_popular_accommodations_by_sml_cd(session_id=session_id, exclude_list=pred_recommendations + [last_yado], num_recommendations=30)
    # pred_recommendations.extend(sml_cd_recommendations)

    # Fallback by overall popularity (disabled; 50 items by default)
    # additional_recommendations = get_popular_accommodations(exclude_list=pred_recommendations + [last_yado], num_recommendations=50)
    # pred_recommendations.extend(additional_recommendations)

    return i, pred_recommendations[:10]


# Run the candidate generation in a process pool
results = []
with ProcessPoolExecutor(max_workers=16) as executor: # adjust max_workers to the number of CPU threads available
    future_to_session = {executor.submit(generate_recommendation, i): i for i in range(len(train_label_df))}
    for future in tqdm(as_completed(future_to_session), total=len(train_label_df), desc="Processing recommendations"):
        index = future_to_session[future]
        result = future.result()
        # result is already the (i, candidates) tuple, so each stored entry is (index, (i, candidates)).
        results.append((index, result))

# as_completed yields futures in completion order; sort to restore row order.
results.sort(key=lambda x: x[0])
# Every element of preds is the (i, candidates) tuple returned by generate_recommendation.
preds = [result for i, result in results]

Processing recommendations: 100%|██████████| 288698/288698 [01:27<00:00, 3315.11it/s]


In [15]:
# preds holds (row_index, candidates) tuples, so column 0 is the row index and column 1 the candidate list.
submission_df = pd.DataFrame(preds)

# Expand every candidate list into one column per rank. Building the frame straight from the
# list of lists avoids constructing a pandas Series for every row.
result_df = pd.DataFrame(submission_df[1].tolist(), index=submission_df.index)
# Name the columns predict_0 ... predict_{k-1}.
result_df.columns = [f'predict_{i}' for i in range(result_df.shape[1])]
# Insert the session ID as the first column (currently disabled)
# result_df.insert(0, 'session_id', submission_df.index)
# Display the expanded frame.
result_df

Unnamed: 0,predict_0,predict_1,predict_2,predict_3,predict_4,predict_5,predict_6,predict_7,predict_8,predict_9
0,11882,2808,5289,4101,3324,12846,997,9207,9209,9208
1,8253,8747,2570,1586,11104,3725,4488,2259,3564,8225
2,9039,6722,7509,4355,4863,11724,5238,13642,1967,2957
3,626,755,11715,7812,109,2272,13296,1341,13549,7872
4,96,902,12491,5490,1284,11380,3894,7749,12338,4072
...,...,...,...,...,...,...,...,...,...,...
288693,13210,13079,2876,3725,8677,13717,1586,10955,10522,5719
288694,8703,3940,399,4767,2900,6654,5299,8465,963,513
288695,7308,12240,4040,7820,4398,2087,9558,3566,844,10364
288696,10619,570,12500,11091,7551,3238,10616,12781,12829,11316


In [16]:
# True yad_no reshaped to one column per row, and the candidate table as a plain array.
y_true = train_label_df['yad_no'].values.reshape(-1, 1)
y_pred = result_df.values

In [17]:
# Hit rate of the candidates: the share of rows whose true yad_no appears among that row's candidates.

# y_true has one column and y_pred one column per candidate, so the comparison broadcasts
# row by row; reducing across the columns gives one 0/1 flag per row.
matches = (y_pred == y_true).any(axis=1).astype(int).tolist()

# Print only a short preview; printing every flag floods the notebook output.
print(matches[:50])

# Share of rows with a hit.
one_ratio = sum(matches) / len(matches)

# Print the ratio.
print("Recall is: {:.2%}".format(one_ratio))

[1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 

In [18]:
def apk(actual, predicted, k=10):
    """
    Average precision at k for a single query.

    Only the first k entries of `predicted` are scored. An entry counts as a
    hit when it occurs in `actual` and has not appeared earlier in the scored
    list. Returns 0.0 when `actual` is empty; otherwise the sum of the
    precision values at each hit divided by min(len(actual), k).
    """
    top_k = predicted[:k]

    precision_sum = 0.0
    hits = 0
    for rank, candidate in enumerate(top_k, start=1):
        if candidate not in actual:
            continue
        # A repeated prediction is only credited the first time it shows up.
        if candidate in top_k[:rank - 1]:
            continue
        hits += 1
        precision_sum += hits / rank

    if not actual:
        return 0.0

    return precision_sum / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Mean average precision at k.

    `actual` and `predicted` are parallel sequences of item lists; each pair
    is scored with `apk` and the scores are averaged with `np.mean`.
    """
    per_query_scores = []
    for truth, ranking in zip(actual, predicted):
        per_query_scores.append(apk(truth, ranking, k))
    return np.mean(per_query_scores)

# Convert y_true and y_pred into lists of lists
y_true_list = [list(x) for x in y_true]
y_pred_list = [list(x) for x in y_pred]

# Compute MAP@10
map_at_10 = mapk(y_true_list, y_pred_list, k=10)
print(f'Map@10  {map_at_10}')

Map@10  0.4224232506600241


In [19]:
# Check whether a single row contains repeated values
def check_row_duplicates(row):
    """Return True when `row` holds at least one repeated value, False otherwise."""
    # A set keeps one copy of every value, so any repeat makes it smaller than the row.
    distinct_count = len(set(row))
    total_count = len(row)
    return total_count != distinct_count
# Apply check_row_duplicates to every row and store the result in a new column. Only the
# predict_* columns are inspected, so re-running this cell (when has_duplicates already exists)
# does not feed the flag itself into the check.
predict_columns = [col for col in result_df.columns if str(col).startswith('predict_')]
result_df['has_duplicates'] = result_df[predict_columns].apply(check_row_duplicates, axis=1)
# Keep only the rows that contain repeated candidates.
rows_with_duplicates = result_df[result_df['has_duplicates']]
# Show how many such rows exist.
print(len(rows_with_duplicates))

0


In [20]:
# Check for missing values: total number of null cells in result_df
result_df.isnull().sum(axis=1).sum()

0

In [21]:
result_df.head()

Unnamed: 0,predict_0,predict_1,predict_2,predict_3,predict_4,predict_5,predict_6,predict_7,predict_8,predict_9,has_duplicates
0,11882,2808,5289,4101,3324,12846,997,9207,9209,9208,False
1,8253,8747,2570,1586,11104,3725,4488,2259,3564,8225,False
2,9039,6722,7509,4355,4863,11724,5238,13642,1967,2957,False
3,626,755,11715,7812,109,2272,13296,1341,13549,7872,False
4,96,902,12491,5490,1284,11380,3894,7749,12338,4072,False


In [30]:
# result_df.drop(columns=['has_duplicates']).to_csv('../data/candidate_ver8_train.csv', index=False)

### 推論用の候補テーブル作成

In [22]:
# Concatenate train_log_df and test_log_df (currently disabled)
# combined_log_df = pd.concat([train_log_df, test_log_df], axis=0, ignore_index=True)



# Build a dict that maps every session_id to the list of its yad_no values
session_yadno_map = test_log_df.groupby('session_id')['yad_no'].apply(list).to_dict() # switch between train and test here: train_log_df, test_log_df


def generate_recommendation(i):
    """Build the candidate list for row ``i`` of ``test_session_df``.

    Reads the module-level objects ``test_session_df``, ``session_yadno_map``
    and ``covisitation_matrix``. Candidates are, in order: the session's own
    accommodations (first-seen order, without the one viewed last),
    co-visitation recommendations for each accommodation of the session,
    popular accommodations of the session's most frequent sml_cd, and finally
    globally popular accommodations.

    Parameters
    ----------
    i
        Row label of ``test_session_df``.

    Returns
    -------
    tuple
        ``(i, candidates)`` where ``candidates`` holds at most 10 values.
    """
    # Look up the session and the accommodations it visited.
    session_id = test_session_df.loc[i, 'session_id']  # switch between train and test here: train_label_df, test_session_df
    visited = session_yadno_map.get(session_id, [])
    last_yado = visited[-1] if visited else None
    # De-duplicate while keeping first-seen order (dicts preserve insertion order).
    yado_no = list(dict.fromkeys(visited))

    # Seed with the session's own accommodations, leaving out the one viewed last.
    pred_recommendations = [y for y in yado_no if y != last_yado]

    # Co-visitation recommendations for every accommodation of the session.
    for y in yado_no:
        try:
            yado_covisit = get_recommendations_from_covisitation(
                y,
                covisitation_matrix,
                exclude_list=pred_recommendations + [last_yado],
                num_recommendations=30,
            )
        except KeyError:
            # y has no row in covisitation_matrix; skip it. Any other error is
            # a real failure and is no longer swallowed.
            continue
        pred_recommendations.extend(yado_covisit)

    # Fallback by popularity within the session's most frequent sml_cd
    # (without a fallback, rows can end up with missing candidates).
    sml_cd_recommendations = get_popular_accommodations_by_sml_cd(session_id=session_id, exclude_list=pred_recommendations + [last_yado], num_recommendations=30)
    pred_recommendations.extend(sml_cd_recommendations)

    # Fallback by overall popularity (same reason).
    # TODO: falling back to the popularity of the next larger area may work better.
    additional_recommendations = get_popular_accommodations(exclude_list=pred_recommendations + [last_yado], num_recommendations=30)
    pred_recommendations.extend(additional_recommendations)

    return i, pred_recommendations[:10]


# Run the candidate generation in a process pool
results = []
with ProcessPoolExecutor(max_workers=16) as executor:
    future_to_session = {executor.submit(generate_recommendation, i): i for i in range(len(test_session_df))} # switch between train and test here: train_label_df, test_session_df
    for future in tqdm(as_completed(future_to_session), total=len(test_session_df), desc="Processing recommendations"): # switch between train and test here: train_label_df, test_session_df
        index = future_to_session[future]
        result = future.result()
        # result is already the (i, candidates) tuple, so each stored entry is (index, (i, candidates)).
        results.append((index, result))

# as_completed yields futures in completion order; sort to restore row order.
results.sort(key=lambda x: x[0])
# Every element of preds is the (i, candidates) tuple returned by generate_recommendation.
preds = [result for i, result in results]

Processing recommendations: 100%|██████████| 174700/174700 [16:28<00:00, 176.77it/s]


In [23]:
# preds holds (row_index, candidates) tuples, so column 0 is the row index and column 1 the candidate list.
submission_df = pd.DataFrame(preds)

# Expand every candidate list into one column per rank. Building the frame straight from the
# list of lists avoids constructing a pandas Series for every row.
test_expanded_df = pd.DataFrame(submission_df[1].tolist(), index=submission_df.index)
# Name the columns predict_0 ... predict_{k-1}.
test_expanded_df.columns = [f'predict_{i}' for i in range(test_expanded_df.shape[1])]
# Insert the session ID as the first column (currently disabled)
# expanded_df.insert(0, 'session_id', submission_df.index)
# Display the expanded frame.
test_expanded_df

Unnamed: 0,predict_0,predict_1,predict_2,predict_3,predict_4,predict_5,predict_6,predict_7,predict_8,predict_9
0,3560,4545,9534,11561,5785,6563,4420,2680,4714,6488
1,143,4066,6555,7014,11923,8108,6129,613,11237,12862
2,757,9190,410,7710,9910,1774,3400,10485,6721,10104
3,12341,3359,6991,1542,13521,5080,10861,4180,10746,6489
4,9020,2862,10826,12029,3854,3476,9623,3844,5372,6161
...,...,...,...,...,...,...,...,...,...,...
174695,1997,7888,1885,11123,8771,7641,831,5744,10997,9543
174696,13220,12432,899,4014,3802,1227,3644,2164,13702,9723
174697,13241,13797,13719,12939,7308,2087,8143,11796,11037,5810
174698,3100,10287,3002,2373,12281,4976,13672,2692,5513,1687


In [24]:
# 各行内で重複があるかどうかをチェック

def check_row_duplicates(row):
    """Return True when `row` holds at least one repeated value, False otherwise."""
    # A set keeps one copy of every value, so any repeat makes it smaller than the row.
    distinct_count = len(set(row))
    total_count = len(row)
    return total_count != distinct_count
# Apply check_row_duplicates to every row and store the result in a new column. Only the
# predict_* columns are inspected, so re-running this cell (when has_duplicates already exists)
# does not feed the flag itself into the check.
predict_columns = [col for col in test_expanded_df.columns if str(col).startswith('predict_')]
test_expanded_df['has_duplicates'] = test_expanded_df[predict_columns].apply(check_row_duplicates, axis=1)
# Keep only the rows that contain repeated candidates.
rows_with_duplicates = test_expanded_df[test_expanded_df['has_duplicates']]
# Show how many such rows exist.
print(len(rows_with_duplicates))

0


In [25]:
# Check for missing values: total number of null cells in test_expanded_df
test_expanded_df.isnull().sum(axis=1).sum()

0

In [27]:
test_expanded_df

Unnamed: 0,predict_0,predict_1,predict_2,predict_3,predict_4,predict_5,predict_6,predict_7,predict_8,predict_9,has_duplicates
0,3560,4545,9534,11561,5785,6563,4420,2680,4714,6488,False
1,143,4066,6555,7014,11923,8108,6129,613,11237,12862,False
2,757,9190,410,7710,9910,1774,3400,10485,6721,10104,False
3,12341,3359,6991,1542,13521,5080,10861,4180,10746,6489,False
4,9020,2862,10826,12029,3854,3476,9623,3844,5372,6161,False
...,...,...,...,...,...,...,...,...,...,...,...
174695,1997,7888,1885,11123,8771,7641,831,5744,10997,9543,False
174696,13220,12432,899,4014,3802,1227,3644,2164,13702,9723,False
174697,13241,13797,13719,12939,7308,2087,8143,11796,11037,5810,False
174698,3100,10287,3002,2373,12281,4976,13672,2692,5513,1687,False


In [28]:
# Write the candidate columns (has_duplicates dropped) to the CSV file, without the index.
test_expanded_df.drop(columns=['has_duplicates']).to_csv('../submissions/candidate_ver11.csv', index=False)

In [29]:
# test_expanded_df.drop(columns=['has_duplicates']).to_csv('../data/candidate_ver8_test.csv', index=False)