In [1]:
import numpy as np
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt
from itertools import combinations
from tqdm import tqdm
#from concurrent.futures import ThreadPoolExecutor, as_completed
from concurrent.futures import ProcessPoolExecutor, as_completed

In [2]:
train_log_df = pd.read_csv('../train_log.csv')
test_log_df = pd.read_csv('../test_log.csv')
train_label_df = pd.read_csv('../train_label.csv')
test_session_df = pd.read_csv('../test_session.csv')
yado_df = pd.read_csv('../yado.csv')
image_embeddings_df = pd.read_parquet('../image_embeddings.parquet')
sample_submission_df = pd.read_csv('../sample_submission.csv')

In [3]:
train_log_df

Unnamed: 0,session_id,seq_no,yad_no
0,000007603d533d30453cc45d0f3d119f,0,2395
1,0000ca043ed437a1472c9d1d154eb49b,0,13535
2,0000d4835cf113316fe447e2f80ba1c8,0,123
3,0000fcda1ae1b2f431e55a7075d1f500,0,8475
4,000104bdffaaad1a1e0a9ebacf585f33,0,96
...,...,...,...
419265,ffffcd5bc19d62cad5a3815c87818d83,0,12230
419266,ffffcd5bc19d62cad5a3815c87818d83,1,10619
419267,ffffcd5bc19d62cad5a3815c87818d83,2,12230
419268,fffffa7baf370083ebcdd98f26a7e31a,0,2439


In [4]:
pd.concat([train_log_df, test_log_df], axis=0, ignore_index=True)['yad_no'].value_counts()

12350    1606
719      1520
3338     1492
13468    1373
10095    1313
         ... 
9487        1
6437        1
9642        1
9976        1
9348        1
Name: yad_no, Length: 13562, dtype: int64

In [5]:
# trainとtestをconcatした共訪問行列の作成
def create_covisitation_matrix(combined_log_df):
    # セッションごとに訪問された宿の組み合わせを生成
    pairs = []
    for session_id, group in tqdm(combined_log_df.groupby('session_id'), desc="Processing sessions"):
        accommodations = group['yad_no'].tolist()
        for pair in combinations(accommodations, 2):
            pairs.append(pair)

    # 共訪問のカウント
    pairs_df = pd.DataFrame(pairs, columns=['yad_no1', 'yad_no2'])
    covisitation_counts = pairs_df.groupby(['yad_no1', 'yad_no2']).size().reset_index(name='count')

    # 行列の作成
    covisitation_matrix = covisitation_counts.pivot(index='yad_no1', columns='yad_no2', values='count').fillna(0)

    return covisitation_matrix

# train_log_df と test_log_df を結合
combined_log_df = pd.concat([train_log_df, test_log_df], axis=0, ignore_index=True)

# 共訪問行列の作成
covisitation_matrix = create_covisitation_matrix(combined_log_df)
covisitation_matrix

Processing sessions: 100%|██████████| 463398/463398 [00:28<00:00, 16339.71it/s]


yad_no2,1,2,3,4,5,7,9,10,12,15,...,13796,13797,13798,13799,13800,13801,13803,13804,13805,13806
yad_no1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,19.0,0.0,0.0,0.0,0.0,0.0
13801,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0
13803,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
13804,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0


In [6]:
# ステップ 1: 指定セッションの宿を抽出
def get_session_accommodations(session_id, log_df):
    # セッションIDに一致するレコードをフィルタリング
    session_records = log_df[log_df['session_id'] == session_id]
    # セッション内の宿のリストを取得
    yado_list = session_records['yad_no'].tolist()
    # 最後に訪問された宿を保持
    last_yado = yado_list[-1] if yado_list else None
    # 重複を排除（ただし最後に訪問された宿は保持）
    unique_yado = pd.Series(yado_list).drop_duplicates().tolist()

    return last_yado, unique_yado

# 例：特定のsession_idで出現した宿（重複なし）と、そのセッションの最後に現れた宿
last_yado, unique_session_yado = get_session_accommodations(session_id='ffffcd5bc19d62cad5a3815c87818d83', log_df=train_log_df)
print(last_yado)
print(unique_session_yado)

12230
[12230, 10619]


In [7]:
# ステップ 2: 訪問行列に基づく推薦
def get_recommendations_from_covisitation(yad_no, covisitation_matrix, exclude_list=None, num_recommendations=10):
    # 特定の宿に関連する宿の共訪問回数を取得
    related_accommodations = covisitation_matrix.loc[yad_no].sort_values(ascending=False)
    
    # 除外リストを考慮してトップの推薦を選出
    top_recommendations = []
    for accommodation in related_accommodations.index:
        if accommodation not in exclude_list:
            top_recommendations.append(accommodation)
            if len(top_recommendations) == num_recommendations:
                break
    
    return top_recommendations

# 例: 宿IDが333の宿に対する推薦を取得（covisitation_matrixは前のステップで作成されたものとする）
recommendations = get_recommendations_from_covisitation(333, covisitation_matrix, exclude_list=[])
print(recommendations)

[3370, 1, 9187, 9177, 9178, 9180, 9181, 9182, 9183, 9185]


In [8]:
# ステップ3：全体の人気順に基づく補完
def get_popular_accommodations(log_df, exclude_list, num_recommendations=10):
    # 各宿の訪問回数をカウント
    popularity_counts = log_df['yad_no'].value_counts()

    # 重複を除外
    popular_accommodations = popularity_counts.index.tolist()
    recommendations = [yad_no for yad_no in popular_accommodations if yad_no not in exclude_list]

    # トップの推薦を選出
    top_recommendations = recommendations[:num_recommendations]
    return top_recommendations

# 例: 除外リスト（既に選ばれた候補）に基づいて追加の推薦を取得
# train_log_df と test_log_df を結合
combined_log_df = pd.concat([train_log_df, test_log_df], axis=0, ignore_index=True)
# exclude_list = [100, 101, 102]  # 既に選ばれた宿のIDリスト
additional_recommendations = get_popular_accommodations(log_df = combined_log_df, exclude_list=[], num_recommendations=10)
print(additional_recommendations)

[12350, 719, 3338, 13468, 10095, 8567, 532, 8553, 2201, 915]


In [9]:
# train_log_df と test_log_df を結合
combined_log_df = pd.concat([train_log_df, test_log_df], axis=0, ignore_index=True)

# セッションIDごとに yad_no のリストを作成
session_yadno_map = test_log_df.groupby('session_id')['yad_no'].apply(list).to_dict()

def generate_recommendation(i):
    pred_recommendations = []

    session_id = test_session_df.loc[i, 'session_id']
    yado_no = session_yadno_map.get(session_id, [])
    last_yado = yado_no[-1] if yado_no else None
    yado_no = pd.Series(yado_no).drop_duplicates().tolist()
    
    pred_recommendations.extend(yado_no)
    if last_yado in pred_recommendations:
        pred_recommendations.remove(last_yado)
    
    for y in yado_no:
        try:
            yado_covisit = get_recommendations_from_covisitation(y, covisitation_matrix, exclude_list=pred_recommendations + [last_yado])
            pred_recommendations.extend(yado_covisit)
        except:
            pass
    
    additional_recommendations = get_popular_accommodations(combined_log_df, exclude_list=pred_recommendations + [last_yado])
    pred_recommendations.extend(additional_recommendations)
    
    return i, pred_recommendations[:10]


# マルチプロセス処理の実行
results = []
with ProcessPoolExecutor(max_workers=16) as executor:
    future_to_session = {executor.submit(generate_recommendation, i): i for i in range(len(test_session_df))}
    for future in tqdm(as_completed(future_to_session), total=len(test_session_df), desc="Processing recommendations"):
        index = future_to_session[future]
        result = future.result()
        results.append((index, result))

results.sort(key=lambda x: x[0])
preds = [result for i, result in results]

Processing recommendations: 100%|██████████| 174700/174700 [05:28<00:00, 531.40it/s]


In [10]:
submission_df = pd.DataFrame(preds)
submission_df
submission_df[1]

# 推薦リストを新しい列に展開する
expanded_df = submission_df[1].apply(pd.Series)
# 新しい列名を設定する
expanded_df.columns = [f'predict_{i}' for i in range(expanded_df.shape[1])]
# 最初の列にセッションIDを追加する
# expanded_df.insert(0, 'session_id', submission_df.index)
# 展開されたDataFrameを表示
expanded_df.head()

Unnamed: 0,predict_0,predict_1,predict_2,predict_3,predict_4,predict_5,predict_6,predict_7,predict_8,predict_9
0,3560,4545,9534,6563,4420,5785,5466,6488,7461,10233
1,143,4066,6555,8108,7014,613,11237,6129,12862,11923
2,757,7710,9190,410,9910,1774,10485,3400,10104,13570
3,12341,3359,1542,6991,10861,4180,13521,6489,9319,5657
4,9020,2862,10826,1448,6161,6126,9623,11480,3854,3476


In [11]:
# 12530の数を調べたい
# DataFrame全体で特定の数字を含む行の数を数える
count = (expanded_df == 12350).any(axis=1).sum()

print(f"12350を含む行の数: {count}")

12350を含む行の数: 6850


In [12]:
# 3338の数を調べたい
# DataFrame全体で特定の数字を含む行の数を数える
count = (expanded_df == 3338).any(axis=1).sum()

print(f"3338を含む行の数: {count}")

3338を含む行の数: 6934


In [13]:
# 各行内で重複をチェックする関数の定義
def check_row_duplicates(row):
    # setを用いてユニークな要素のみを抽出し、その長さが元の行の長さと等しいかを比較
    return len(row) != len(set(row))

# check_row_duplicates関数を各行に適用し、結果を新しい列に格納
expanded_df['has_duplicates'] = expanded_df.apply(check_row_duplicates, axis=1)

# 重複がある行のみをフィルタリング
rows_with_duplicates = expanded_df[expanded_df['has_duplicates']]

# 結果を表示
print(len(rows_with_duplicates))

0


In [14]:
expanded_df.head()

Unnamed: 0,predict_0,predict_1,predict_2,predict_3,predict_4,predict_5,predict_6,predict_7,predict_8,predict_9,has_duplicates
0,3560,4545,9534,6563,4420,5785,5466,6488,7461,10233,False
1,143,4066,6555,8108,7014,613,11237,6129,12862,11923,False
2,757,7710,9190,410,9910,1774,10485,3400,10104,13570,False
3,12341,3359,1542,6991,10861,4180,13521,6489,9319,5657,False
4,9020,2862,10826,1448,6161,6126,9623,11480,3854,3476,False


In [15]:
expanded_df.drop(columns=['has_duplicates']).to_csv('../submissions/candidate_ver4.csv', index=False)