In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import combinations
from tqdm import tqdm

In [2]:
train_log_df = pd.read_csv('../train_log.csv')
test_log_df = pd.read_csv('../test_log.csv')
train_label_df = pd.read_csv('../train_label.csv')
test_session_df = pd.read_csv('../test_session.csv')
yado_df = pd.read_csv('../yado.csv')
image_embeddings_df = pd.read_parquet('../image_embeddings.parquet')
sample_submission_df = pd.read_csv('../sample_submission.csv')

In [3]:
train_log_df

Unnamed: 0,session_id,seq_no,yad_no
0,000007603d533d30453cc45d0f3d119f,0,2395
1,0000ca043ed437a1472c9d1d154eb49b,0,13535
2,0000d4835cf113316fe447e2f80ba1c8,0,123
3,0000fcda1ae1b2f431e55a7075d1f500,0,8475
4,000104bdffaaad1a1e0a9ebacf585f33,0,96
...,...,...,...
419265,ffffcd5bc19d62cad5a3815c87818d83,0,12230
419266,ffffcd5bc19d62cad5a3815c87818d83,1,10619
419267,ffffcd5bc19d62cad5a3815c87818d83,2,12230
419268,fffffa7baf370083ebcdd98f26a7e31a,0,2439


In [7]:
# trainとtestをconcatした共訪問行列の作成
def create_covisitation_matrix(combined_log_df):
    # セッションごとに訪問された宿の組み合わせを生成
    pairs = []
    for session_id, group in tqdm(combined_log_df.groupby('session_id'), desc="Processing sessions"):
        accommodations = group['yad_no'].tolist()
        for pair in combinations(accommodations, 2):
            pairs.append(pair)

    # 共訪問のカウント
    pairs_df = pd.DataFrame(pairs, columns=['yad_no1', 'yad_no2'])
    covisitation_counts = pairs_df.groupby(['yad_no1', 'yad_no2']).size().reset_index(name='count')

    # 行列の作成
    covisitation_matrix = covisitation_counts.pivot(index='yad_no1', columns='yad_no2', values='count').fillna(0)

    return covisitation_matrix

# train_log_df と test_log_df を結合
combined_log_df = pd.concat([train_log_df, test_log_df], axis=0, ignore_index=True)

# 共訪問行列の作成
covisitation_matrix = create_covisitation_matrix(combined_log_df)
covisitation_matrix

Processing sessions: 100%|██████████| 463398/463398 [00:27<00:00, 16877.19it/s]


yad_no2,1,2,3,4,5,7,9,10,12,15,...,13796,13797,13798,13799,13800,13801,13803,13804,13805,13806
yad_no1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,19.0,0.0,0.0,0.0,0.0,0.0
13801,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0
13803,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
13804,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0


In [4]:
import cudf
from cudf.core.dataframe import DataFrame as cuDataFrame
from tqdm import tqdm
from itertools import combinations


In [5]:
import cudf

# cuDFを使ってCSVファイルを読み込む
train_log_df = cudf.read_csv('../train_log.csv')
test_log_df = cudf.read_csv('../test_log.csv')
train_label_df = cudf.read_csv('../train_label.csv')
test_session_df = cudf.read_csv('../test_session.csv')
yado_df = cudf.read_csv('../yado.csv')

# cuDFを使ってParquetファイルを読み込む
image_embeddings_df = cudf.read_parquet('../image_embeddings.parquet')

# サンプル提出用のDataFrameもcuDFで読み込む
sample_submission_df = cudf.read_csv('../sample_submission.csv')

In [6]:
import cudf
from itertools import combinations
from tqdm import tqdm


# train_log_dfとtest_log_dfを結合
combined_log_df = cudf.concat([train_log_df, test_log_df], axis=0, ignore_index=True)

# trainとtestをconcatした共訪問行列の作成
def create_covisitation_matrix(combined_log_df):
    # セッションごとに訪問された宿の組み合わせを生成
    pairs = []
    for session_id, group in tqdm(combined_log_df.to_pandas().groupby('session_id'), desc="Processing sessions"):
        accommodations = group['yad_no'].tolist()
        for pair in combinations(accommodations, 2):
            pairs.append(pair)

    # 共訪問のカウント
    pairs_df = cudf.DataFrame(pairs, columns=['yad_no1', 'yad_no2'])
    covisitation_counts = pairs_df.groupby(['yad_no1', 'yad_no2']).size().to_frame(name='count').reset_index()

    # ピボット操作はcuDFでは直接サポートされていないため、Pandasに戻して操作
    covisitation_matrix = covisitation_counts.to_pandas().pivot(index='yad_no1', columns='yad_no2', values='count').fillna(0)

    # 最終的な行列をGPU上のDataFrameに変換
    covisitation_matrix = cudf.from_pandas(covisitation_matrix)

    return covisitation_matrix

# 共訪問行列の作成
covisitation_matrix = create_covisitation_matrix(combined_log_df)
print(covisitation_matrix)


Processing sessions: 100%|██████████| 463398/463398 [00:27<00:00, 16622.76it/s]


         1      2      3      4      5      7      9      10     12     15     \
yad_no1                                                                         
1          0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
2          0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
3          0.0    0.0   19.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
4          0.0    0.0    0.0    4.0    0.0    0.0    0.0    0.0    0.0    0.0   
5          0.0    0.0    0.0    0.0    3.0    0.0    0.0    0.0    0.0    0.0   
...        ...    ...    ...    ...    ...    ...    ...    ...    ...    ...   
13800      0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
13801      0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
13803      0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
13804      0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
13806      0.0    0.0    0.0

In [7]:
import cudf

# ステップ 1: 指定セッションの宿を抽出
def get_session_accommodations(session_id, train_log_gdf):
    # セッションIDに一致するレコードをフィルタリング
    session_records = train_log_gdf[train_log_gdf['session_id'] == session_id]

    # セッション内の宿のリストを取得
    yado_list = session_records['yad_no'].to_array()

    # 最後に訪問された宿を保持
    last_yado = yado_list[-1] if len(yado_list) > 0 else None

    # 重複を排除（ただし最後に訪問された宿は保持）
    unique_yado = cudf.Series(yado_list).drop_duplicates().to_pandas().tolist()

    return last_yado, unique_yado

# 例：特定のsession_idで出現した宿（重複なし）と、そのセッションの最後に現れた宿
last_yado, unique_session_yado = get_session_accommodations(session_id='ffffcd5bc19d62cad5a3815c87818d83', train_log_gdf=train_log_df)
print(last_yado)
print(unique_session_yado)


12230
[10619, 12230]


In [8]:
# ステップ 2: 共訪問行列に基づく推薦
def get_recommendations_from_covisitation(yad_no, covisitation_matrix, exclude_list=None, num_recommendations=10):
    # 特定の宿に関連する宿の共訪問回数を取得
    related_accommodations = covisitation_matrix.loc[yad_no].sort_values(ascending=False)

    # cuDF DataFrameをPandas DataFrameに変換
    related_accommodations = related_accommodations.to_pandas()

    # 除外リストを考慮してトップの推薦を選出
    top_recommendations = []
    for accommodation in related_accommodations.index:
        if accommodation not in exclude_list:
            top_recommendations.append(accommodation)
            if len(top_recommendations) == num_recommendations:
                break
    
    return top_recommendations

# 例: 宿IDが3338の宿に対する推薦を取得（covisitation_matrixは前のステップで作成されたものとする）
recommendations = get_recommendations_from_covisitation(3338, covisitation_matrix, exclude_list=[])
print(recommendations)

[12350, 3338, 915, 13468, 532, 8553, 3848, 10118, 4913, 1091]


In [9]:
# ステップ3：全体の人気順に基づく補完
def get_popular_accommodations(train_log_gdf, exclude_list, num_recommendations=10):
    # 各宿の訪問回数をカウント
    popularity_counts = train_log_gdf['yad_no'].value_counts()

    # cuDFでは直接tolist()が使えないため、to_pandas()で変換
    popular_accommodations = popularity_counts.index.to_pandas().tolist()
    
    # 重複を除外
    recommendations = [yad_no for yad_no in popular_accommodations if yad_no not in exclude_list]

    # トップの推薦を選出
    top_recommendations = recommendations[:num_recommendations]
    return top_recommendations

# 例: 除外リスト（既に選ばれた候補）に基づいて追加の推薦を取得
combined_log_gdf = cudf.concat([train_log_df, test_log_df], axis=0, ignore_index=True)  # cuDFで結合
exclude_list = []  # 既に選ばれた宿のIDリスト
additional_recommendations = get_popular_accommodations(combined_log_gdf, exclude_list)
print(additional_recommendations)

[12350, 719, 3338, 13468, 10095, 8567, 532, 8553, 2201, 915]


In [10]:
import cudf
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

# 以前に定義した関数などをここに含める
# ...

# 推薦生成関数
def generate_recommendation(i, test_session_gdf, test_log_gdf, covisitation_matrix, train_log_gdf):
    pred_recommendations = []
    
    session_id = test_session_gdf.loc[i, 'session_id']
    yado_no = test_log_gdf[test_log_gdf['session_id'] == session_id]['yad_no'].to_pandas().tolist()
    last_yado = yado_no[-1] if yado_no else None
    yado_no = cudf.Series(yado_no).drop_duplicates().to_pandas().tolist()
    
    pred_recommendations.extend(yado_no)
    pred_recommendations.remove(last_yado) if last_yado in pred_recommendations else None
    
    for y in yado_no:
        try:
            yado_covisit = get_recommendations_from_covisitation(y, covisitation_matrix, exclude_list=pred_recommendations + [last_yado])
            pred_recommendations.extend(yado_covisit)
        except:
            pass
    
    additional_recommendations = get_popular_accommodations(train_log_gdf, exclude_list=pred_recommendations + [last_yado])
    pred_recommendations.extend(additional_recommendations)
    
    return i, pred_recommendations[:10]

# 推薦の生成
results = []
test_session_gdf = test_session_df
test_log_gdf = test_log_df
train_log_gdf = train_log_df

# 全てのセッションIDに対してループ処理
for i in tqdm(range(len(test_session_gdf)), desc="Processing recommendations"):
    index, result = generate_recommendation(i, test_session_gdf, test_log_gdf, covisitation_matrix, train_log_gdf)
    results.append((index, result))

# 結果の整理
results.sort(key=lambda x: x[0])
preds = [result for i, result in results]


Processing recommendations:   1%|          | 1155/174700 [31:55<86:21:10,  1.79s/it] 