In [1]:
import numpy as np
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt
from itertools import combinations
from tqdm import tqdm
#from concurrent.futures import ThreadPoolExecutor, as_completed
from concurrent.futures import ProcessPoolExecutor, as_completed

In [2]:
train_log_df = pd.read_csv('../train_log.csv')
test_log_df = pd.read_csv('../test_log.csv')
train_label_df = pd.read_csv('../train_label.csv')
test_session_df = pd.read_csv('../test_session.csv')
yado_df = pd.read_csv('../yado.csv')
image_embeddings_df = pd.read_parquet('../image_embeddings.parquet')
sample_submission_df = pd.read_csv('../sample_submission.csv')

In [3]:
train_log_df

Unnamed: 0,session_id,seq_no,yad_no
0,000007603d533d30453cc45d0f3d119f,0,2395
1,0000ca043ed437a1472c9d1d154eb49b,0,13535
2,0000d4835cf113316fe447e2f80ba1c8,0,123
3,0000fcda1ae1b2f431e55a7075d1f500,0,8475
4,000104bdffaaad1a1e0a9ebacf585f33,0,96
...,...,...,...
419265,ffffcd5bc19d62cad5a3815c87818d83,0,12230
419266,ffffcd5bc19d62cad5a3815c87818d83,1,10619
419267,ffffcd5bc19d62cad5a3815c87818d83,2,12230
419268,fffffa7baf370083ebcdd98f26a7e31a,0,2439


In [4]:
pd.concat([train_log_df, test_log_df], axis=0, ignore_index=True)['yad_no'].value_counts()

12350    1606
719      1520
3338     1492
13468    1373
10095    1313
         ... 
9487        1
6437        1
9642        1
9976        1
9348        1
Name: yad_no, Length: 13562, dtype: int64

In [5]:
# trainとtestをconcatした共訪問行列の作成
def create_covisitation_matrix(combined_log_df):
    # セッションごとに訪問された宿の組み合わせを生成
    pairs = []
    for session_id, group in tqdm(combined_log_df.groupby('session_id'), desc="Processing sessions"):
        accommodations = group['yad_no'].tolist()
        for pair in combinations(accommodations, 2):
            pairs.append(pair)

    # 共訪問のカウント
    pairs_df = pd.DataFrame(pairs, columns=['yad_no1', 'yad_no2'])
    covisitation_counts = pairs_df.groupby(['yad_no1', 'yad_no2']).size().reset_index(name='count')

    # 行列の作成
    covisitation_matrix = covisitation_counts.pivot(index='yad_no1', columns='yad_no2', values='count').fillna(0)

    return covisitation_matrix

# train_log_df と test_log_df を結合
combined_log_df = pd.concat([train_log_df, test_log_df], axis=0, ignore_index=True)

# 共訪問行列の作成
covisitation_matrix = create_covisitation_matrix(combined_log_df)
covisitation_matrix

Processing sessions: 100%|██████████| 463398/463398 [00:27<00:00, 16632.82it/s]


yad_no2,1,2,3,4,5,7,9,10,12,15,...,13796,13797,13798,13799,13800,13801,13803,13804,13805,13806
yad_no1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,19.0,0.0,0.0,0.0,0.0,0.0
13801,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0
13803,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
13804,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0


In [6]:
# ステップ 1: 指定セッションの宿を抽出
def get_session_accommodations(session_id, log_df):
    # セッションIDに一致するレコードをフィルタリング
    session_records = log_df[log_df['session_id'] == session_id]
    # セッション内の宿のリストを取得
    yado_list = session_records['yad_no'].tolist()
    # 最後に訪問された宿を保持
    last_yado = yado_list[-1] if yado_list else None
    # 重複を排除（ただし最後に訪問された宿は保持）
    unique_yado = pd.Series(yado_list).drop_duplicates().tolist()

    return last_yado, unique_yado

# 例：特定のsession_idで出現した宿（重複なし）と、そのセッションの最後に現れた宿
last_yado, unique_session_yado = get_session_accommodations(session_id='ffffcd5bc19d62cad5a3815c87818d83', log_df=train_log_df)
print(last_yado)
print(unique_session_yado)

12230
[12230, 10619]


In [7]:
# ステップ 2: 訪問行列に基づく推薦
def get_recommendations_from_covisitation(yad_no, covisitation_matrix, exclude_list=None, num_recommendations=100):
    # 特定の宿に関連する宿の共訪問回数を取得
    related_accommodations = covisitation_matrix.loc[yad_no].sort_values(ascending=False)
    
    # 除外リストを考慮してトップの推薦を選出
    top_recommendations = []
    for accommodation in related_accommodations.index:
        if accommodation not in exclude_list:
            top_recommendations.append(accommodation)
            if len(top_recommendations) == num_recommendations:
                break
    
    return top_recommendations

# 例: 宿IDが333の宿に対する推薦を取得（covisitation_matrixは前のステップで作成されたものとする）
recommendations = get_recommendations_from_covisitation(333, covisitation_matrix, exclude_list=[])
print(recommendations)
print(len(recommendations))

[3370, 1, 9187, 9177, 9178, 9180, 9181, 9182, 9183, 9185, 9186, 9189, 9175, 9190, 9191, 9193, 9194, 9195, 9196, 9197, 9198, 9176, 9174, 9200, 9173, 9152, 9153, 9154, 9155, 9156, 9157, 9158, 9160, 9161, 9162, 9163, 9164, 9166, 9167, 9168, 9169, 9170, 9171, 9172, 9199, 9201, 9150, 9227, 9229, 9230, 9231, 9232, 9233, 9234, 9235, 9236, 9237, 9238, 9239, 9240, 9241, 9242, 9243, 9244, 9245, 9247, 9248, 9228, 9226, 9202, 9225, 9203, 9204, 9207, 9208, 9209, 9210, 9211, 9212, 9213, 9214, 9215, 9216, 9218, 9219, 9220, 9221, 9222, 9223, 9224, 9151, 9149, 9251, 9100, 9072, 9073, 9074, 9075, 9076, 9077]
100


In [8]:
# ステップ3：全体の人気順に基づく補完
def get_popular_accommodations(log_df, exclude_list, num_recommendations=100):
    # 各宿の訪問回数をカウント
    popularity_counts = log_df['yad_no'].value_counts()

    # 重複を除外
    popular_accommodations = popularity_counts.index.tolist()
    recommendations = [yad_no for yad_no in popular_accommodations if yad_no not in exclude_list]

    # トップの推薦を選出
    top_recommendations = recommendations[:num_recommendations]
    return top_recommendations

# 例: 除外リスト（既に選ばれた候補）に基づいて追加の推薦を取得

# train_log_df と test_log_df を結合
combined_log_df = pd.concat([train_log_df, test_log_df], axis=0, ignore_index=True)
# exclude_list = [100, 101, 102]  # 既に選ばれた宿のIDリスト
additional_recommendations = get_popular_accommodations(log_df = combined_log_df, exclude_list=[], num_recommendations=100)
print(additional_recommendations)
print(len(additional_recommendations))

[12350, 719, 3338, 13468, 10095, 8567, 532, 8553, 2201, 915, 3848, 11037, 12017, 385, 1818, 4913, 2797, 6199, 6470, 5116, 2927, 693, 10118, 9020, 9104, 8445, 11398, 12089, 13402, 10175, 10418, 10827, 3077, 5607, 755, 1091, 7093, 11499, 9248, 13017, 12425, 6407, 9830, 5754, 13292, 2087, 2445, 11407, 5445, 6178, 1390, 10350, 5135, 307, 11850, 12962, 13106, 5948, 3184, 3100, 12946, 2862, 8030, 7888, 3694, 3274, 12178, 5542, 3988, 7947, 1037, 7117, 4646, 4522, 11715, 6731, 6418, 12132, 9971, 12240, 1229, 12785, 12524, 1510, 2974, 109, 12707, 7649, 589, 9563, 10020, 3228, 6605, 11496, 2977, 12986, 12279, 8094, 2843, 496]
100


In [9]:
# train_log_df と test_log_df を結合
combined_log_df = pd.concat([train_log_df, test_log_df], axis=0, ignore_index=True)

# セッションIDごとに yad_no のリストを作成
session_yadno_map = train_log_df.groupby('session_id')['yad_no'].apply(list).to_dict() #ここをtrain, testで変える  train_log_df, test_log_df

def generate_recommendation(i):
    pred_recommendations = []

    session_id = train_label_df.loc[i, 'session_id']  #ここをtrain, testで変える  train_label_df, test_session_df
    yado_no = session_yadno_map.get(session_id, [])
    last_yado = yado_no[-1] if yado_no else None
    yado_no = pd.Series(yado_no).drop_duplicates().tolist()
    
    pred_recommendations.extend(yado_no)
    if last_yado in pred_recommendations:
        pred_recommendations.remove(last_yado)
    
    for y in yado_no:
        try:
            yado_covisit = get_recommendations_from_covisitation(y, covisitation_matrix, exclude_list=pred_recommendations + [last_yado])
            pred_recommendations.extend(yado_covisit)
        except:
            pass
    
    additional_recommendations = get_popular_accommodations(combined_log_df, exclude_list=pred_recommendations + [last_yado])
    pred_recommendations.extend(additional_recommendations)
    
    return i, pred_recommendations[:100]


# マルチプロセス処理の実行
results = []
with ProcessPoolExecutor(max_workers=16) as executor:
    future_to_session = {executor.submit(generate_recommendation, i): i for i in range(len(train_label_df))}
    for future in tqdm(as_completed(future_to_session), total=len(train_label_df), desc="Processing recommendations"):
        index = future_to_session[future]
        result = future.result()
        results.append((index, result))

results.sort(key=lambda x: x[0])
preds = [result for i, result in results]

Processing recommendations: 100%|██████████| 288698/288698 [20:12<00:00, 238.20it/s]


In [10]:
submission_df = pd.DataFrame(preds)
submission_df
submission_df[1]

# 推薦リストを新しい列に展開する
expanded_df = submission_df[1].apply(pd.Series)
# 新しい列名を設定する
expanded_df.columns = [f'predict_{i}' for i in range(expanded_df.shape[1])]
# 最初の列にセッションIDを追加する
# expanded_df.insert(0, 'session_id', submission_df.index)
# 展開されたDataFrameを表示
expanded_df

Unnamed: 0,predict_0,predict_1,predict_2,predict_3,predict_4,predict_5,predict_6,predict_7,predict_8,predict_9,...,predict_90,predict_91,predict_92,predict_93,predict_94,predict_95,predict_96,predict_97,predict_98,predict_99
0,11882,2808,4101,5289,9187,9178,9180,9181,9182,9183,...,9153,9251,9073,9075,9076,9077,9079,9080,9081,9085
1,8253,8747,2259,4488,2570,3725,1586,9196,9183,9177,...,9222,9154,9151,9152,9073,9075,9076,9077,9079,9080
2,12350,719,3338,13468,10095,8567,532,8553,2201,915,...,10020,3228,6605,11496,2977,12986,12279,8094,2843,496
3,626,2272,7812,11715,755,7872,9183,9177,9178,9180,...,9154,9151,9152,9073,9075,9076,9077,9079,9080,9081
4,96,902,12491,5490,1284,7749,11380,7599,4072,9067,...,9213,9214,9215,9216,9218,9219,9220,9157,9150,9155
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
288693,13210,13079,2876,5719,3564,10955,1092,8677,10522,13717,...,9210,9211,9222,9212,9213,9214,9215,9216,9221,9218
288694,3940,8703,399,6654,5299,4767,4334,8465,963,2900,...,9218,9219,9220,9221,9153,9148,9151,9150,9075,9076
288695,7308,12240,7820,4040,4398,2087,3566,10364,844,9558,...,9214,9215,9216,9218,9219,9220,9221,9156,9149,9154
288696,10619,570,12500,11091,7551,10616,3238,12829,11316,12781,...,9230,9197,9198,9155,9200,9201,9229,9202,9203,9199


In [11]:
# 各行内で重複があるかどうかをチェック

def check_row_duplicates(row):
    # setを用いてユニークな要素のみを抽出し、その長さが元の行の長さと等しいかを比較
    return len(row) != len(set(row))
# check_row_duplicates関数を各行に適用し、結果を新しい列に格納
expanded_df['has_duplicates'] = expanded_df.apply(check_row_duplicates, axis=1)
# 重複がある行のみをフィルタリング
rows_with_duplicates = expanded_df[expanded_df['has_duplicates']]
# 結果を表示
print(len(rows_with_duplicates))

0


In [None]:
# 欠損がないかも確認したい

In [12]:
expanded_df.head()

Unnamed: 0,predict_0,predict_1,predict_2,predict_3,predict_4,predict_5,predict_6,predict_7,predict_8,predict_9,...,predict_91,predict_92,predict_93,predict_94,predict_95,predict_96,predict_97,predict_98,predict_99,has_duplicates
0,11882,2808,4101,5289,9187,9178,9180,9181,9182,9183,...,9251,9073,9075,9076,9077,9079,9080,9081,9085,False
1,8253,8747,2259,4488,2570,3725,1586,9196,9183,9177,...,9154,9151,9152,9073,9075,9076,9077,9079,9080,False
2,12350,719,3338,13468,10095,8567,532,8553,2201,915,...,3228,6605,11496,2977,12986,12279,8094,2843,496,False
3,626,2272,7812,11715,755,7872,9183,9177,9178,9180,...,9151,9152,9073,9075,9076,9077,9079,9080,9081,False
4,96,902,12491,5490,1284,7749,11380,7599,4072,9067,...,9214,9215,9216,9218,9219,9220,9157,9150,9155,False


In [13]:
# map@Kの計算

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k for a single actual value.

    Parameters:
    actual : int
        The actual value that is to be predicted
    predicted : list
        A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements

    Returns:
    float
        The average precision at k
    """
    if actual in predicted[:k]:
        return 1.0 / (predicted[:k].index(actual) + 1)
    return 0.0

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k for lists of actual values and predicted values.

    Parameters:
    actual : list
        A list of actual values that are to be predicted
    predicted : list
        A list of lists of predicted elements (order does matter in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns:
    float
        The mean average precision at k
    """
    return sum(apk(a, p, k) for a, p in zip(actual, predicted)) / len(actual)

In [14]:
train_label_df['yad_no'].values.reshape(-1, 1)

array([[ 4101],
       [ 8253],
       [ 4863],
       ...,
       [13719],
       [10619],
       [ 2439]])

In [15]:
expanded_df.drop(columns=['has_duplicates']).values

array([[11882,  2808,  4101, ...,  9080,  9081,  9085],
       [ 8253,  8747,  2259, ...,  9077,  9079,  9080],
       [12350,   719,  3338, ...,  8094,  2843,   496],
       ...,
       [ 7308, 12240,  7820, ...,  9156,  9149,  9154],
       [10619,   570, 12500, ...,  9202,  9203,  9199],
       [ 2439,  2981, 10095, ...,  9074,  9075,  9076]])

In [16]:
# MAP@kの計算
k = 50
y_true = train_label_df['yad_no'].values.reshape(-1, 1)
y_pred = expanded_df.drop(columns=['has_duplicates']).values
print("MAP@k:", mapk(y_true, y_pred, k=10))

AttributeError: 'numpy.ndarray' object has no attribute 'index'

In [17]:
y_true

array([[ 4101],
       [ 8253],
       [ 4863],
       ...,
       [13719],
       [10619],
       [ 2439]])

In [18]:
y_pred

array([[11882,  2808,  4101, ...,  9080,  9081,  9085],
       [ 8253,  8747,  2259, ...,  9077,  9079,  9080],
       [12350,   719,  3338, ...,  8094,  2843,   496],
       ...,
       [ 7308, 12240,  7820, ...,  9156,  9149,  9154],
       [10619,   570, 12500, ...,  9202,  9203,  9199],
       [ 2439,  2981, 10095, ...,  9074,  9075,  9076]])

In [19]:
# Recall(候補にあげたものが)のy_trueに含まれているものの割合

# マッチするかどうかをチェックするリストを初期化
matches = []

# y_pred の各サブアレイと y_true の値を比較
for pred, true in zip(y_pred, y_true):
    # pred はサブアレイ、true は値
    matches.append(int(true in pred))

# matches は、指定された条件に基づいて 0 と 1 のリストです
print(matches)

# matches は先のステップで生成した 0 と 1 のリストです。
# 1 の割合を計算
one_ratio = sum(matches) / len(matches)

# 割合をプリント
print("Recall is: {:.2%}".format(one_ratio))

[1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 

In [15]:
expanded_df.drop(columns=['has_duplicates']).to_csv('../submissions/candidate_ver4.csv', index=False)