In [1]:
import numpy as np
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt
from itertools import combinations
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor, as_completed

In [2]:
# Read every CSV input in one pass; the order of the file names must match the
# order of the target variables on the left-hand side.
train_log_df, test_log_df, train_label_df, test_session_df, yado_df = [
    pd.read_csv(csv_name)
    for csv_name in (
        'train_log.csv',
        'test_log.csv',
        'train_label.csv',
        'test_session.csv',
        'yado.csv',
    )
]
# The image embeddings are stored as parquet rather than CSV.
image_embeddings_df = pd.read_parquet('image_embeddings.parquet')

In [3]:
# Join every train_log row with the label of its session (from train_label)
# and flag whether the logged yad_no equals that label.
#
# Columns produced by a previous run of this cell are dropped first so the
# cell can be re-executed: without this, a second run would try to create
# "yad_no_target" again on a frame that already has it.
train_log_df = (
    train_log_df
    .drop(columns=["yad_no_target", "target"], errors="ignore")
    .merge(
        train_label_df,
        on="session_id",
        how="left",
        suffixes=("", "_target"),
        # NOTE(review): assumes train_label has at most one row per session_id;
        # the check raises instead of silently duplicating log rows otherwise.
        validate="many_to_one",
    )
)
# True where the viewed yad_no is the session's label.
train_log_df["target"] = train_log_df["yad_no"] == train_log_df["yad_no_target"]
display(train_log_df.head())

Unnamed: 0,session_id,seq_no,yad_no,yad_no_target,target
0,000007603d533d30453cc45d0f3d119f,0,2395,4101,False
1,0000ca043ed437a1472c9d1d154eb49b,0,13535,8253,False
2,0000d4835cf113316fe447e2f80ba1c8,0,123,4863,False
3,0000fcda1ae1b2f431e55a7075d1f500,0,8475,1652,False
4,000104bdffaaad1a1e0a9ebacf585f33,0,96,96,True


In [4]:
# Attach the largest seq_no of each session to every row of that session.
#
# The per-session maximum is looked up by session_id and written with
# assign(), which overwrites an existing "max_seq_no" column. Re-running the
# cell therefore gives the same frame, whereas a repeated self-merge would
# produce suffixed "max_seq_no_x" / "max_seq_no_y" columns.
train_log_max_seq_no = (
    train_log_df.groupby("session_id")["seq_no"].max().rename("max_seq_no")
)
train_log_df = train_log_df.assign(
    max_seq_no=train_log_df["session_id"].map(train_log_max_seq_no)
)
train_log_df

Unnamed: 0,session_id,seq_no,yad_no,yad_no_target,target,max_seq_no
0,000007603d533d30453cc45d0f3d119f,0,2395,4101,False,0
1,0000ca043ed437a1472c9d1d154eb49b,0,13535,8253,False,0
2,0000d4835cf113316fe447e2f80ba1c8,0,123,4863,False,0
3,0000fcda1ae1b2f431e55a7075d1f500,0,8475,1652,False,0
4,000104bdffaaad1a1e0a9ebacf585f33,0,96,96,True,1
...,...,...,...,...,...,...
419265,ffffcd5bc19d62cad5a3815c87818d83,0,12230,10619,False,2
419266,ffffcd5bc19d62cad5a3815c87818d83,1,10619,10619,True,2
419267,ffffcd5bc19d62cad5a3815c87818d83,2,12230,10619,False,2
419268,fffffa7baf370083ebcdd98f26a7e31a,0,2439,2439,True,1


In [67]:
# Number of log rows per value of the "target" flag.
train_log_df.value_counts(subset="target")

target
False    326714
True      92556
dtype: int64

In [68]:
# Number of log rows (not sessions) per value of max_seq_no.
train_log_df.value_counts(subset='max_seq_no')

max_seq_no
0    185386
1    165586
2     46050
3     16100
4      4165
5      1338
6       455
7       144
8        36
9        10
dtype: int64

In [69]:
# Target distribution restricted to rows whose session has max_seq_no != 0,
# i.e. sessions with more than one logged row.
train_log_df.loc[train_log_df["max_seq_no"].ne(0), "target"].value_counts()

False    141328
True      92556
Name: target, dtype: int64

In [7]:
# Scratch check: a list holding three copies of 1 has length 3.
A = [1] * 3
print(len(A))

3


In [17]:
# For sessions whose max_seq_no is 2: count the distinct yad_no values in each
# session, then tabulate how many sessions have each distinct-count.
(
    train_log_df.loc[train_log_df["max_seq_no"].eq(2)]
    .groupby("session_id")['yad_no']
    .nunique()
    .value_counts()
)

2    10418
3     4932
Name: yad_no, dtype: int64

In [18]:
# For sessions whose max_seq_no is 3: count the distinct yad_no values in each
# session, then tabulate how many sessions have each distinct-count.
(
    train_log_df.loc[train_log_df["max_seq_no"].eq(3)]
    .groupby("session_id")['yad_no']
    .nunique()
    .value_counts()
)

2    3250
3     545
4     230
Name: yad_no, dtype: int64

In [63]:
# Target distribution of the rows at seq_no 3 within sessions whose
# max_seq_no is 5.
train_log_df.loc[
    train_log_df["max_seq_no"].eq(5) & train_log_df['seq_no'].eq(3),
    'target',
].value_counts()

False    222
True       1
Name: target, dtype: int64