# ReRank@20 by catboost reranker
- https://www.kaggle.com/code/pietromaldini1/4th-place-solution-ensemble-catboost-ranker#Prediction
- polars ref: https://qiita.com/nkay/items/9cfb2776156dc7e054c8
- polars ref: https://stackoverflow.com/questions/73948502/take-cumsum-of-each-row-in-polars
- polars ref: https://stackoverflow.com/questions/73101521/polars-equivalent-to-pandas-groupby-shift
- improvements made after the competition
- implemented with polars
- (PairLogitPairwise loss with 100 pairs)
- updated the word2vec candidates and fixed the candidate ranks *

In [1]:
!pip install polars

Collecting polars
  Downloading polars-0.16.2-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (15.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.2/15.2 MB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: polars
Successfully installed polars-0.16.2
[0m

In [2]:
import gc
import os
import glob
import random
import numpy as np
import pandas as pd
from pathlib import Path
from copy import deepcopy
from tqdm.notebook import tqdm
import pickle

from catboost import CatBoost, CatBoostClassifier, CatBoostRanker
from catboost import Pool
from collections import Counter

import polars as pl

In [3]:
def seed_everything(seed=42):
    """Seed Python's ``random``, NumPy's global RNG and ``PYTHONHASHSEED`` with ``seed``."""
    # The environment variable is only written here; it is read by interpreters
    # started afterwards, not by the one already running.
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything()

In [4]:
# Integer code used for every event type throughout the notebook.
type_labels = dict(clicks=0, carts=1, orders=2)

# True: local validation run, False: full-train run; selects the input locations below.
VALID = True

BASE_PATH, NEW_BASE_PATH, test_path = (
    (
        "../input/otto-valid-candidate/",
        "../input/otto-valid-new-candidates/",
        '../input/otto-validation/test_parquet/*',
    )
    if VALID
    else (
        "../input/otto-candidate/",
        "../input/otto-new-candidates/",
        '../input/otto-chunk-data-inparquet-format/test_parquet/*',
    )
)

# load data

In [5]:
def load_test():
    """Read every parquet chunk matching ``test_path`` into one polars DataFrame.

    ``ts`` is divided by 1000 and stored as Int32 (presumably milliseconds to
    seconds - confirm against the dataset), and the ``type`` column is replaced
    by its integer code from ``type_labels`` stored as Int8.
    """
    chunks = []
    for chunk_file in glob.glob(test_path):
        chunk = pl.read_parquet(chunk_file)
        scaled_ts = pl.Series(chunk['ts'] / 1000)
        chunks.append(chunk.with_columns(scaled_ts.alias('ts').cast(pl.Int32)))

    df = pl.concat(chunks)
    type_codes = df['type'].apply(lambda name: type_labels[name])
    return df.with_columns(pl.Series(name='type', values=type_codes, dtype=pl.Int8))
    
def load_train(minimum_session_no, valid_flg, additional_df_size = 1000000):
    """Load the truncated training sessions and their labels.

    Args:
        minimum_session_no: only sessions whose id is strictly greater than
            this value are kept from the "for_validation" files.
        valid_flg: when falsy, sessions from the "for_full_train" files are
            appended as well.
        additional_df_size: upper bound on the number of extra sessions taken
            from the "for_full_train" file (the last ones in file order).

    Returns:
        ``(df, label_df)``: ``df`` holds the events with ``type`` encoded via
        ``type_labels`` (Int8) and ``ts`` cast to Int32; ``label_df`` has the
        columns ``session``, ``type``, ``aid`` plus a constant ``label`` of 1.
    """
    # always download train data for validation
    df = pl.read_parquet('../input/otto-truncated-tr-dataset/truncated_train_sessions_for_validation.pqt')
    df = df.filter(pl.col("session") > minimum_session_no)
    label_df = pl.read_parquet('../input/otto-truncated-tr-dataset/truncated_train_labels_for_validation.pqt')
    label_df = label_df.filter(pl.col("session") > minimum_session_no)
    label_df = label_df.select(["session", "type", "aid"])
    df_unique_session = df.unique(subset=["session"]).select("session").to_numpy().reshape(1,-1)[0]
    maximum_session_no = df_unique_session.max()

    if not valid_flg:
        # additional session size for full train
        add_df = pl.read_parquet('../input/otto-truncated-tr-dataset/truncated_train_sessions_for_full_train.pqt')
        # Stay on the DataFrame API here: ``unique(subset=...)`` and ``select``
        # do not exist on a Series, so indexing out the column first fails.
        cons_session = (
            add_df
            .filter(~pl.col("session").is_in(df_unique_session))
            .select("session")
            .unique(subset=["session"])
            .to_numpy().reshape(1,-1)[0]
        )
        # Keep only sessions newer than everything already loaded, capped to the last N.
        cons_session = cons_session[cons_session > maximum_session_no][-additional_df_size:]
        add_df = add_df.filter(pl.col("session").is_in(cons_session))
        print("additional session no: ", len(cons_session))
        df = pl.concat([df, add_df])
        del add_df
        gc.collect()

        add_label_df = pl.read_parquet('../input/otto-truncated-tr-dataset/truncated_train_labels_for_full_train.pqt')
        # polars frames have no index, so the pandas-style ``droplevel`` call
        # that used to follow this filter was removed (it raised AttributeError).
        add_label_df = add_label_df.filter(pl.col("session").is_in(cons_session))
        add_label_df = add_label_df.select(["session", "type", "aid"])
        label_df = pl.concat([label_df, add_label_df])
        del add_label_df
        gc.collect()

    type_codes = df['type'].apply(lambda name: type_labels[name])
    df = df.with_columns(pl.Series(name='type', values=type_codes, dtype=pl.Int8))
    # NOTE(review): unlike load_test, ts is not divided by 1000 here - presumably
    # the truncated files already use the target unit; confirm.
    df = df.with_columns(pl.col('ts').cast(pl.Int32))

    label_df = label_df.with_columns([
        pl.lit(1).cast(pl.Int8).alias('label'),
        pl.col('aid').cast(pl.Int32),
        pl.col('session').cast(pl.Int32),
    ])

    return df, label_df

In [6]:
# fix session
# Number of trailing training sessions to keep, depending on the run mode.
session_size_in_tr_for_val = 1500000 if VALID else 500000

df = pl.read_parquet('../input/otto-truncated-tr-dataset/truncated_train_sessions_for_validation.pqt')
# Session id found N positions from the end of the unique session list;
# load_train keeps only sessions with a larger id.
minimum_session_no = (
    df.unique(subset=["session"])
      .select("session")
      .to_numpy()
      .reshape(1,-1)[0][-session_size_in_tr_for_val]
)
del df

In [7]:
# Load the raw event logs: all test sessions, and the training sessions above
# minimum_session_no together with their labels.
org_test_data = load_test()
org_train_data, train_labels = load_train(minimum_session_no, VALID)

# Quick size check of what was loaded.
print('Train data has shape',org_train_data.shape)
print('Test data has shape',org_test_data.shape)

Train data has shape (6273663, 4)
Test data has shape (7683577, 4)


In [8]:
# Build a session -> week lookup (used by add_candidate_features to pick weekly stats).
week_session = pl.read_parquet('/kaggle/input/ottofeatures/week_session.pqt')
print(week_session.shape)
week_session = {
    session_id: week_no
    for session_id, week_no in zip(week_session["session"].to_numpy(), week_session["week"].to_numpy())
}

(14571582, 3)


# feature engineering in history

In [9]:
def make_history(input_df):
    """Collapse an event log into one row per session.

    Args:
        input_df: polars DataFrame with ``session``, ``aid``, ``ts`` and
            ``type`` columns.

    Returns:
        DataFrame with columns ``session``, ``aid``, ``ts``, ``type`` where
        the last three hold, per session, the list of values in row order.
    """
    # A single aggregation pass yields all three list columns; the previous
    # version grouped three times and inner-joined the results on "session".
    return input_df.groupby("session").agg([
        pl.col("aid").apply(list),
        pl.col("ts").apply(list),
        pl.col("type").apply(list),
    ])

In [10]:
def make_features_in_history(input_df):
    """Add per-session size features derived from the ``aid`` list column.

    Adds ``session_length`` (number of events, Int16), ``unique_aid`` (number
    of distinct aids, Int16) and ``unique_ratio`` (their quotient, Float32).
    """
    event_counts = input_df['aid'].apply(lambda aids: len(aids))
    input_df = input_df.with_columns(event_counts.alias('session_length').cast(pl.Int16))

    distinct_counts = input_df['aid'].apply(lambda aids: len(set(aids)))
    input_df = input_df.with_columns(distinct_counts.alias('unique_aid').cast(pl.Int16))

    ratio = input_df["unique_aid"] / input_df["session_length"]
    return input_df.with_columns(ratio.alias('unique_ratio').cast(pl.Float32))

In [11]:
# Names of the per-session features kept from make_features_in_history.
session_features = ['session_length', 'unique_aid', 'unique_ratio']

# One row per session: the session id followed by its features.
train_history = (
    make_features_in_history(make_history(org_train_data))
    .select(['session'] + session_features)
)

test_history = (
    make_features_in_history(make_history(org_test_data))
    .select(['session'] + session_features)
)

In [12]:
# Preview the first rows of the per-session features of the test sessions.
test_history.head()

session,session_length,unique_aid,unique_ratio
i32,i16,i16,f32
12197408,10,5,0.5
12682792,1,1,1.0
11269560,5,5,1.0
11271984,6,6,1.0
11412764,4,3,0.75


In [13]:
# Preview the first rows of the per-session features of the training sessions.
train_history.head()

session,session_length,unique_aid,unique_ratio
i32,i16,i16,f32
10122884,1,1,1.0
10534140,1,1,1.0
10804500,3,3,1.0
10916116,6,5,0.833333
10937140,11,6,0.545455


In [14]:
# Per (session, aid): number of events of each type, one column per type code.
train_session_aid_count = (
    org_train_data.groupby(["session", "aid", "type"]).count()
    .pivot('count', ['session', 'aid'], 'type')  # value, index, column
    .fill_null(0)
    .select(["session", "aid", "0", "1", "2"])
)

test_session_aid_count = (
    org_test_data.groupby(["session", "aid", "type"]).count()
    .pivot('count', ['session', 'aid'], 'type')
    .fill_null(0)
    .select(["session", "aid", "0", "1", "2"])
)

# Train and test rows share one lookup table.
session_aid_count = pl.concat([train_session_aid_count, test_session_aid_count])
del train_session_aid_count, test_session_aid_count

session_aid_count.columns = ["session", "aid", "type0", "type1", "type2"]

# Store the counters as Int16.
for col in ["type0", "type1", "type2"]:
    session_aid_count = session_aid_count.with_columns(
        pl.Series(name=col, values = session_aid_count[col], dtype= pl.Int16)
    )

In [15]:
# Latest timestamp of every session.
train_session_last_ts = org_train_data.groupby("session").agg(pl.max("ts"))
test_session_last_ts = org_test_data.groupby("session").agg(pl.max("ts"))
train_session_last_ts.columns = ["session", "last_ts"]
test_session_last_ts.columns = ["session", "last_ts"]

# Recency weight of each event: 1 / (gap to the session's last event + 1),
# i.e. 1.0 for the last event and smaller for older ones.
tmp_train_data = org_train_data.join(train_session_last_ts, on = "session", how = "left")
tmp_train_values = 1 / ((tmp_train_data["last_ts"] - tmp_train_data["ts"]) + 1)
tmp_train_data = tmp_train_data.with_columns(
    pl.Series(name='diff_ts', values = tmp_train_values, dtype= pl.Float32)
)

tmp_test_data = org_test_data.join(test_session_last_ts, on = "session", how = "left")
tmp_test_values = 1 / ((tmp_test_data["last_ts"] - tmp_test_data["ts"]) + 1)
tmp_test_data = tmp_test_data.with_columns(
    pl.Series(name='diff_ts', values = tmp_test_values, dtype= pl.Float32)
)

# Sum of the weights per (session, aid), one column per type code.
train_session_weighted_count = (
    tmp_train_data.groupby(["session", "aid", "type"]).agg(pl.sum("diff_ts"))
    .pivot('diff_ts', ['session', 'aid'], 'type')
    .fill_null(0)
    .select(["session", "aid", "0", "1", "2"])
)

test_session_weighted_count = (
    tmp_test_data.groupby(["session", "aid", "type"]).agg(pl.sum("diff_ts"))
    .pivot('diff_ts', ['session', 'aid'], 'type')
    .fill_null(0)
    .select(["session", "aid", "0", "1", "2"])
)

session_weighted_count = pl.concat([train_session_weighted_count, test_session_weighted_count])
session_weighted_count.columns = ["session", "aid", "w_type0", "w_type1", "w_type2"]
del train_session_weighted_count, test_session_weighted_count
del tmp_train_data, tmp_test_data

# Store the weighted counters as Float32.
for col in ["w_type0", "w_type1", "w_type2"]:
    session_weighted_count = session_weighted_count.with_columns(
        pl.Series(name=col, values = session_weighted_count[col], dtype= pl.Float32)
    )

# learning and prediction

In [16]:
# session info
# One row per session id plus a constant Int8 "key" column.
train_session = (
    org_train_data.unique(subset=["session"])
    .select("session")
    .with_columns(pl.lit(1).alias("key").cast(pl.Int8))
)
test_session = (
    org_test_data.unique(subset=["session"])
    .select("session")
    .with_columns(pl.lit(1).alias("key").cast(pl.Int8))
)

# Plain list of test session ids; arrange_test_data slices it into batches.
test_session_no = list(
    org_test_data.unique(subset=["session"]).select("session").to_numpy().reshape(1,-1)[0]
)

In [17]:
###########
def arrange_train_data(type_, train_session, final_cols, session_features, cand_feature_dfs, append_data):
    """Build the training matrix for one event type.

    Generates candidates for ``train_session``, attaches labels and session
    history (reading the module-level ``train_history`` / ``train_labels``),
    then adds the candidate features.

    Returns:
        The 4-tuple produced by ``fe_train_data``: feature frame, label array,
        (session, aid) frame and the per-row session ids.
    """
    def _cands_per_session_histogram(frame):
        # How many sessions have 1, 2, 3, ... candidate rows.
        per_session = frame["session"].value_counts()
        per_session = per_session.with_columns(pl.col("counts").alias("session_cands"))
        return per_session["session_cands"].value_counts().sort("session_cands")

    print("#####make candidates#####")
    candidates = make_candidates(type_, "train", train_session)
    print(_cands_per_session_histogram(candidates))

    print("#####concat data and cands#####")
    train = concat_train_data_cands(train_history, train_labels, candidates, type_)
    del candidates
    print(_cands_per_session_histogram(train))

    print("#####feature engineering#####")
    return fe_train_data(train, final_cols ,session_features, cand_feature_dfs, append_data)

#########
def arrange_test_data(type_, test_session, final_cols, session_features, cand_feature_dfs, start_index, end_index, append_data):
    """Build the prediction matrix for one batch of test sessions.

    The batch is ``test_session_no[start_index:end_index]`` (module-level
    list); candidates are generated for those sessions only, joined with the
    module-level ``test_history`` and enriched with candidate features.

    Returns:
        ``(X_test, test)`` as produced by ``fe_test_data``.
    """
    batch_sessions = test_session_no[start_index: end_index]

    # make candidates
    batch_session_df = test_session.filter(pl.col("session").is_in(batch_sessions))
    candidates = make_candidates(type_, "test", batch_session_df)

    # concat data and cands
    test = concat_test_data_cands(test_history, candidates)
    del candidates

    # feature engineering
    return fe_test_data(test, final_cols ,session_features, cand_feature_dfs, append_data, batch_sessions)

In [18]:
def adjustment(input_df, order):
    """Normalise a candidate frame to the shared candidate schema.

    Tags every row with ``cand_type = order`` (Int8), casts ``cand_rank`` to
    Int16 and ``aid`` / ``session`` to Int32, and returns only the columns
    ``session``, ``aid``, ``cand_type``, ``cand_rank`` in that order.
    """
    normalised = input_df.with_columns([
        pl.lit(order).alias("cand_type").cast(pl.Int8),
        pl.col("cand_rank").cast(pl.Int16),
        pl.col("aid").cast(pl.Int32),
        pl.col("session").cast(pl.Int32),
    ])
    return normalised.select(["session", "aid", "cand_type", "cand_rank"])

def _read_covisit_cands(stem, tr_or_test, valid_dir, full_dir, test_dir):
    """Read one family of covisitation candidate CSVs.

    For "train" the file ``<valid_dir><stem>_train_cands.csv`` is read and,
    when ``VALID`` is false, ``<full_dir><stem>_train_cands.csv`` is appended.
    Otherwise ``<test_dir><stem>_cands.csv`` is read.
    """
    if tr_or_test == "train":
        cand = pl.read_csv(f"{valid_dir}{stem}_train_cands.csv")
        if not VALID:
            cand = pl.concat([cand, pl.read_csv(f"{full_dir}{stem}_train_cands.csv")])
        return cand
    return pl.read_csv(f"{test_dir}{stem}_cands.csv")


def _rank_by_row_order(cand_df):
    """Add ``cand_rank``: the 1-based position of each row within its session."""
    # A window expression keeps every rank on its own row, so the result does
    # not depend on the frame being pre-sorted by session (the former
    # groupby/explode/sort version attached the ranks by row position).
    return cand_df.with_row_count().with_columns(
        pl.col("row_nr").rank().over("session").alias("cand_rank")
    )


def make_candidates(type_, tr_or_test, session_df):
    """Collect the candidate aids of every session in ``session_df``.

    Four sources are stacked, each tagged with a ``cand_type`` and a
    ``cand_rank`` (see ``adjustment``):
      0. covisitation candidates (click file for clicks, buy file otherwise),
      1. a second set of covisitation candidates (one file per event type),
      2. word2vec candidates,
      3. the aids already seen in the session, ranked by most recent ``ts``
         (only ranks <= 20 are kept).
    When the same (session, aid) comes from several sources the first one in
    this order is kept.

    Args:
        type_: "clicks", "carts" or "orders".
        tr_or_test: "train" or "test"; selects the candidate files and whether
            ``org_train_data`` or ``org_test_data`` feeds the revisit source.
        session_df: frame with a ``session`` column listing the sessions.

    Returns:
        DataFrame with columns ``session``, ``aid``, ``cand_type``, ``cand_rank``.

    Raises:
        ValueError: for an unknown ``type_`` or ``tr_or_test`` (previously an
            unbound local variable surfaced part-way through the function).
    """
    if type_ not in ("clicks", "carts", "orders"):
        raise ValueError(f"unknown event type: {type_!r}")
    if tr_or_test not in ("train", "test"):
        raise ValueError(f"tr_or_test must be 'train' or 'test', got {tr_or_test!r}")

    local_sessions = list(session_df.unique(subset=["session"]).select("session").to_numpy().reshape(1,-1)[0])
    in_local_sessions = pl.col("session").is_in(local_sessions)

    ##### candidate1: chris candidates (valid ver7, full ver 9)#####
    stem = "chris_session_click" if type_ == "clicks" else "chris_session_buy"
    c_cand = _read_covisit_cands(stem, tr_or_test,
                                 "../input/otto-valid-candidate/", "../input/otto-candidate/", BASE_PATH)
    session_cand1 = adjustment(_rank_by_row_order(c_cand.filter(in_local_sessions)), 0)
    del c_cand

    ##### candidate2: another covisitation (val ver 3, full ver 3)#####
    stem = {"clicks": "chris_session_click",
            "carts": "chris_session_cart",
            "orders": "chris_session_order"}[type_]
    c_cand = _read_covisit_cands(stem, tr_or_test,
                                 "../input/otto-valid-new-candidates/", "../input/otto-new-candidates/", NEW_BASE_PATH)
    session_cand2 = adjustment(_rank_by_row_order(c_cand.filter(in_local_sessions)), 1)
    del c_cand
    session_all_cand = pl.concat([session_cand1, session_cand2])
    del session_cand1, session_cand2
    session_all_cand = session_all_cand.unique(subset=["session", "aid"], keep="first")

    ##### candidate3: (word2vec notebook version 29)#####
    if tr_or_test == "train":
        w2v_cand = pl.read_parquet("/kaggle/input/otto-w2vec-valid-cands/w2vec_train_cands_for_validation.pqt")
        if not VALID:
            add_w2v_cand = pl.read_parquet("/kaggle/input/otto-w2vec-valid-cands/w2vec_train_cands_for_full_train.pqt")
            w2v_cand = pl.concat([w2v_cand, add_w2v_cand])
            del add_w2v_cand
    elif VALID:
        w2v_cand = pl.read_parquet("/kaggle/input/otto-w2vec-valid-cands/w2vec_test_cands_for_validation.pqt")
    else:
        w2v_cand = pl.read_parquet("/kaggle/input/otto-w2vec-valid-cands/w2vec_test_cands_for_full_train.pqt")

    session_cand3 = adjustment(_rank_by_row_order(w2v_cand.filter(in_local_sessions)), 2)
    del w2v_cand
    session_all_cand = pl.concat([session_all_cand, session_cand3])
    del session_cand3

    ##### candidate4: revisit candidate #####
    events = org_train_data if tr_or_test == "train" else org_test_data
    # Restrict to the requested sessions first, then keep the last event of
    # each (session, aid); de-duplication is per session, so the order of the
    # two steps does not change the rows that survive.
    revisit_cand = events.filter(in_local_sessions).unique(subset=["session", "aid"], keep="last")
    revisit_cand = revisit_cand.with_columns(
        pl.col("ts").rank(reverse=True).over("session").alias("cand_rank")
    )
    revisit_cand = revisit_cand.filter(pl.col("cand_rank") <= 20) # only latest 20 items
    revisit_cand = adjustment(revisit_cand, 3)
    session_all_cand = pl.concat([session_all_cand, revisit_cand])
    del revisit_cand

    session_all_cand = session_all_cand.unique(subset=["session", "aid"], keep="first")
    gc.collect()

    return session_all_cand

In [19]:
def concat_train_data_cands(train_history, tr_label_df, tr_session_all_cand, type_):
    """Label the training candidates, down-sample negatives and join the session history.

    Steps:
      1. keep the labels of ``type_`` and mark matching candidates with
         ``label`` 1 (others 0);
      2. drop every session in which no candidate hit a label;
      3. per session keep at most 20 randomly chosen negatives for each
         positive (drawn with ``np.random.choice``);
      4. inner-join the result with ``train_history`` on ``session``.

    Args:
        train_history: per-session feature frame.
        tr_label_df: label frame with ``session``, ``type``, ``aid``, ``label``.
        tr_session_all_cand: candidate frame from ``make_candidates``.
        type_: "clicks", "carts" or "orders".

    Returns:
        The joined training frame. Class balance and candidate recall are printed.

    Raises:
        ValueError: for an unknown ``type_`` (previously an unbound local).
    """
    if type_ not in ("clicks", "carts", "orders"):
        raise ValueError(f"unknown event type: {type_!r}")

    tr_session_all_cand = tr_session_all_cand.with_columns(
        [pl.col("aid").cast(pl.Int32), pl.col("session").cast(pl.Int32)]
    )

    train_type_labels = tr_label_df.filter(pl.col("type") == type_).select(["session", "aid", "label"])
    all_pos_len = len(train_type_labels)

    print("consider only sessions with positive labels")
    hits = tr_session_all_cand.join(train_type_labels.select(["session", "aid"]), on = ["session", "aid"], how = "inner")
    cons_session = list(hits.unique(subset=["session"]).select("session").to_numpy().reshape(1,-1)[0])
    del hits

    tr_session_all_cand = tr_session_all_cand.filter(pl.col("session").is_in(cons_session))
    train_type_labels = train_type_labels.filter(pl.col("session").is_in(cons_session))
    tr_session_all_cand = tr_session_all_cand.join(train_type_labels, on = ["session", "aid"], how = "left")
    del train_type_labels
    tr_session_all_cand = tr_session_all_cand.with_columns(pl.col("label").fill_null(0))
    train_df = train_history.filter(pl.col("session").is_in(cons_session))

    print("down sampling") #########
    pos_tr_session_all_cand = tr_session_all_cand.filter(pl.col("label")==1)
    hit_pos_len = len(pos_tr_session_all_cand)
    # Negatives are sorted by session so every session occupies one contiguous
    # row range [start, end) that can be sampled by row number.
    tr_session_all_cand = tr_session_all_cand.filter(pl.col("label")==0).sort("session")
    session_pos_count = pos_tr_session_all_cand.groupby("session").agg(pl.sum("label"))
    session_pos_count = dict(zip(session_pos_count["session"].to_numpy().reshape(1,-1)[0],
                                 session_pos_count["label"].to_numpy().reshape(1,-1)[0]))
    session_size = tr_session_all_cand["session"].value_counts().sort("session")
    session_size.columns = ["session", "start"]
    session_size = session_size.with_columns(pl.cumsum("start").alias("end"))
    session_size = session_size.with_columns(pl.col("start").shift().fill_null(0))
    session_size = session_size.with_columns(pl.cumsum("start").alias("start"))
    index = []
    for a, b, c in zip(session_size.select("session").to_numpy().reshape(1,-1)[0],
                       session_size.select("start").to_numpy().reshape(1,-1)[0],
                       session_size.select("end").to_numpy().reshape(1,-1)[0]):
        # int(): the summed label may come back as a narrow NumPy integer,
        # and multiplying that by 20 could wrap around.
        neg_num = int(session_pos_count[a]) * 20
        if int(c) - int(b) >= neg_num:
            index += list(np.random.choice(np.arange(b, c), neg_num, replace=False))
        else:
            index += list(np.arange(b, c))

    tr_session_all_cand = tr_session_all_cand.with_row_count().filter(pl.col("row_nr").is_in(index)).drop("row_nr")
    tr_session_all_cand = pl.concat([pos_tr_session_all_cand, tr_session_all_cand])
    del pos_tr_session_all_cand, session_pos_count, session_size

    print("candidate concat")
    del cons_session
    train_df = train_df.join(tr_session_all_cand, on = "session", how = "inner")
    del tr_session_all_cand
    # NOTE(review): ``target`` is a module-level name defined outside this
    # function - presumably the label column name; confirm.
    print(type_, train_df.select(pl.mean(target)).to_numpy(), train_df.unique(subset=["session"]).select("session").shape[0])

    # Share of all labels of this type that appear among the candidates.
    recall = hit_pos_len / all_pos_len if all_pos_len else float("nan")
    print("recall: ", recall)

    return train_df

def concat_test_data_cands(test_history, te_session_all_cand):
    """Inner-join the test session history with its candidates on ``session``.

    ``aid`` and ``session`` of the candidate frame are cast to Int32 first.
    """
    # candidates
    candidates = te_session_all_cand.with_columns(
        [pl.col("aid").cast(pl.Int32), pl.col("session").cast(pl.Int32)]
    )

    # data
    return test_history.join(candidates, on = "session", how="inner")

In [20]:
#####################
def fe_train_data(train_df, final_cols ,session_features, cand_feature_dfs, append_data):
    """Attach all features to the training rows and split them into model inputs.

    ``append_data`` (optional) is left-joined on (session, aid), then
    ``add_candidate_features`` is applied and the rows are sorted by session.

    Returns:
        ``(features, labels, id_frame, session_ids)``: the ``final_cols`` as a
        pandas DataFrame, the ``target`` column (module-level name) as an array,
        a pandas frame with ``session`` / ``aid``, and the per-row session ids.
    """
    if append_data is not None:
        train_df = train_df.join(append_data, on = ["session", "aid"], how="left")
    train_df = add_candidate_features(train_df, cand_feature_dfs).sort("session")

    session_ids = train_df.select("session").to_numpy().reshape(1,-1)[0]
    labels = train_df.select(target).to_numpy().reshape(1,-1)[0]

    id_frame = train_df.select(["session", "aid"]).to_pandas()
    features = train_df.select(final_cols).to_pandas()

    return features, labels, id_frame, session_ids

#####################
def fe_test_data(test_df, final_cols ,session_features, cand_feature_dfs, append_data, cons_test_session_no):
    """Attach all features to one batch of test rows.

    ``append_data`` (optional) is restricted to ``cons_test_session_no`` and
    left-joined on (session, aid); then ``add_candidate_features`` is applied
    and the rows are sorted by session.

    Returns:
        ``(features, id_frame)``: the ``final_cols`` as a pandas DataFrame and
        a pandas frame with the matching ``session`` / ``aid`` pairs.
    """
    if append_data is not None:
        batch_append = append_data.filter(pl.col("session").is_in(cons_test_session_no))
        test_df = test_df.join(batch_append, on = ["session", "aid"], how="left")
    test_df = add_candidate_features(test_df, cand_feature_dfs).sort("session")

    id_frame = test_df.select(["session", "aid"]).to_pandas()
    features = test_df.select(final_cols).to_pandas()

    return features, id_frame

def _load_weekly_aid_features(path, wanted_cols):
    """Read a weekly per-aid feature file and keep only the wanted columns.

    ``aid`` is cast to Int32 and ``week`` is shifted by -1 and cast to Int8
    before the column selection (file column order is preserved).
    """
    feats = pl.read_parquet(path)
    feats = feats.with_columns([
        pl.col("aid").cast(pl.Int32),
        # NOTE(review): the week index is shifted by one before it is used as
        # a join key; the reason is not visible here - confirm.
        (pl.col("week") - 1).cast(pl.Int8).alias("week"),
    ])
    return feats.select([name for name in feats.columns if name in wanted_cols])


def prepare_candidate_features(session_features, cand_features):
    """Load the precomputed aid-level feature tables used for re-ranking.

    Args:
        session_features: names of the per-session features.
        cand_features: names of the candidate (aid) features to keep.

    Returns:
        ``[aid_overall_feats, week_aid_result, aid_weekly_stats,
        aid_weekly_section_uu, aid_weekly_section_uu_stats]`` - the order
        expected by ``add_candidate_features``. The first table is keyed by
        ``aid``; the others also carry ``week``.
    """
    # otto-fe-notebook: version 28
    cons_cols = ["aid"] + cand_features
    if VALID:
        aid_overall_feats = pl.read_parquet("/kaggle/input/ottofeatures/overall_aid_uu_feats-for-valid.pqt")
    else:
        aid_overall_feats = pl.read_parquet('/kaggle/input/ottofeatures/overall_aid_uu_feats-for-full-train.pqt')
    # Select with an ordered list (file order) rather than a set, so the
    # resulting column order is deterministic.
    aid_overall_feats = aid_overall_feats.select(
        [name for name in aid_overall_feats.columns if name in cons_cols]
    )
    aid_overall_feats = aid_overall_feats.with_columns(pl.col("aid").cast(pl.Int32))

    weekly_cols = set(["week", "session", "aid"] + session_features + cand_features)
    week_aid_result = _load_weekly_aid_features(
        '/kaggle/input/ottofeatures/week_aid_result.pqt', weekly_cols)
    aid_weekly_stats = _load_weekly_aid_features(
        '/kaggle/input/ottofeatures/aid_weekly_stats.pqt', weekly_cols)
    aid_weekly_section_uu = _load_weekly_aid_features(
        '/kaggle/input/ottofeatures/week_aid_uu_count_in_list.pqt', weekly_cols)
    aid_weekly_section_uu_stats = _load_weekly_aid_features(
        '/kaggle/input/ottofeatures/aid_weekly_stats_uu_list.pqt', weekly_cols)

    return [aid_overall_feats, week_aid_result, aid_weekly_stats, aid_weekly_section_uu, aid_weekly_section_uu_stats]

def add_candidate_features(final_df, cand_feature_dfs):
    """Left-join every aid-level and session*aid-level feature onto ``final_df``.

    ``cand_feature_dfs`` is the 5-element list from
    ``prepare_candidate_features``. The week of each row is looked up in the
    module-level ``week_session`` dict and used, together with ``aid``, as the
    key for the four weekly tables; the helper ``week`` column is dropped
    afterwards. Missing per-session counts are filled with 0.
    """
    (aid_overall_feats, week_aid_result, aid_weekly_stats,
     aid_weekly_section_uu, aid_weekly_section_uu_stats) = cand_feature_dfs
    final_df = final_df.join(aid_overall_feats, on = ["aid"], how = "left")

    row_weeks = final_df["session"].apply(lambda session_id: week_session[session_id])
    final_df = final_df.with_columns(pl.Series(row_weeks).alias("week").cast(pl.Int8))

    for weekly_feats in (week_aid_result, aid_weekly_stats,
                         aid_weekly_section_uu, aid_weekly_section_uu_stats):
        final_df = final_df.join(weekly_feats, on = ["week", "aid"], how = "left")
    final_df = final_df.drop(["week"])

    # session * aid features
    final_df = final_df.join(session_aid_count, on = ["session", "aid"], how = "left")
    final_df = final_df.with_columns(
        [pl.col(name).fill_null(0).alias(name).cast(pl.Int16) for name in ("type0", "type1", "type2")]
    )

    final_df = final_df.join(session_weighted_count, on = ["session", "aid"], how = "left")
    final_df = final_df.with_columns(
        [pl.col(name).fill_null(0).alias(name).cast(pl.Float32) for name in ("w_type0", "w_type1", "w_type2")]
    )

    return final_df

In [21]:
######
def learning_and_prediction(type_, train_session, test_session, final_cols, session_features, cand_features, cat_cols,
                            test_batch_size, append_data=None, keep_pred_score=False):
    """Train a CatBoost ranker for one event type and predict the top-20 aids per test session.

    Args:
        type_: one of "clicks", "carts", "orders". Selects the learning rate and is
            appended to the session id in the returned prediction frame.
        train_session: forwarded unchanged to ``arrange_train_data``.
        test_session: forwarded unchanged to ``arrange_test_data``.
        final_cols: feature names; forwarded to the arrange helpers and used to
            label the feature-importance printout (importance ``i`` is printed
            next to ``final_cols[i]``).
        session_features: forwarded to ``prepare_candidate_features`` and the
            arrange helpers.
        cand_features: forwarded to ``prepare_candidate_features``.
        cat_cols: passed to ``Pool`` as ``cat_features``.
        test_batch_size: step of the batching loop over ``test_session_no``.
        append_data: optional pair ``[train_extra, test_extra]``; element 0 is
            forwarded to ``arrange_train_data`` and element 1 to
            ``arrange_test_data``. ``None`` (the default) means ``[None, None]``.
        keep_pred_score: when True, the scored train and test frames are returned
            as well.

    Returns:
        Tuple ``(test_pred, train, test)``:
        * ``test_pred``: polars DataFrame with ``session`` as ``"<session>_<type_>"``
          and ``aid`` as a space-joined string of the aids kept for that session
          (the 20 highest scores per session within each batch).
        * ``train``: polars DataFrame converted from the pandas training frame
          with a float32 ``score`` column when ``keep_pred_score`` is True,
          otherwise None.
        * ``test``: concatenation of all scored test batches (Float32 ``score``)
          when ``keep_pred_score`` is True, otherwise None.

    Raises:
        ValueError: if ``type_`` is not one of the three known event types.

    Reads the module-level name ``test_session_no`` to size the prediction loop.
    """
    # Only the learning rate differs between event types; every other ranker
    # setting is shared. Validate up front so a bad type fails before the
    # expensive data preparation rather than with a NameError afterwards.
    learning_rates = {"clicks": 0.02, "carts": 0.02, "orders": 0.08}
    if type_ not in learning_rates:
        raise ValueError(f"unknown type_ {type_!r}; expected one of {sorted(learning_rates)}")
    if append_data is None:
        append_data = [None, None]

    test = []
    test_pred = []

    cand_feature_dfs = prepare_candidate_features(session_features, cand_features)

    X_train, y_train, train, session_lengths = arrange_train_data(type_, train_session, final_cols,
                                                                  session_features, cand_feature_dfs, append_data[0])

    # ranker #####
    cat_params = {
        'loss_function': 'YetiRank',  # alternative tried: 'PairLogitPairwise:max_pairs=20'
        'depth': 15,
        'iterations': 100,
        'learning_rate': learning_rates[type_],
        'random_seed': 0,
        'grow_policy': "SymmetricTree",
        "reg_lambda": 0.01,
        'score_function': "Cosine",
    }
    # NOTE(review): the value named session_lengths is what gets passed as
    # group_id - confirm against arrange_train_data that it holds per-row group ids.
    train_pool = Pool(X_train, group_id=session_lengths, label=y_train, cat_features=cat_cols)
    model = CatBoostRanker(**cat_params)
    model = model.fit(train_pool)

    if keep_pred_score:
        train["score"] = model.predict(X_train)
        train["score"] = train["score"].astype("float32")
        train = pl.from_pandas(train)
    else:
        # Drop the only reference to the training frame so it can be reclaimed.
        train = None
        gc.collect()
    del X_train, y_train
    gc.collect()

    print(" ")
    print("#####feature importance#####")
    importance_values = model.get_feature_importance(train_pool, type="PredictionValuesChange")
    total_importance = np.sum(importance_values)
    # Most important first; features with zero importance are printed without a rank.
    for feat_rank, i in enumerate(np.argsort(importance_values)[::-1]):
        share = importance_values[i] / total_importance
        if importance_values[i] > 0:
            print(feat_rank + 1, share, final_cols[i])
        else:
            print(share, final_cols[i])

    print(" ")
    print('#####prediction#####')

    for i in tqdm(range(0, len(test_session_no), test_batch_size)):
        X_test, mini_test = arrange_test_data(type_, test_session, final_cols,
                                              session_features, cand_feature_dfs, i, i + test_batch_size, append_data[1])
        mini_test["score"] = model.predict(X_test)
        mini_test = pl.from_pandas(mini_test)
        if keep_pred_score:
            test.append(mini_test)  # full scored batch, before the top-20 cut
        # Highest score first within each session, then keep 20 rows per session.
        mini_test = mini_test.sort(["session", "score"], reverse=[False, True])
        mini_test = mini_test.groupby('session').head(20)
        test_pred.append(mini_test)
        del mini_test, X_test
        gc.collect()

    del model
    gc.collect()

    # One row per session: "<session>_<type_>" and a space-separated aid string.
    test_pred = pl.concat(test_pred)
    test_pred = test_pred.groupby("session").agg(pl.col("aid").apply(list))
    test_pred = test_pred.with_columns((test_pred["session"].cast(str) + "_" + str(type_)).alias("session"))
    test_pred = test_pred.with_columns((test_pred["aid"].apply(lambda x: ' '.join(str(p) for p in x))).alias("aid"))

    if keep_pred_score:
        test = pl.concat(test)
        test = test.with_columns(test["score"].alias("score").cast(pl.Float32))
    else:
        test = None

    return test_pred, train, test

# execution

In [22]:
# NOTE(review): presumably the name of the label column; it is not referenced by
# the training/prediction cells that follow in this section - confirm where it is used.
target = 'label'

In [23]:
# click
# Train the clicks ranker and predict. cand_features is forwarded to
# learning_and_prediction, which hands it to prepare_candidate_features to pick
# columns out of the feature files. The order of this list is the order of
# final_cols, so do not reorder it casually.
cand_features = ["cand_type", "cand_rank", 
                 #
                 'prev_w_click_counts',  'prev_w_click_uus', 'prev_w_click_count_rank', 
                 'prev_w_click_uu_rank', 'prev_w_click_ratio',
                 #
                'prev_w_click_counts_std', 'prev_w_click_counts_mean', 'prev_w_click_counts_max', 'prev_w_click_counts_min',
                'prev_w_click_uus_std', 'prev_w_click_uus_mean', 'prev_w_click_uus_max', 'prev_w_click_uus_min', 
                 # 
                 "prev_w_click_only_ratio_std", "prev_w_click_only_ratio_mean", 
                 # 
                 "prev_w_click_only", 'prev_w_click_only_std', 'prev_w_click_only_mean', 
                'prev_w_click_count_rank_std', 'prev_w_click_count_rank_mean', 'prev_w_click_count_rank_max', 'prev_w_click_count_rank_min',
                'prev_w_click_uu_rank_std', 'prev_w_click_uu_rank_mean','prev_w_click_uu_rank_max', 'prev_w_click_uu_rank_min',
                 'prev_w_click_cart_std', 'prev_w_click_cart_mean', 'prev_w_click_cart_max', 'prev_w_click_cart_min',
                 "prev_w_click_cart_ratio_std", "prev_w_click_cart_ratio_mean", "prev_w_click_cart_ratio_max", "prev_w_click_cart_ratio_min",
                 "prev_w_click_order_ratio_std", "prev_w_click_order_ratio_mean", "prev_w_click_order_ratio_max", "prev_w_click_order_ratio_min",
                 ] 

# session_features here is whatever an earlier cell left in scope; it is only
# reset to [] in the cart and order cells.
final_cols = session_features + cand_features
# Passed to Pool as cat_features inside learning_and_prediction.
cat_col = ["cand_type", ]

# keep_pred_score=True makes the call return the scored train/test frames as
# well; they are fed to the cart and order models through append_data.
click_test_pred, train_click_score, test_click_score = learning_and_prediction("clicks", train_session, test_session, 
                                                                               final_cols, session_features, cand_features, cat_col, 
                                                                               test_batch_size = 75000, 
                                                                               append_data = [None, None], keep_pred_score=True)

#####make candidates#####
shape: (41, 2)
┌───────────────┬────────┐
│ session_cands ┆ counts │
│ ---           ┆ ---    │
│ u32           ┆ u32    │
╞═══════════════╪════════╡
│ 61            ┆ 2991   │
│ 62            ┆ 6539   │
│ 63            ┆ 11262  │
│ 64            ┆ 15164  │
│ ...           ┆ ...    │
│ 98            ┆ 10     │
│ 99            ┆ 5      │
│ 100           ┆ 3      │
│ 102           ┆ 1      │
└───────────────┴────────┘
#####concat data and cands#####
consider only sessions with positive labels
down sampling
candidate concat
clicks [[0.04761905]] 947946
recall:  0.651893245712423
shape: (1, 2)
┌───────────────┬────────┐
│ session_cands ┆ counts │
│ ---           ┆ ---    │
│ u32           ┆ u32    │
╞═══════════════╪════════╡
│ 21            ┆ 947946 │
└───────────────┴────────┘
#####feature engineering#####
Groupwise loss function. OneHotMaxSize set to 10
0:	total: 49.4s	remaining: 1h 21m 28s
1:	total: 1m 38s	remaining: 1h 20m 46s
2:	total: 2m 28s	remaining: 1h 1

  0%|          | 0/25 [00:00<?, ?it/s]

In [24]:
# cart
# Train the carts ranker and predict. The order of this list is the order of
# final_cols, so do not reorder it casually.
# "type0..2" / "w_type0..2" are the session*aid columns joined (and null-filled)
# in add_candidate_features.
# NOTE(review): "score" is presumably the click-model score carried in through
# append_data - confirm in arrange_train_data / arrange_test_data.
cand_features = ["cand_type", "score", "cand_rank", "type0", "type1", "type2", "w_type0", "w_type1", "w_type2",
                 #
                 'prev_w_click_counts', 'prev_w_cart_counts', 'prev_w_click_uus', 'prev_w_cart_uus',
                'prev_w_click_count_rank', 'prev_w_cart_count_rank',
                'prev_w_click_uu_rank','prev_w_cart_uu_rank', 'prev_w_click_ratio', 'prev_w_cart_ratio', 
                 #
                'prev_w_click_counts_std', 'prev_w_click_counts_mean', 'prev_w_click_counts_max', 'prev_w_click_counts_min',
                'prev_w_cart_counts_std', 'prev_w_cart_counts_mean', 'prev_w_cart_counts_max', 'prev_w_cart_counts_min',
                'prev_w_click_uus_std', 'prev_w_click_uus_mean', 'prev_w_click_uus_max','prev_w_click_uus_min', 
                'prev_w_cart_uus_std', 'prev_w_cart_uus_mean','prev_w_cart_uus_max', 'prev_w_cart_uus_min', 
                 #
                 "prev_w_click_only", "prev_w_cart_only", "prev_w_order_only", "prev_w_click_cart", "prev_w_cart_order", "prev_w_click_cart_order",
                 # 
                 'prev_w_carts_clicks_count_ratio', 'prev_w_orders_clicks_count_ratio',
                 'prev_w_carts_clicks_uu_ratio', 'prev_w_orders_clicks_uu_ratio',
                 'prev_w_click_uu_ratio', 'prev_w_cart_uu_ratio', 'prev_w_order_uu_ratio',
                 'prev_w_click_count_rank_std', 'prev_w_click_count_rank_mean', 'prev_w_click_count_rank_max', 'prev_w_click_count_rank_min',
                 'prev_w_cart_count_rank_std', 'prev_w_cart_count_rank_mean','prev_w_cart_count_rank_max', 'prev_w_cart_count_rank_min',
                 'prev_w_order_count_rank_std', 'prev_w_order_count_rank_mean','prev_w_order_count_rank_max', 'prev_w_order_count_rank_min',
                 'prev_w_click_uu_rank_std', 'prev_w_click_uu_rank_mean','prev_w_click_uu_rank_max', 'prev_w_click_uu_rank_min',
                 'prev_w_cart_uu_rank_std', 'prev_w_cart_uu_rank_mean','prev_w_cart_uu_rank_max', 'prev_w_cart_uu_rank_min',
                 'prev_w_order_uu_rank_std', 'prev_w_order_uu_rank_mean','prev_w_order_uu_rank_max', 'prev_w_order_uu_rank_min',
                 # 
                'prev_w_click_only_std', 'prev_w_click_only_mean', 'prev_w_click_only_max', 'prev_w_click_only_min',
                'prev_w_cart_only_std', 'prev_w_cart_only_mean', 'prev_w_cart_only_max', 'prev_w_cart_only_min',
                 'prev_w_order_only_std', 'prev_w_order_only_mean', 'prev_w_order_only_max', 'prev_w_order_only_min', 
                 'prev_w_click_cart_std', 'prev_w_click_cart_mean', 'prev_w_click_cart_max', 'prev_w_click_cart_min',
                 'prev_w_cart_order_std', 'prev_w_cart_order_mean', 'prev_w_cart_order_max', 'prev_w_cart_order_min', 
                 'prev_w_click_order_std', 'prev_w_click_order_mean', 'prev_w_click_order_max', 'prev_w_click_order_min',
                 'prev_w_click_cart_order_std', 'prev_w_click_cart_order_mean', 'prev_w_click_cart_order_max', 'prev_w_click_cart_order_min',
                 #
                 "prev_w_click_only_ratio_std", "prev_w_click_only_ratio_mean", "prev_w_click_only_ratio_max", "prev_w_click_only_ratio_min",
                 "prev_w_cart_only_ratio_std", "prev_w_cart_only_ratio_mean", "prev_w_cart_only_ratio_max", "prev_w_cart_only_ratio_min",
                 "prev_w_click_cart_ratio_std", "prev_w_click_cart_ratio_mean", "prev_w_click_cart_ratio_max", "prev_w_click_cart_ratio_min",
                 "prev_w_click_order",
                 # check
                 "prev_w_cart_order_ratio_std", "prev_w_cart_order_ratio_mean", "prev_w_cart_order_ratio_max", "prev_w_cart_order_ratio_min",
                ]

# This model uses no session-level features; this also rebinds the global
# session_features for any later cell.
session_features = [] 
final_cols = session_features + cand_features 
# Passed to Pool as cat_features inside learning_and_prediction.
cat_col = ["cand_type", ]

# The click model's scored train/test frames go in through append_data.
# With keep_pred_score=False the second and third return values are None,
# hence the discards.
cart_test_pred, _, _ = learning_and_prediction("carts", train_session, test_session, 
                                               final_cols, session_features, cand_features, cat_col,
                                               test_batch_size = 75000, 
                                               append_data = [train_click_score, test_click_score], keep_pred_score=False)

#####make candidates#####
shape: (48, 2)
┌───────────────┬────────┐
│ session_cands ┆ counts │
│ ---           ┆ ---    │
│ u32           ┆ u32    │
╞═══════════════╪════════╡
│ 61            ┆ 17720  │
│ 62            ┆ 29486  │
│ 63            ┆ 45690  │
│ 64            ┆ 52233  │
│ ...           ┆ ...    │
│ 105           ┆ 4      │
│ 106           ┆ 5      │
│ 107           ┆ 4      │
│ 116           ┆ 1      │
└───────────────┴────────┘
#####concat data and cands#####
consider only sessions with positive labels
down sampling
candidate concat
carts [[0.04862506]] 207099
recall:  0.5739250999858648
shape: (41, 2)
┌───────────────┬────────┐
│ session_cands ┆ counts │
│ ---           ┆ ---    │
│ u32           ┆ u32    │
╞═══════════════╪════════╡
│ 21            ┆ 162601 │
│ 42            ┆ 30238  │
│ 61            ┆ 44     │
│ 62            ┆ 108    │
│ ...           ┆ ...    │
│ 96            ┆ 9      │
│ 97            ┆ 2      │
│ 98            ┆ 3      │
│ 99            ┆ 1      

  0%|          | 0/25 [00:00<?, ?it/s]

In [25]:
# order
# Train the orders ranker and predict. The order of this list is the order of
# final_cols, so do not reorder it casually.
# "type0..2" are the session*aid count columns joined (and null-filled) in
# add_candidate_features; unlike the cart model, the w_type* columns are not used.
# NOTE(review): "score" is presumably the click-model score carried in through
# append_data - confirm in arrange_train_data / arrange_test_data.
cand_features = ["cand_type", "score", "cand_rank", "type0", "type1", "type2",
                 #
                  'prev_w_click_counts', 'prev_w_cart_counts', 'prev_w_order_counts', 
                 'prev_w_click_uus', 'prev_w_cart_uus', 'prev_w_order_uus', 
                 'prev_w_click_count_rank', 'prev_w_cart_count_rank', 'prev_w_order_count_rank', 
                 'prev_w_click_uu_rank', 'prev_w_cart_uu_rank', 'prev_w_order_uu_rank', 
                 'prev_w_click_ratio', 'prev_w_cart_ratio', 'prev_w_order_ratio',
                 #
                 'prev_w_click_counts_std', 'prev_w_click_counts_mean', 'prev_w_click_counts_max', 'prev_w_click_counts_min',
                'prev_w_cart_counts_std', 'prev_w_cart_counts_mean', 'prev_w_cart_counts_max', 'prev_w_cart_counts_min',
                'prev_w_order_counts_std', 'prev_w_order_counts_mean', 'prev_w_order_counts_max', 'prev_w_order_counts_min',
                'prev_w_click_uus_std', 'prev_w_click_uus_mean', 'prev_w_click_uus_max','prev_w_click_uus_min',
                'prev_w_cart_uus_std', 'prev_w_cart_uus_mean', 'prev_w_cart_uus_max', 'prev_w_cart_uus_min', 
                'prev_w_order_uus_std', 'prev_w_order_uus_mean', 'prev_w_order_uus_max', 'prev_w_order_uus_min',
                 "prev_w_click_only", "prev_w_order_only", 
                 # 
                'prev_w_order_uu_rank_std', 'prev_w_order_uu_rank_mean','prev_w_order_uu_rank_max', 'prev_w_order_uu_rank_min',
                'prev_w_order_count_rank_std', 'prev_w_order_count_rank_mean','prev_w_order_count_rank_max', 'prev_w_order_count_rank_min',
                'clicks_uu_count','orders_uu_count', 'carts_uu_count', 'clicks_uu_rank', 'orders_uu_rank', 'carts_uu_rank', 
                'prev_w_click_uu_ratio', 'prev_w_cart_uu_ratio', 'prev_w_order_uu_ratio',
                'prev_w_click_count_rank_std', 'prev_w_click_count_rank_mean', 'prev_w_click_count_rank_max', 'prev_w_click_count_rank_min',
                'prev_w_cart_count_rank_std', 'prev_w_cart_count_rank_mean','prev_w_cart_count_rank_max', 'prev_w_cart_count_rank_min',
                'prev_w_click_uu_rank_std', 'prev_w_click_uu_rank_mean','prev_w_click_uu_rank_max', 'prev_w_click_uu_rank_min',
                'prev_w_cart_uu_rank_std', 'prev_w_cart_uu_rank_mean','prev_w_cart_uu_rank_max', 'prev_w_cart_uu_rank_min',
                 # 
                 "prev_w_click_only_ratio_std", "prev_w_click_only_ratio_mean", 
                 "prev_w_order_only_ratio_std", "prev_w_order_only_ratio_mean", 
                 # 
                 'prev_w_click_only_std', 'prev_w_click_only_mean', 'prev_w_click_only_max', 'prev_w_click_only_min',
                'prev_w_cart_only_std', 'prev_w_cart_only_mean', 'prev_w_cart_only_max', 'prev_w_cart_only_min',
                 'prev_w_order_only_std', 'prev_w_order_only_mean', 'prev_w_order_only_max', 'prev_w_order_only_min', 
                 "prev_w_cart_order_ratio_std", "prev_w_cart_order_ratio_mean", 
                 # 
                 "prev_w_cart_only_ratio_std", "prev_w_cart_only_ratio_mean", 
                 # check
                 ] 

# This model uses no session-level features.
session_features = [] 
final_cols = session_features + cand_features 
# Passed to Pool as cat_features inside learning_and_prediction.
cat_col = ["cand_type", ]

# Same append_data as the cart model (click scores). The test batch size is
# 50000 here versus 75000 for the click and cart models.
# With keep_pred_score=False the second and third return values are None.
order_test_pred, _, _ = learning_and_prediction("orders", train_session, test_session, 
                                                final_cols, session_features, cand_features, cat_col,
                                                test_batch_size = 50000, 
                                                append_data = [train_click_score, test_click_score], keep_pred_score=False) 

#####make candidates#####
shape: (41, 2)
┌───────────────┬────────┐
│ session_cands ┆ counts │
│ ---           ┆ ---    │
│ u32           ┆ u32    │
╞═══════════════╪════════╡
│ 61            ┆ 14855  │
│ 62            ┆ 24021  │
│ 63            ┆ 38721  │
│ 64            ┆ 43917  │
│ ...           ┆ ...    │
│ 98            ┆ 30     │
│ 99            ┆ 8      │
│ 100           ┆ 3      │
│ 101           ┆ 1      │
└───────────────┴────────┘
#####concat data and cands#####
consider only sessions with positive labels
down sampling
candidate concat
orders [[0.05197636]] 120518
recall:  0.765295551265903
shape: (40, 2)
┌───────────────┬────────┐
│ session_cands ┆ counts │
│ ---           ┆ ---    │
│ u32           ┆ u32    │
╞═══════════════╪════════╡
│ 21            ┆ 80165  │
│ 42            ┆ 21561  │
│ 61            ┆ 1      │
│ 62            ┆ 21     │
│ ...           ┆ ...    │
│ 95            ┆ 16     │
│ 96            ┆ 16     │
│ 97            ┆ 7      │
│ 98            ┆ 4      

  0%|          | 0/37 [00:00<?, ?it/s]

In [26]:
# All three models are trained; unbind the large inputs and the click-score
# frames before the submission is built, then force a collection.
del train_labels
del train_session
del test_session
del train_click_score
del test_click_score
gc.collect()

21

# create submission csv

In [27]:
# Stack the per-type prediction frames in the order clicks, carts, orders,
# switch to pandas and write the two-column submission file.
pred_df = pl.concat([click_test_pred, cart_test_pred, order_test_pred])
pred_df = pred_df.to_pandas()
pred_df.columns = ["session_type", "labels"]
pred_df.to_csv("submission.csv", index=False)
pred_df.head()

Unnamed: 0,session_type,labels
0,12808048_clicks,1504298 683401 250637 1217083 406829 811767 88...
1,11177152_clicks,733687 1010837 1594312 794153 467171 885958 67...
2,12545812_clicks,1619255 1554167 896162 1379999 713211 100798 2...
3,12838324_clicks,755095 1341811 1550265 555453 1252772 174249 4...
4,11136224_clicks,568052 1143350 1599919 534567 1318163 260013 7...


# validation

In [28]:
if VALID:
    # COMPUTE METRIC
    # Local validation: per-type recall@20 against the validation ground truth,
    # combined with the weights below into one overall score.
    score = 0
    weights = {'clicks': 0.10, 'carts': 0.30, 'orders': 0.60}
    # The ground-truth file is the same for every event type, so read it once
    # instead of once per loop iteration.
    all_test_labels = pd.read_parquet('../input/otto-validation/test_labels.parquet')
    for t in ['clicks', 'carts', 'orders']:
        # Rows of the submission for this event type ("<session>_<type>").
        sub = pred_df.loc[pred_df.session_type.str.contains(t)].copy()
        sub['session'] = sub.session_type.apply(lambda x: int(x.split('_')[0]))
        sub.labels = sub.labels.apply(lambda x: [int(i) for i in x.split(' ')[:20]])
        test_labels = all_test_labels.loc[all_test_labels['type'] == t]
        # Left merge keeps every ground-truth session in the denominator, even
        # when no prediction exists for it.
        test_labels = test_labels.merge(sub, how='left', on=['session'])
        # A session without a prediction has NaN in `labels` after the merge;
        # count it as zero hits instead of failing on set(NaN).
        test_labels['hits'] = test_labels.apply(
            lambda row: len(set(row.ground_truth).intersection(
                row.labels if isinstance(row.labels, list) else [])),
            axis=1)
        # At most 20 items can be hit per session, so cap the denominator at 20.
        test_labels['gt_count'] = test_labels.ground_truth.str.len().clip(0, 20)
        recall = test_labels['hits'].sum() / test_labels['gt_count'].sum()
        score += weights[t] * recall
        print(f'{t} recall =', recall)

    print('=============')
    print('Overall Recall =', score)
    print('=============')

clicks recall = 0.5291102308471383
carts recall = 0.4172584746791747
orders recall = 0.6544080331180997
Overall Recall = 0.570733385359326
