# ensemble003

In [8]:
# Experiments whose predictions are blended. The position of an experiment in
# this list is the `i` of its `labels{i}` column and of its blend weight.
use_exps = [
    "exp062",
    "exp063",
    "exp066",
]

In [9]:
import os
import sys
import traceback
import gc
import random
import pickle
import pathlib
import subprocess
from dataclasses import dataclass
from dotenv import load_dotenv
# Actually call load_dotenv(): a bare `load_dotenv` only references the function
# and never reads the .env file, so the os.getenv() lookups below would only see
# variables already exported in the shell. Values already present in the
# environment are not overridden by default.
load_dotenv()
# Make the project-local helper modules imported further down resolvable.
sys.path.append(os.getenv('UTILS_PATH'))
from tqdm import tqdm
import multiprocessing
import inspect

import pandas as pd
import polars as pl
import numpy as np
import itertools
import cudf
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
import line_notify
import my_logger
from noglobal import noglobal

# 設定

In [10]:
@dataclass
class Cfg:
    """Static settings for this ensemble notebook.

    None of the attributes is type-annotated, so ``@dataclass`` registers no
    fields: every name below is a plain class attribute shared by all
    instances, and ``Cfg()`` takes no arguments.
    """

    # --- experiment identity / run switches ---
    loglevel = "INFO"
    exp_name = "ensemble003"
    run_inf = True
    seed = 42

    # --- candidate / evaluation sizes ---
    k = 20  # passed to evaluate() as the prediction cut-off
    cand_n = 15
    negative_sample = 1

    # --- chunking ---
    train_chunk_n_dict = {"clicks":2, "carts":1, "orders":1}
    test_chunk_n = 5

    # --- event-type <-> integer id lookup (id2type is the inverse of type2id) ---
    type2id = {"clicks":0, "carts":1, "orders":2}
    id2type = {type_id: type_name for type_name, type_id in type2id.items()}

    # --- data split ---
    train_week = "week3"
    valid_week = "week4"
    valid_session_n = 100_000

    # --- directories, read from the environment (None when the variable is unset) ---
    input_dir = os.getenv('INPUT_DIR')
    output_dir = os.getenv('OUTPUT_DIR')
    prep_dir = os.getenv("PREP_DIR")

    # --- per-type LightGBM-style parameter sets (not referenced in the cells shown here) ---
    clicks_params = {
        'objective': 'binary',
        'boosting': 'gbdt',
        'learning_rate': 0.1,
        'metric': 'binary_logloss',
        'seed': 42,
        'feature_pre_filter': False,
        'lambda_l1': 5.485903737168179,
        'lambda_l2': 0.005594683492536064,
        'num_leaves': 79,
        'feature_fraction': 0.552,
        'bagging_fraction': 0.9295272232672004,
        'bagging_freq': 2,
        'min_child_samples': 10,
    }
    carts_params = {
        'objective': 'binary',
        'boosting': 'gbdt',
        'learning_rate': 0.1,
        'metric': 'binary_logloss',
        'seed': 42,
        'feature_pre_filter': False,
        'lambda_l1': 8.709050252544463,
        'lambda_l2': 0.06935262036337767,
        'num_leaves': 252,
        'feature_fraction': 0.4,
        'bagging_fraction': 1.0,
        'bagging_freq': 0,
        'min_child_samples': 5,
    }
    orders_params = {
        'objective': 'binary',
        'boosting': 'gbdt',
        'learning_rate': 0.1,
        'metric': 'binary_logloss',
        'seed': 42,
        'feature_pre_filter': False,
        'lambda_l1': 9.356310279757256,
        'lambda_l2': 1.3120983078968551e-08,
        'num_leaves': 174,
        'feature_fraction': 0.5,
        'bagging_fraction': 1.0,
        'bagging_freq': 0,
        'min_child_samples': 20,
    }

cfg = Cfg()

# Ensure <output_dir>/<exp_name>/ and its cache/ sub-directory both exist.
exp_dir = os.path.join(cfg.output_dir, cfg.exp_name)
os.makedirs(exp_dir, exist_ok=True)
os.makedirs(os.path.join(exp_dir, "cache"), exist_ok=True)

# Seeds only Python's built-in `random` module.
random.seed(cfg.seed)

# Logger is named after the experiment.
logger = my_logger.init_logger(cfg.exp_name)

In [11]:
@noglobal(excepts=["cfg", "logger"])
def evaluate(clicks_labels, carts_labels, orders_labels, 
             clicks_preds, carts_preds, orders_preds, k=20):
    """Compute recall@k per event type and the weighted overall score.

    All six arguments are index-aligned sequences with one entry per session.

    Args:
        clicks_labels: per session a single scalar ground-truth id; NaN (or
            None) means the session has no clicks ground truth and is skipped.
        carts_labels, orders_labels: per session a sequence (ndarray, list or
            tuple) of ground-truth ids; any other value (e.g. NaN) means
            "no ground truth" and the session is skipped for that type.
        clicks_preds, carts_preds, orders_preds: per session a sequence of
            predicted ids; only the first ``k`` are used. Any non-sequence
            value (e.g. NaN) is treated as an empty prediction.
        k: prediction cut-off, also the cap on the per-session denominator.

    Returns:
        dict with integer ``num_*`` / ``hit_*`` counters and, formatted as
        strings with six decimals, ``recall_*``, ``w_recall_*`` (recall times
        0.10 / 0.30 / 0.60 for clicks / carts / orders) and ``score`` (their
        sum). A type with no ground truth at all gets recall 0.0 instead of
        raising ZeroDivisionError.
    """

    def _top_k(pred):
        # Accept any real sequence; a scalar such as NaN means "no predictions".
        if isinstance(pred, (list, tuple, np.ndarray)):
            return list(pred[:k])
        return []

    def _has_label_set(label):
        # Multi-id ground truth is a sequence; missing ground truth is a scalar.
        return isinstance(label, (np.ndarray, list, tuple))

    def _safe_recall(hits, total):
        # No ground truth for this type -> define recall as 0.0 rather than crash.
        return hits / total if total else 0.0

    num_clicks = 0
    num_carts = 0
    num_orders = 0
    hit_clicks = 0
    hit_carts = 0
    hit_orders = 0

    for i in range(len(clicks_labels)):
        clicks_label = clicks_labels[i]
        carts_label = carts_labels[i]
        orders_label = orders_labels[i]
        clicks_pred = _top_k(clicks_preds[i])
        carts_pred = _top_k(carts_preds[i])
        orders_pred = _top_k(orders_preds[i])

        # Clicks: one target per session, hit if it appears in the top-k.
        if clicks_label is not None and not np.isnan(clicks_label):
            num_clicks += 1
            hit_clicks += int(clicks_label in clicks_pred)

        # Carts / orders: count distinct targets found; denominator capped at k.
        if _has_label_set(carts_label):
            num_carts += min(len(carts_label), k)
            hit_carts += len(set(carts_pred) & set(carts_label))

        if _has_label_set(orders_label):
            num_orders += min(len(orders_label), k)
            hit_orders += len(set(orders_pred) & set(orders_label))

    recall_clicks = _safe_recall(hit_clicks, num_clicks)
    recall_carts = _safe_recall(hit_carts, num_carts)
    recall_orders = _safe_recall(hit_orders, num_orders)
    w_recall_clicks = recall_clicks * 0.10
    w_recall_carts = recall_carts * 0.30
    w_recall_orders = recall_orders * 0.60
    score = w_recall_clicks + w_recall_carts + w_recall_orders

    results = {}
    results["num_clicks"] = num_clicks
    results["hit_clicks"] = hit_clicks
    results["num_carts"] = num_carts
    results["hit_carts"] = hit_carts
    results["num_orders"] = num_orders
    results["hit_orders"] = hit_orders
    results["recall_clicks"] = format(recall_clicks, ".6f")
    results["recall_carts"] = format(recall_carts, ".6f")
    results["recall_orders"] = format(recall_orders, ".6f")
    results["w_recall_clicks"] = format(w_recall_clicks, ".6f")
    results["w_recall_carts"] = format(w_recall_carts, ".6f")
    results["w_recall_orders"] = format(w_recall_orders, ".6f")
    results["score"] = format(score, ".6f")

    return results

In [12]:
def cust_blend(dt, W = [1,1,1,1], base= 3, top_n=30):
    """Blend several ranked, space-separated id lists into one ranked list.

    Each id at 0-based rank ``n`` in model ``i``'s list contributes
    ``W[i] / (n + base)`` to that id's score; ids are then ordered by
    descending total score (ties keep first-seen order).

    Args:
        dt: mapping-like row (dict or pandas Series) holding one string per
            model under the keys ``labels0`` .. ``labels{len(W)-1}``.
        W: one weight per model; its length decides how many ``labels{i}``
            keys are read. The default is never mutated.
        base: offset added to the rank in the denominator.
        top_n: maximum number of ids returned (default 30, the previously
            hard-coded value).

    Returns:
        The ``top_n`` best ids joined by single spaces.

    Raises:
        KeyError: if ``dt`` lacks one of the ``labels{i}`` keys.
    """
    scores = {}
    for model_idx, weight in enumerate(W):
        raw = dt[f'labels{model_idx}']
        # A model without predictions for this row arrives as a non-string
        # (e.g. NaN/None); treat it as an empty list instead of crashing on
        # .split().
        ranked_ids = raw.split() if isinstance(raw, str) else []
        for rank, item in enumerate(ranked_ids):
            scores[item] = scores.get(item, 0.0) + weight / (rank + base)

    # sorted() is stable, so equal scores keep insertion (first-seen) order.
    ordered = sorted(scores, key=lambda item: -scores[item])

    return ' '.join(ordered[:top_n])

In [13]:
def best_weight_search(type_, w_choice=[0, 1, 2, 3]):
    """Grid-search blend weights for one event type on the validation set.

    Every combination of ``w_choice`` values (one weight per entry of the
    global ``use_exps``; the all-zero combination is skipped) is used to blend
    the experiments' validation predictions with ``cust_blend``; the blend is
    scored with ``evaluate`` and the ``recall_{type_}`` entry is compared.

    Reads the globals ``cfg`` and ``use_exps`` and prints one line per
    combination.

    Args:
        type_: "clicks", "carts" or "orders"; selects both the prediction
            files ``{exp}/{type_}_vl_pred_df.csv`` and the recall to optimise.
        w_choice: candidate weight values (never mutated).

    Returns:
        (best_weights, best_score): the first weight list reaching the highest
        recall, and that recall as a float. ``best_weights`` is None when no
        combination scores above 0.0.
    """
    best_weights = None
    best_score = 0.0

    # Ground truth is taken from the first experiment's cache; convert it to
    # plain lists once instead of once per weight combination.
    labels = pd.read_parquet(os.path.join(cfg.output_dir, f"{use_exps[0]}/cache/valid_labels.parquet"))
    clicks_labels = labels["clicks_labels"].tolist()
    carts_labels = labels["carts_labels"].tolist()
    orders_labels = labels["orders_labels"].tolist()

    # The prediction files do not depend on the weights, so load them once up
    # front rather than re-reading every CSV for every combination.
    vl_pred = None
    for i, exp in enumerate(use_exps):
        exp_pred = pd.read_csv(os.path.join(cfg.output_dir, f"{exp}/{type_}_vl_pred_df.csv"))
        if i == 0:
            exp_pred.columns = ['session_type', 'labels0']
            vl_pred = exp_pred
        else:
            # Row alignment relies on every file listing sessions in the same order.
            vl_pred[f'labels{i}'] = exp_pred['labels']

    for ws in itertools.product(w_choice, repeat=len(use_exps)):
        if sum(ws) == 0:
            continue
        weights = list(ws)

        # Blend into a separate Series so the cached frame stays untouched.
        blended = vl_pred.apply(cust_blend, W = weights, axis=1)
        preds = [[int(a) for a in row.split()] for row in blended]

        # The same blended list is passed for all three types; only the recall
        # of `type_` is read from the result.
        score = evaluate(clicks_labels,
                         carts_labels,
                         orders_labels,
                         preds,
                         preds,
                         preds,
                         cfg.k)[f"recall_{type_}"]
        score = float(score)

        text = f"{str(weights)} : {str(score)}"

        # Strict '>' keeps the first combination among equal scores.
        if score > best_score:
            best_score = score
            best_weights = weights 
            text = text + " SCORE UPDATE!!"
        print(text)
    return best_weights, best_score

In [14]:
# Run the weight search once per event type and keep the winners keyed by type.
best_weights_dict, best_score_dict = {}, {}
for type_ in ("clicks", "carts", "orders"):
    print(type_)
    type_best_weights, type_best_score = best_weight_search(type_)
    best_weights_dict.update({type_: type_best_weights})
    best_score_dict.update({type_: type_best_score})

clicks
[0, 0, 1] : 0.510665 SCORE UPDATE!!
[0, 0, 2] : 0.510665
[0, 0, 3] : 0.510665
[0, 1, 0] : 0.509204
[0, 1, 1] : 0.509708
[0, 1, 2] : 0.51011
[0, 1, 3] : 0.510202
[0, 2, 0] : 0.509204
[0, 2, 1] : 0.50978
[0, 2, 2] : 0.509708
[0, 2, 3] : 0.510089
[0, 3, 0] : 0.509204
[0, 3, 1] : 0.509718
[0, 3, 2] : 0.509677
[0, 3, 3] : 0.509708
[1, 0, 0] : 0.508534
[1, 0, 1] : 0.509801
[1, 0, 2] : 0.51011
[1, 0, 3] : 0.510295
[1, 1, 0] : 0.509142
[1, 1, 1] : 0.509739
[1, 1, 2] : 0.509914
[1, 1, 3] : 0.510027
[1, 2, 0] : 0.509204
[1, 2, 1] : 0.509821
[1, 2, 2] : 0.509852
[1, 2, 3] : 0.510048
[1, 3, 0] : 0.509224
[1, 3, 1] : 0.509749
[1, 3, 2] : 0.509935
[1, 3, 3] : 0.509914
[2, 0, 0] : 0.508534
[2, 0, 1] : 0.509698
[2, 0, 2] : 0.509801
[2, 0, 3] : 0.509852
[2, 1, 0] : 0.508915
[2, 1, 1] : 0.509605
[2, 1, 2] : 0.509718
[2, 1, 3] : 0.509893
[2, 2, 0] : 0.509142
[2, 2, 1] : 0.509677
[2, 2, 2] : 0.509739
[2, 2, 3] : 0.509883
[2, 3, 0] : 0.509276
[2, 3, 1] : 0.509451
[2, 3, 2] : 0.50976
[2, 3, 3] : 0.50

In [15]:
# Report the blended experiments, the chosen weights and the per-type recalls.
print("exp : ", use_exps)
print("best_weight : ", best_weights_dict)
print("best_score : ", best_score_dict)

# Overall CV: per-type best recalls weighted 0.1 / 0.3 / 0.6.
weighted_cv = (
    best_score_dict["clicks"]*0.1
    + best_score_dict["carts"]*0.3
    + best_score_dict["orders"]*0.6
)
print("cv : ", weighted_cv)

exp :  ['exp062', 'exp063', 'exp066']
best_weight :  {'clicks': [0, 0, 1], 'carts': [0, 1, 3], 'orders': [1, 2, 3]}
best_score :  {'clicks': 0.510665, 'carts': 0.35933, 'orders': 0.59516}
cv :  0.5159615


# make_sub

In [16]:
# Load every experiment's k30 submission once; the files do not depend on the
# event type, so there is no need to re-read them inside the per-type loop.
blend_input = None
for i, exp in enumerate(use_exps):
    exp_sub = pd.read_csv(os.path.join(cfg.output_dir, f"{exp}/t88_{exp}_sub_k30.csv"))
    if i == 0:
        exp_sub.columns = ['session_type', 'labels0']
        blend_input = exp_sub
    else:
        # Row alignment relies on every file listing sessions in the same order.
        blend_input[f'labels{i}'] = exp_sub['labels']

dfs = []
for type_ in ["clicks", "carts", "orders"]:
    print(type_)
    weights = best_weights_dict[type_]
    # .copy() so the column assignment below targets an independent frame
    # rather than a filtered slice (avoids chained-assignment warnings).
    type_sub = blend_input[blend_input["session_type"].str.contains(type_)].copy()
    type_sub['labels'] = type_sub.apply(cust_blend, W = weights, axis=1)
    # Keep only the submission columns; the per-experiment labels{i} helper
    # columns must not leak into the written file, which follows the same
    # two-column layout as the k30 files read above.
    dfs.append(type_sub[["session_type", "labels"]])

# sort_index() restores the original row order across the three type slices.
sub = pd.concat(dfs).sort_index()
assert all(sub["session_type"] == pd.read_csv(os.path.join(cfg.input_dir, "sample_submission.csv"))["session_type"])
sub.to_csv(os.path.join(cfg.output_dir, f"{cfg.exp_name}/t88_{cfg.exp_name}_sub_k30.csv"), index=False)

clicks
carts
orders
