# team_ensemble001

In [4]:
import os
import sys
import traceback
import gc
import random
import pickle
import pathlib
import subprocess
from dataclasses import dataclass
from dotenv import load_dotenv
# Actually invoke load_dotenv: a bare `load_dotenv` only names the function and
# loads nothing, so variables defined in a .env file never reached os.getenv.
load_dotenv()
# Put the directory named by UTILS_PATH on the import path. Skip it when the
# variable is unset, so that None is never appended to sys.path.
utils_path = os.getenv('UTILS_PATH')
if utils_path:
    sys.path.append(utils_path)
from tqdm import tqdm
import multiprocessing
import inspect

import pandas as pd
import polars as pl
import numpy as np
import itertools
import cudf
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
import line_notify
import my_logger
from noglobal import noglobal

# 設定

In [5]:
@dataclass
class Cfg:
    """Settings container for the ``team_ensemble001`` experiment.

    None of the attributes below are type-annotated, so ``@dataclass`` creates
    no fields: every value is a plain class attribute shared by all instances,
    and ``Cfg()`` takes no arguments.
    """

    # Run identification and reproducibility.
    loglevel = "INFO"
    exp_name = "team_ensemble001"   # used to name the output directory and files
    run_inf = True
    seed = 42                       # fed to random.seed() right after instantiation

    # Candidate / sampling settings (not read by the cells visible here).
    k = 20
    cand_n = 15
    negative_sample = 1
    train_chunk_n_dict = {"clicks": 2, "carts": 1, "orders": 1}
    test_chunk_n = 5

    # Event-type name <-> integer id, kept as two mirrored lookup tables.
    type2id = {"clicks": 0, "carts": 1, "orders": 2}
    id2type = {type_id: type_name for type_name, type_id in type2id.items()}

    # Data split settings.
    train_week = "week3"
    valid_week = "week4"
    valid_session_n = 100_000

    # Directories are read from the environment when the class body executes;
    # each is None if the corresponding variable is not set.
    input_dir = os.getenv('INPUT_DIR')
    output_dir = os.getenv('OUTPUT_DIR')
    prep_dir = os.getenv("PREP_DIR")

    # One parameter dict per event type (presumably LightGBM parameters, judging
    # by the keys — they are not consumed in the cells visible here).
    clicks_params = {
        'objective': 'binary',
        'boosting': 'gbdt',
        'learning_rate': 0.1,
        'metric': 'binary_logloss',
        'seed': 42,
        'feature_pre_filter': False,
        'lambda_l1': 5.485903737168179,
        'lambda_l2': 0.005594683492536064,
        'num_leaves': 79,
        'feature_fraction': 0.552,
        'bagging_fraction': 0.9295272232672004,
        'bagging_freq': 2,
        'min_child_samples': 10,
    }
    carts_params = {
        'objective': 'binary',
        'boosting': 'gbdt',
        'learning_rate': 0.1,
        'metric': 'binary_logloss',
        'seed': 42,
        'feature_pre_filter': False,
        'lambda_l1': 8.709050252544463,
        'lambda_l2': 0.06935262036337767,
        'num_leaves': 252,
        'feature_fraction': 0.4,
        'bagging_fraction': 1.0,
        'bagging_freq': 0,
        'min_child_samples': 5,
    }
    orders_params = {
        'objective': 'binary',
        'boosting': 'gbdt',
        'learning_rate': 0.1,
        'metric': 'binary_logloss',
        'seed': 42,
        'feature_pre_filter': False,
        'lambda_l1': 9.356310279757256,
        'lambda_l2': 1.3120983078968551e-08,
        'num_leaves': 174,
        'feature_fraction': 0.5,
        'bagging_fraction': 1.0,
        'bagging_freq': 0,
        'min_child_samples': 20,
    }

# Instantiate the settings object used by every later cell.
cfg = Cfg()

# Creating the nested "cache" directory also creates the experiment directory
# above it, so a single makedirs call prepares the whole output tree.
exp_dir = os.path.join(cfg.output_dir, cfg.exp_name)
os.makedirs(os.path.join(exp_dir, "cache"), exist_ok=True)

# Seed Python's built-in RNG from the configured seed.
random.seed(cfg.seed)

logger = my_logger.init_logger(cfg.exp_name)

In [6]:
# Submission CSVs to blend, in the same order as `weights` below.
# NOTE(review): three of these are absolute paths on a specific machine;
# consider moving them under a configurable directory.
input_sub_path = [
    "/mnt/otto-recommender-system/subs/deho_catboost_ensemble_002.csv",
    # Joined with os.path.join (as the output directory is elsewhere) so the
    # path is correct whether or not OUTPUT_DIR ends with a separator.
    os.path.join(cfg.output_dir, "exp062", "t88_exp062_sub_k30.csv"),
    "/mnt/otto-recommender-system/subs/chi_exp400.csv",
    "/mnt/otto-recommender-system/subs/zako_030_lgbm_rank_stacking_sub_30.csv",
]
# Blend weight of each submission; weights[i] applies to input_sub_path[i].
weights = [2.0, 1.0, 2.0, 0.5]
assert len(weights) == len(input_sub_path), "need exactly one weight per submission file"

In [7]:
# Collect every submission into one wide frame: `session_type` plus one
# `labels{i}` column per input file, where i is the file's position in
# `input_sub_path`.
for i, path in enumerate(input_sub_path):
    # Sort by key and reset the index so that all files share one row order.
    part = pd.read_csv(path).sort_values('session_type').reset_index(drop=True)
    if i == 0:
        # The first file seeds the frame; its two columns are renamed by position.
        sub = part
        sub.columns = ['session_type', 'labels0']
    else:
        # Columns are attached by row position, so the keys must match exactly.
        # Without this check, differing files would be blended against the
        # wrong sessions or filled with NaN.
        if not part['session_type'].equals(sub['session_type']):
            raise ValueError(
                f"session_type values in {path} do not match those in {input_sub_path[0]}"
            )
        sub[f'labels{i}'] = part['labels']

In [8]:
def cust_blend(dt, W=(1, 1, 1, 1), base=3, k=20):
    """Blend several ranked label lists into one by rank-weighted voting.

    For every list index ``M`` in ``range(len(W))`` the value ``dt[f'labels{M}']``
    is split on whitespace. An item at 0-based position ``n`` of list ``M``
    contributes ``W[M] / (n + base)`` to that item's score. Items are returned
    in order of descending total score; ties keep first-seen order.

    Args:
        dt: Mapping-like row (e.g. a DataFrame row or a dict) indexable by
            ``'labels0'`` ... ``f'labels{len(W) - 1}'``; each value is a
            space-separated string of items. A missing value (None or NaN) is
            treated as an empty list.
        W: One weight per label list.
        base: Offset added to the position in the denominator; larger values
            flatten the difference between early and late positions.
        k: Maximum number of items to return (default 20).

    Returns:
        The top ``k`` items joined by single spaces.
    """
    scores = {}
    for m, weight in enumerate(W):
        labels = dt[f'labels{m}']
        # pandas reads an empty CSV cell as NaN, and NaN is the only value that
        # is unequal to itself; skip it rather than fail on `.split()`.
        if labels is None or labels != labels:
            continue
        for rank, item in enumerate(labels.split()):
            scores[item] = scores.get(item, 0) + weight / (rank + base)

    # sorted() is stable, so equal scores stay in insertion (first-seen) order.
    ranked = sorted(scores, key=lambda item: -scores[item])
    return ' '.join(ranked[:k])

# Blend the per-model label columns of each row into the final `labels`
# column, using the weights configured above, then preview the result.
sub['labels'] = sub.apply(lambda row: cust_blend(row, W=weights), axis=1)
sub.head()

Unnamed: 0,session_type,labels0,labels1,labels2,labels3,labels
0,12899779_carts,59625 448688 729915 1103941 578742 737445 8941...,59625 729915 448688 1790770 941596 731692 1340...,59625 448688 731692 1790770 475447 1253524 737...,59625 448688 729915 469285 1103941 894169 6017...,59625 448688 729915 731692 1790770 737445 1103...
1,12899779_clicks,59625 737445 448688 894169 1103941 729915 1660...,59625 737445 448688 1340695 1790770 941596 894...,59625 1253524 737445 448688 731692 1340695 179...,59625 448688 894169 729915 1103941 601769 7374...,59625 737445 448688 894169 1340695 1253524 110...
2,12899779_orders,59625 448688 729915 1103941 578742 894169 4692...,59625 448688 1790770 731692 729915 601769 9415...,59625 731692 448688 1790770 475447 1103941 125...,59625 448688 729915 1103941 469285 894169 6017...,59625 448688 731692 1103941 1790770 729915 894...
3,12899780_carts,1142000 582732 736515 973453 1360606 889686 48...,1142000 487136 582732 973453 760500 736515 595...,1142000 736515 582732 973453 487136 1758603 15...,1142000 736515 582732 973453 1360606 889686 15...,1142000 582732 736515 973453 487136 1758603 88...
4,12899780_clicks,1142000 736515 582732 973453 889686 1360606 15...,1142000 736515 582732 487136 1502122 889686 97...,1142000 736515 582732 973453 1502122 487136 77...,1142000 736515 582732 973453 889686 1758603 14...,1142000 736515 582732 973453 889686 1502122 48...


In [11]:
# Write the blended submission (key + final labels only) into the experiment's
# output directory. os.path.join matches how that directory was created and
# does not depend on OUTPUT_DIR ending with a path separator.
sub[["session_type", "labels"]].to_csv(
    os.path.join(cfg.output_dir, cfg.exp_name, f"t88_{cfg.exp_name}_sub.csv"),
    index=False,
)