# cherrypick001

In [1]:
# Experiment name whose prediction file supplies the rows for each event type.
use_exp = dict(
    clicks="exp053_pl",
    carts="exp053_pl",
    orders="exp059",
)
# Score recorded for each event type (presumably the CV score of the
# experiment chosen above -- confirm against the experiment logs).
cv_score = dict(
    clicks=0.507206,
    carts=0.354493,
    orders=0.591674,
)

In [2]:
# Weighted sum of the per-type scores (weights 0.1 / 0.3 / 0.6; presumably the
# competition metric weights -- confirm). Stored as a 6-decimal string.
weighted_total = (
    0.1 * cv_score["clicks"]
    + 0.3 * cv_score["carts"]
    + 0.6 * cv_score["orders"]
)
cv_score["total"] = f"{weighted_total:.6f}"

In [3]:
import os
import sys
import traceback
import gc
import random
import pickle
import pathlib
import subprocess
from dataclasses import dataclass
from dotenv import load_dotenv
# Load variables from a .env file into the process environment so that the
# os.getenv(...) lookups in this notebook (UTILS_PATH, INPUT_DIR, OUTPUT_DIR,
# PREP_DIR) can see them. The function must actually be called: a bare
# `load_dotenv` expression only references the function and loads nothing.
load_dotenv()
# Make the project-local helper modules imported further down importable.
sys.path.append(os.getenv('UTILS_PATH'))
from tqdm import tqdm
import multiprocessing
import inspect

import pandas as pd
import polars as pl
import numpy as np
import itertools
import cudf
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
import line_notify
import my_logger
from noglobal import noglobal

# 設定

In [4]:
@dataclass
class Cfg:
    # Experiment settings. None of the attributes below carry a type
    # annotation, so @dataclass registers no fields: they are plain class
    # attributes shared by every instance, and Cfg() takes no arguments.

    # --- general ---
    loglevel = "INFO"
    exp_name = "cherrypick001"
    seed = 42

    # --- candidate / sampling sizes ---
    k = 20
    cand_n = 15
    negative_sample = 1
    train_chunk_n_dict = {
        "clicks": 2,
        "carts": 1,
        "orders": 1,
    }
    test_chunk_n = 2

    # --- event type <-> integer id lookups ---
    type2id = {
        "clicks": 0,
        "carts": 1,
        "orders": 2,
    }
    id2type = {
        0: "clicks",
        1: "carts",
        2: "orders",
    }

    # --- data split ---
    train_week = "week3"
    valid_week = "week4"
    valid_session_n = 100_000

    # --- directories, read from the environment when the class body runs;
    #     each is None if the corresponding variable is not set ---
    input_dir = os.getenv('INPUT_DIR')
    output_dir = os.getenv('OUTPUT_DIR')
    prep_dir = os.getenv("PREP_DIR")

    # --- per-type model parameter dicts (the keys look like LightGBM
    #     parameters; not referenced by the visible cells -- confirm usage) ---
    clicks_params = {
        'objective': 'binary',
        'boosting': 'gbdt',
        'learning_rate': 0.1,
        'metric': 'binary_logloss',
        'seed': 42,
        'feature_pre_filter': False,
        'lambda_l1': 5.485903737168179,
        'lambda_l2': 0.005594683492536064,
        'num_leaves': 79,
        'feature_fraction': 0.552,
        'bagging_fraction': 0.9295272232672004,
        'bagging_freq': 2,
        'min_child_samples': 10,
    }
    carts_params = {
        'objective': 'binary',
        'boosting': 'gbdt',
        'learning_rate': 0.1,
        'metric': 'binary_logloss',
        'seed': 42,
        'feature_pre_filter': False,
        'lambda_l1': 8.709050252544463,
        'lambda_l2': 0.06935262036337767,
        'num_leaves': 252,
        'feature_fraction': 0.4,
        'bagging_fraction': 1.0,
        'bagging_freq': 0,
        'min_child_samples': 5,
    }
    orders_params = {
        'objective': 'binary',
        'boosting': 'gbdt',
        'learning_rate': 0.1,
        'metric': 'binary_logloss',
        'seed': 42,
        'feature_pre_filter': False,
        'lambda_l1': 9.356310279757256,
        'lambda_l2': 1.3120983078968551e-08,
        'num_leaves': 174,
        'feature_fraction': 0.5,
        'bagging_fraction': 1.0,
        'bagging_freq': 0,
        'min_child_samples': 20,
    }

cfg = Cfg()

# Create <output_dir>/<exp_name> and its "cache" subdirectory if they are missing.
# Note: os.path.join raises TypeError here when cfg.output_dir is None
# (i.e. OUTPUT_DIR was not set in the environment).
for _subdirs in ((cfg.exp_name,), (cfg.exp_name, "cache")):
    os.makedirs(os.path.join(cfg.output_dir, *_subdirs), exist_ok=True)

# Seed Python's built-in RNG (only `random` is seeded in this cell).
random.seed(cfg.seed)

# Logger from the project-local my_logger helper, initialised with the experiment name.
logger = my_logger.init_logger(cfg.exp_name)

In [5]:


# Build the submission.
# For every event type, take that type's rows from the prediction file of the
# experiment selected in `use_exp`, then align them to the row order of
# sample_submission.csv and write the combined file under this experiment's
# output directory.
sub = pd.read_csv(
    os.path.join(cfg.input_dir, "sample_submission.csv"),
    usecols=["session_type"],
)

# path -> DataFrame; several event types may point at the same experiment
# file, so each file is parsed only once.
pred_cache = {}
dfs = []
for type_ in ["clicks", "carts", "orders"]:
    exp = use_exp[type_]
    # os.path.join works whether or not the configured directory ends with a
    # separator (plain string concatenation silently produced a wrong path
    # when it did not).
    pred_file_path = os.path.join(cfg.output_dir, exp, f"t88_{exp}_sub_k30.csv")
    print(type_, pred_file_path)
    if pred_file_path not in pred_cache:
        pred_cache[pred_file_path] = pd.read_csv(pred_file_path)
    pred = pred_cache[pred_file_path]
    # Keep only the rows for this event type. A literal suffix match on
    # "_<type>" is used instead of a regex substring search.
    pred = pred[pred["session_type"].str.endswith(f"_{type_}")].copy()
    dfs.append(pred)
preds = pd.concat(dfs)

# Fail loudly instead of writing a submission with empty label rows: with a
# left merge, any session_type absent from the predictions would come out NaN.
missing_mask = ~sub["session_type"].isin(preds["session_type"])
if missing_mask.any():
    raise ValueError(
        f"{int(missing_mask.sum())} session_type rows have no prediction, "
        f"e.g. {sub.loc[missing_mask, 'session_type'].head(3).tolist()}"
    )

# validate="one_to_one" raises pandas.errors.MergeError if a session_type is
# duplicated on either side, which would otherwise inflate the row count.
sub = sub.merge(preds, on="session_type", how="left", validate="one_to_one")
sub.to_csv(
    os.path.join(cfg.output_dir, cfg.exp_name, f"t88_{cfg.exp_name}_sub_k30.csv"),
    index=False,
)

clicks /mnt/otto-recommender-system/output/exp053_pl/t88_exp053_pl_sub_k30.csv
carts /mnt/otto-recommender-system/output/exp053_pl/t88_exp053_pl_sub_k30.csv
orders /mnt/otto-recommender-system/output/exp059/t88_exp059_sub_k30.csv


In [8]:
# Preview the first rows of the merged submission frame.
sub.head()

Unnamed: 0,session_type,labels
0,12899779_clicks,59625 737445 448688 1340695 941596 601769 1790...
1,12899779_carts,59625 448688 731692 737445 941596 729915 60176...
2,12899779_orders,59625 731692 448688 941596 729915 1790770 1340...
3,12899780_clicks,1142000 736515 582732 487136 889686 1502122 15...
4,12899780_carts,1142000 582732 736515 973453 1502122 487136 59...


In [7]:
# Echo which experiment was used per event type and the recorded scores.
for _caption, _mapping in (("use_exp : ", use_exp), ("cv_score : ", cv_score)):
    print(_caption, _mapping)

use_exp :  {'clicks': 'exp053_pl', 'carts': 'exp053_pl', 'orders': 'exp059'}
cv_score :  {'clicks': 0.507206, 'carts': 0.354493, 'orders': 0.591674, 'total': '0.512073'}
