In [1]:
import os, sys

import pandas as pd
import polars as pl

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import scipy

import sklearn
import lightgbm as lgb
import xgboost as xgb
import catboost as cb

import dproc, sgml, sgnn, sgpp, sgutil, custpp

print(sys.version)
for i in [pd, pl, mpl, sns, np, scipy, sklearn, lgb, xgb, cb]:
    try:
        print(i.__name__, i.__version__)
    except:
        pass

2025-02-10 05:41:06.514803: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1739166066.526883   44087 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1739166066.530391   44087 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-10 05:41:06.542359: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


3.12.6 (main, Sep 30 2024, 02:19:13) [GCC 9.4.0]
pandas 2.2.3
polars 1.12.0
matplotlib 3.8.4
seaborn 0.13.2
numpy 1.26.4
scipy 1.12.0
sklearn 1.5.2
xgboost 2.1.2
catboost 1.2.5


In [2]:
from itertools import combinations

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, KFold, ShuffleSplit, train_test_split
from sklearn.preprocessing import TargetEncoder
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression

In [3]:
sc = sgutil.SGCache('img', 'result')
ss = ShuffleSplit(n_splits = 1, train_size = 0.8, random_state = 123)
kf = KFold(5, random_state = 123, shuffle=True)

files = {
    'train': os.path.join('data', 'train.csv'),
    'train_extra': os.path.join('data', 'training_extra.csv'),
    'test': os.path.join('data', 'test.csv'),
}

t = sc.cache_result(
    'pipeline_2',
    lambda : make_pipeline(
        sgpp.PolarsProcessor(), 
        sgpp.ExprProcessor({
            'Compartments_c' : pl.col('Compartments').cast(pl.String).cast(pl.Categorical)
        }),
        sgpp.PandasCoverter(index_col = 'id'),
        sgpp.ApplyWrapper(
            sgpp.CatArrangerFreq(1, na_value = 'Unknown'),
            ['Brand', 'Material', 'Size', 'Laptop Compartment', 'Waterproof', 'Style', 'Color']
        ), 
        custpp.WeightCapacityProcessor()
    ).fit(files['train']),
    rerun = 1
)
df_train = pd.concat(
    [t.transform(files['train']), t.transform(files['train_extra'])], axis = 0
)
df_test = t.transform(files['test'])

target = 'Price'
X_cat = ['Brand', 'Material', 'Size', 'Laptop Compartment', 'Waterproof', 'Style', 'Color', 'Compartments_c']
X_num = ['Weight Capacity (kg)']

In [4]:
def get_validation_splitter(validation_fraction):
    return lambda x: train_test_split(x, test_size = validation_fraction)

config = {
    'predict_func': lambda m, df, X: pd.Series(m.predict(df[X]), index = df.index).clip(15, 150),
    'score_func': lambda df, prds: root_mean_squared_error(df[target].sort_index(), prds.sort_index()),
    'validation_splitter': get_validation_splitter,
    'progress_callback': sgml.ProgressCallBack(), 
    'return_train_scores': True,
    'y': target,
}

cb_adapter = sgml.CBAdapter(cb.CatBoostRegressor)
lr_adapter = sgml.SklearnAdapter(LinearRegression)
lgb_adapter = sgml.LGBMAdapter(lgb.LGBMRegressor)
xgb_adapter = sgml.XGBAdapter(xgb.XGBRegressor)

In [5]:
tgt = TargetEncoder(smooth = 35, random_state = 123)
for train_idx, valid_idx in kf.split(df_train, df_train[target]):
    df_cv_train, df_valid = df_train.iloc[train_idx], df_train.iloc[valid_idx]
    bidx = df_valid['Weight Capacity (kg)'].isin(df_cv_train['Weight Capacity (kg)'].unique()) & df_valid['Weight Capacity (kg)'].notna() &\
            df_valid['Weight Capacity (kg)'].between(5, 30, inclusive = 'neither')
    df_valid, df_valid2 = df_valid.loc[bidx], df_valid.loc[~bidx]
    tgt.fit(df_cv_train[['wc_i2']], df_cv_train[target].clip(df_cv_train[target].quantile(0.018), 150))
    print(
        root_mean_squared_error(df_valid[target], tgt.transform(df_valid[['wc_i2']])[:, 0]),
        root_mean_squared_error(df_valid2[target], tgt.transform(df_valid2[['wc_i2']])[:, 0]),
        df_valid2[target].std()
    )

38.629543809953795 39.29101486311971 39.316704
38.64572975855934 39.20963300258326 39.23759
38.681409998918504 39.19875053182947 39.218143
38.64570263604047 39.18592044966053 39.207096
38.651380676657986 39.23936759986119 39.262196


In [6]:
df_train[target].value_counts(normalize = True).iloc[:5]

Price
150.000000    0.018046
15.000000     0.002261
39.834351     0.000049
35.248440     0.000047
77.643898     0.000046
Name: proportion, dtype: float64

The fact that the frequencies of prices at 15 and 150 are high suggests that values have been clipped at 15 or below and at 150 or above. However, since the frequency of values at 15 or below is lower than that of values at 150 or above, the distribution exhibits an asymmetric shape. We examine the effect of adjusting the left-side clipping range of the target to match the extent of the right-side clipping, thereby achieving symmetry.

In [7]:
tgt = TargetEncoder(cv = 5, smooth = 35, target_type='continuous', random_state = 123)
for train_idx, valid_idx in ss.split(df_train, df_train[target]):
    df_cv_train, df_valid = df_train.iloc[train_idx], df_train.iloc[valid_idx]
    df_cv_train = df_cv_train.assign(
        Price = lambda x: x[target].clip(df_cv_train[target].quantile(0.018), 150),
        tgt = lambda x: tgt.fit_transform(x[['wc_i2']], df_cv_train[target])[:, 0]
    )
    df_valid = df_valid
    bidx = df_valid['Weight Capacity (kg)'].isin(df_cv_train['Weight Capacity (kg)'].unique()) & df_valid['Weight Capacity (kg)'].notna() &\
            df_valid['Weight Capacity (kg)'].between(5, 30, inclusive = 'neither')
    df_valid1, df_valid2 = df_valid.loc[bidx], df_valid.loc[~bidx]
    df_valid1 = df_valid1.assign(
        tgt = lambda x: tgt.transform(x[['wc_i2']])[:, 0]
    )
    reg_cb = sc.cache_result(
        'cb_ss_2',
        lambda : sgml.train(df_cv_train, {
                'model_params' : {'n_estimators': 500, 'learning_rate': 0.1},
                'X_num': X_num, 'X_cat': X_cat,
                #'validation_fraction': 0.1
            }, config, cb_adapter, task_type = 'GPU'), rerun = 0
    )
    reg_cb2 = sc.cache_result(
        'cb2_ss_2',
        lambda : sgml.train(df_cv_train, {
                'model_params' : {'n_estimators': 500, 'learning_rate': 0.1},
                'X_cat': X_cat, 'X_num': ['tgt'],
                # 'validation_fraction': 0.1
            }, config, cb_adapter, task_type = 'GPU'), rerun = 0
    )

    s_merge = pd.concat([
        pd.Series(
            make_pipeline(reg_cb2[0]['preprocessor'], reg_cb2[0]['model']).predict(df_valid1[reg_cb2[1]]), index = df_valid1.index
        ),
        pd.Series(
            make_pipeline(reg_cb[0]['preprocessor'], reg_cb[0]['model']).predict(df_valid2[reg_cb[1]]), index = df_valid2.index
        )
    ], axis = 0)
    print(
        root_mean_squared_error(df_valid[target].sort_index(), s_merge.sort_index()),
        root_mean_squared_error(
            df_valid1[target], make_pipeline(reg_cb2[0]['preprocessor'], reg_cb2[0]['model']).predict(df_valid1[reg_cb2[1]])
        ),
        root_mean_squared_error(df_cv_train[target], df_cv_train['tgt']),
        root_mean_squared_error(df_valid1[target], df_valid1['tgt']),
        root_mean_squared_error(
            df_valid2[target], make_pipeline(reg_cb[0]['preprocessor'], reg_cb[0]['model']).predict(df_valid2[reg_cb[1]])
        ), df_valid2[target].std(), bidx.mean()
    )


38.70549105651613 38.59238055257855 38.73538496482012 38.62967410652954 39.222756662786296 39.316704 0.8217593983456508


In [8]:
reg_cb = sc.cache_result(
    'cb_2',
    lambda : sgml.train(df_train.assign(Price = lambda x: x['Price'].clip(x['Price'].quantile(0.018), 150)), {
            'model_params' : {'n_estimators': 500, 'learning_rate': 0.1},
            'X_num': X_num, 'X_cat': X_cat,
            #'validation_fraction': 0.1
        }, config, cb_adapter, task_type = 'GPU')
)
reg_cb2 = sc.cache_result(
    'cb2_2',
    lambda : sgml.train(df_train.assign(Price = lambda x: x['Price'].clip(x['Price'].quantile(0.018), 150)), {
            'model_params' : {'n_estimators': 500, 'learning_rate': 0.1},
            'X_cat': X_cat, 'X_tgt': ['Weight Capacity (kg)'], 'tgt': {'cv': 5, 'smooth': 35, 'random_state': 123}
            #'validation_fraction': 0.1
        }, config, cb_adapter, task_type = 'GPU')
)

In [9]:
bidx = df_test['Weight Capacity (kg)'].isin(df_train['Weight Capacity (kg)'].unique()) & df_test['Weight Capacity (kg)'].notna() &\
            df_test['Weight Capacity (kg)'].between(5, 30, inclusive = 'neither')
df_test1, df_test2 = df_test.loc[bidx], df_test.loc[~bidx]
pd.concat([
    pd.Series(
        make_pipeline(reg_cb2[0]['preprocessor'], reg_cb2[0]['model']).predict(df_test1[reg_cb2[1]]), index = df_test1.index
    ),
    pd.Series(
        make_pipeline(reg_cb[0]['preprocessor'], reg_cb[0]['model']).predict(df_test2[reg_cb[1]]), index = df_test2.index
    )
], axis = 0).rename(target).sort_index().to_frame().to_csv(os.path.join('result', 'submission4.csv'))

In [10]:
# LB: 38.91003
#!kaggle competitions submit -c playground-series-s5e2 -f result/submission4.csv -m "4"

In [11]:
tgt = TargetEncoder(cv = 5, smooth = 35, target_type='continuous', random_state = 123)
for train_idx, valid_idx in ss.split(df_train, df_train[target]):
    df_cv_train, df_valid = df_train.iloc[train_idx], df_train.iloc[valid_idx]
    df_cv_train = df_cv_train.assign(
        Price = lambda x: x[target].clip(df_cv_train[target].quantile(0.018), 150),
        tgt = lambda x: tgt.fit_transform(x[['wc_i2']], df_cv_train[target])[:, 0]
    )
    df_valid = df_valid.assign(
        tgt = lambda x: tgt.transform(x[['wc_i2']])[:, 0]
    )
    bidx = df_valid['Weight Capacity (kg)'].isin(df_cv_train['Weight Capacity (kg)'].unique()) & df_valid['Weight Capacity (kg)'].notna() &\
            df_valid['Weight Capacity (kg)'].between(5, 30, inclusive = 'neither')
    df_valid1, df_valid2 = df_valid.loc[bidx], df_valid.loc[~bidx]
    reg_cb = sc.cache_result(
        'cb_ss_2',
        lambda : sgml.train(df_cv_train, {
                'model_params' : {'n_estimators': 500, 'learning_rate': 0.1},
                'X_num': X_num, 'X_cat': X_cat,
                #'validation_fraction': 0.1
            }, config, cb_adapter, task_type = 'GPU'), rerun = 0
    )
    reg_cb2 = sc.cache_result(
        'cb2_ss_2',
        lambda : sgml.train(df_cv_train, {
                'model_params' : {'n_estimators': 500, 'learning_rate': 0.1},
                'X_cat': X_cat, 'X_num': ['tgt'],
                # 'validation_fraction': 0.1
            }, config, cb_adapter, task_type = 'GPU'), rerun = 0
    )

    s_merge = pd.concat([
        pd.Series(
            make_pipeline(reg_cb2[0]['preprocessor'], reg_cb2[0]['model']).predict(df_valid1[reg_cb2[1]]), index = df_valid1.index
        ),
        pd.Series(
            make_pipeline(reg_cb[0]['preprocessor'], reg_cb[0]['model']).predict(df_valid2[reg_cb[1]]), index = df_valid2.index
        )
    ], axis = 0)
    s_add = (
        pd.Series(
            make_pipeline(reg_cb2[0]['preprocessor'], reg_cb2[0]['model']).predict(df_valid[reg_cb2[1]]), index = df_valid.index
        ) +
        pd.Series(
            make_pipeline(reg_cb[0]['preprocessor'], reg_cb[0]['model']).predict(df_valid[reg_cb[1]]), index = df_valid.index
        )
    ) / 2
    print(
        root_mean_squared_error(df_valid[target].sort_index(), s_merge.sort_index()),
        root_mean_squared_error(df_valid[target].sort_index(), s_add.sort_index()),
        root_mean_squared_error(
            df_valid1[target], make_pipeline(reg_cb2[0]['preprocessor'], reg_cb2[0]['model']).predict(df_valid1[reg_cb2[1]])
        ),
        root_mean_squared_error(df_cv_train[target], df_cv_train['tgt']),
        root_mean_squared_error(df_valid1[target], df_valid1['tgt']),
        root_mean_squared_error(
            df_valid2[target], make_pipeline(reg_cb[0]['preprocessor'], reg_cb[0]['model']).predict(df_valid2[reg_cb[1]])
        ), df_valid2[target].std(), bidx.mean()
    )


38.70549105651613 38.74319418961613 38.59238055257855 38.73538496482012 38.62967410652954 39.222756662786296 39.316704 0.8217593983456508


We used different models depending on whether Weight Capacity appeared only in the test set or if it had been clipped.

This approach proved to be highly effective. Now, we aim to capture this characteristic using a single model.

To achieve this, we define a variable for target encoding:
Among the Weight Capacity (kg) values in the training set, those that appear in the test set—excluding 5 and 30, which resulted from clipping—are assigned specific values to apply the effect of target encoding.
For all other cases, the values are set to 0, ensuring that target encoding is applied accordingly.

We then verify whether the intended effect is achieved using a single model.

In [12]:
tgt = TargetEncoder(cv = 5, smooth = 35, target_type='continuous', random_state = 123)
for train_idx, valid_idx in ss.split(df_train, df_train[target]):
    df_cv_train, df_valid = df_train.iloc[train_idx], df_train.iloc[valid_idx]
    df_cv_train = df_cv_train.assign(
        wc_i3 = lambda x: x['Weight Capacity (kg)'] * x['Weight Capacity (kg)'].isin(
            df_valid['Weight Capacity (kg)'].pipe(lambda x: x.loc[x.between(5, 30, inclusive = 'neither')]).unique()
        )
    )
    df_valid = df_valid.assign(
        wc_i3 = lambda x: x['Weight Capacity (kg)'] * x['Weight Capacity (kg)'].isin(
            df_cv_train['Weight Capacity (kg)'].pipe(lambda x: x.loc[x.between(5, 30, inclusive = 'neither')]).unique()
        )
    )
    reg_cb = sc.cache_result(
        'cb_ss_3',
        lambda : sgml.train(df_cv_train, {
                'model_params' : {'n_estimators': 500, 'learning_rate': 0.1},
                'X_num': X_num, 'X_cat': X_cat, 'X_tgt': ['wc_i3'], 'tgt': {'cv': 5, 'smooth': 35, 'random_state': 123}
                #'validation_fraction': 0.1
            }, config, cb_adapter, task_type = 'GPU'), rerun = 1
    )
    print(
        root_mean_squared_error(df_valid[target],
            make_pipeline(reg_cb[0]['preprocessor'], reg_cb[0]['model']).predict(df_valid[reg_cb[1]])
        )
    )

38.69653651784812


The effectiveness is observed. We examine its effectiveness by making a submission.

In [123]:
(df_cv_train['wc_i3'] == 0).mean(), (df_valid['wc_i3'] == 0).mean()

(0.03259083314850043, 0.2112469116291733)

# Submission 5

In [61]:
df_train = df_cv_train.assign(
    wc_i3 = lambda x: x['Weight Capacity (kg)'] * x['Weight Capacity (kg)'].isin(
        df_test['Weight Capacity (kg)'].pipe(lambda x: x.loc[x.between(5, 30, inclusive = 'neither')]).unique()
    )
)
df_test = df_test.assign(
    wc_i3 = lambda x: x['Weight Capacity (kg)'] * x['Weight Capacity (kg)'].isin(
        df_train['Weight Capacity (kg)'].pipe(lambda x: x.loc[x.between(5, 30, inclusive = 'neither')]).unique()
    )
)

In [62]:
reg_cb = sc.cache_result(
    'cb_3',
    lambda : sgml.train(df_train, {
            'model_params' : {'n_estimators': 500, 'learning_rate': 0.1},
            'X_num': X_num, 'X_cat': X_cat, 'X_tgt': ['wc_i3'], 'tgt': {'cv': 5, 'smooth': 35, 'random_state': 123}
            #'validation_fraction': 0.1
        }, config, cb_adapter, task_type = 'GPU'), rerun = 0
)

In [63]:
pd.Series(
    make_pipeline(reg_cb[0]['preprocessor'], reg_cb[0]['model']).predict(df_test[reg_cb[1]]), index = df_test.index, name = target
).sort_index().to_frame().to_csv(os.path.join('result', 'submission5.csv'))

In [None]:
# LB: 38.92596
#!kaggle competitions submit -c playground-series-s5e2 -f result/submission5.csv -m "5"

In [68]:
(df_train['wc_i3'] == 0).mean(), (df_test['wc_i3'] == 0).mean()

(0.41969873451471995, 0.17584)

On the leaderboard (LB), it actually showed a negative effect. Since the pattern of Weight Capacity differs from that during validation, the expected results were not achieved.

In [15]:
tgt = TargetEncoder(cv = 5, smooth = 35, target_type='continuous', random_state = 123)
for train_idx, valid_idx in ss.split(df_train, df_train[target]):
    df_cv_train, df_valid = df_train.iloc[train_idx], df_train.iloc[valid_idx]
    df_cv_train = df_cv_train.assign(tgt = lambda x: tgt.fit_transform(x[['wc_i2']], df_cv_train[target])[:, 0])
    df_valid = df_valid.assign(tgt = lambda x: tgt.transform(x[['wc_i2']])[:, 0])
    bidx = df_valid['Weight Capacity (kg)'].isin(df_cv_train['Weight Capacity (kg)'].unique()) & df_valid['Weight Capacity (kg)'].notna() &\
            df_valid['Weight Capacity (kg)'].between(5, 30, inclusive = 'neither')
    df_valid1, df_valid2 = df_valid.loc[bidx], df_valid.loc[~bidx]
    reg_cb = sc.cache_result(
        'cb_ss_4',
        lambda : sgml.train(df_cv_train, {
                'model_params' : {'n_estimators': 500, 'learning_rate': 0.1},
                'X_num': X_num, 'X_cat': X_cat,
                #'validation_fraction': 0.1
            }, config, cb_adapter, task_type = 'GPU'), rerun = 1
    )
    reg_cb2 = sc.cache_result(
        'cb2_ss_4',
        lambda : sgml.train(df_cv_train, {
                'model_params' : {'n_estimators': 500, 'learning_rate': 0.1},
                'X_cat': X_cat, 'X_num': ['tgt', 'Weight Capacity (kg)'],
                # 'validation_fraction': 0.1
            }, config, cb_adapter, task_type = 'GPU'), rerun = 1
    )

    s_merge = pd.concat([
        pd.Series(
            make_pipeline(reg_cb2[0]['preprocessor'], reg_cb2[0]['model']).predict(df_valid1[reg_cb2[1]]), index = df_valid1.index
        ),
        pd.Series(
            make_pipeline(reg_cb[0]['preprocessor'], reg_cb[0]['model']).predict(df_valid2[reg_cb[1]]), index = df_valid2.index
        )
    ], axis = 0)
    s_add = (
        pd.Series(
            make_pipeline(reg_cb2[0]['preprocessor'], reg_cb2[0]['model']).predict(df_valid[reg_cb2[1]]), index = df_valid.index
        ) +
        pd.Series(
            make_pipeline(reg_cb[0]['preprocessor'], reg_cb[0]['model']).predict(df_valid[reg_cb[1]]), index = df_valid.index
        )
    ) / 2
    print(
        root_mean_squared_error(df_valid[target].sort_index(), s_merge.sort_index()),
        root_mean_squared_error(df_valid[target].sort_index(), s_add.sort_index()),
        root_mean_squared_error(
            df_valid1[target], make_pipeline(reg_cb2[0]['preprocessor'], reg_cb2[0]['model']).predict(df_valid1[reg_cb2[1]])
        ),
        root_mean_squared_error(df_cv_train[target], df_cv_train['tgt']),
        root_mean_squared_error(df_valid1[target], df_valid1['tgt']),
        root_mean_squared_error(
            df_valid2[target], make_pipeline(reg_cb[0]['preprocessor'], reg_cb[0]['model']).predict(df_valid2[reg_cb[1]])
        ), df_valid2[target].std(), bidx.mean()
    )

38.696492389789974 38.74008770455407 38.58149839302241 38.772034610161064 38.62967410652954 39.222299940278624 39.316704 0.8217593983456508
