In [2]:
import os, sys

import pandas as pd
import polars as pl

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import scipy

import sklearn
import lightgbm as lgb
import xgboost as xgb
import catboost as cb

import dproc, sgml, sgnn, sgpp, sgutil, custpp

print(sys.version)
for i in [pd, pl, mpl, sns, np, scipy, sklearn, lgb, xgb, cb]:
    try:
        print(i.__name__, i.__version__)
    except:
        pass

2025-02-07 09:46:41.475871: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1738921601.487647    2918 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1738921601.491290    2918 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-07 09:46:41.503304: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


3.12.6 (main, Sep 30 2024, 02:19:13) [GCC 9.4.0]
pandas 2.2.3
polars 1.12.0
matplotlib 3.8.4
seaborn 0.13.2
numpy 1.26.4
scipy 1.12.0
sklearn 1.5.2
xgboost 2.1.2
catboost 1.2.5


In [3]:
from itertools import combinations

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, KFold, ShuffleSplit, train_test_split
from sklearn.preprocessing import TargetEncoder
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression

In [4]:
sc = sgutil.SGCache('img', 'result')
ss = ShuffleSplit(n_splits = 1, train_size = 0.8, random_state = 123)
kf = KFold(5, random_state = 123, shuffle=True)

files = {
    'train': os.path.join('data', 'train.csv'),
    'train_extra': os.path.join('data', 'training_extra.csv'),
    'test': os.path.join('data', 'test.csv'),
}

t = sc.cache_result(
    'pipeline_2',
    lambda : make_pipeline(
        sgpp.PolarsProcessor(), 
        sgpp.ExprProcessor({
            'Compartments_c' : pl.col('Compartments').cast(pl.String).cast(pl.Categorical)
        }),
        sgpp.PandasCoverter(index_col = 'id'),
        sgpp.ApplyWrapper(
            sgpp.CatArrangerFreq(1, na_value = 'Unknown'),
            ['Brand', 'Material', 'Size', 'Laptop Compartment', 'Waterproof', 'Style', 'Color']
        ), 
        custpp.WeightCapacityProcessor()
    ).fit(files['train']),
    rerun = 1
)
df_train = pd.concat(
    [t.transform(files['train']), t.transform(files['train_extra'])], axis = 0
)
df_test = t.transform(files['test'])

target = 'Price'
X_cat = ['Brand', 'Material', 'Size', 'Laptop Compartment', 'Waterproof', 'Style', 'Color', 'Compartments_c']
X_num = ['Weight Capacity (kg)']

In [5]:
def get_validation_splitter(validation_fraction):
    return lambda x: train_test_split(x, test_size = validation_fraction)

config = {
    'predict_func': lambda m, df, X: pd.Series(m.predict(df[X]), index = df.index).clip(15, 150),
    'score_func': lambda df, prds: root_mean_squared_error(df[target].sort_index(), prds.sort_index()),
    'validation_splitter': get_validation_splitter,
    'progress_callback': sgml.ProgressCallBack(), 
    'return_train_scores': True,
    'y': target,
}

cb_adapter = sgml.CBAdapter(cb.CatBoostRegressor)
lr_adapter = sgml.SklearnAdapter(LinearRegression)
lgb_adapter = sgml.LGBMAdapter(lgb.LGBMRegressor)
xgb_adapter = sgml.XGBAdapter(xgb.XGBRegressor)

In [6]:
def rmse_arr_freq_nearest():
    X = ['Weight Capacity (kg)']
    tgt = TargetEncoder()
    tgt_list = list()
    cafn = sgpp.CatArrangerFreqNearest(30, df_train['Weight Capacity (kg)'].mode()[0])
    cafn.fit(df_train[X].astype('category'))
    for train_idx, valid_idx in kf.split(df_train[X], df_train[target]):
        df_cv_train, df_valid = df_train.iloc[train_idx], df_train.iloc[valid_idx]
        tgt.fit(cafn.transform(df_cv_train[X].astype('category')) , df_cv_train[target])
        tgt_list.append(
            pd.Series(tgt.transform(
                cafn.transform(df_valid[X].astype('category'))
            )[:, 0], index = df_valid.index, name = '_'.join(X))
        )
    prd = pd.concat(tgt_list)
    root_mean_squared_error(
        df_train[target], prd.sort_index()
    )
sc.cache_result(
    'rmse_arr_freq_nearest',
    rmse_arr_freq_nearest
)

In [7]:
df_test['Weight Capacity (kg)'].isin(df_train['Weight Capacity (kg)'].unique()).mean()

0.856175

In [9]:
for train_idx, valid_idx in kf.split(df_train, df_train[target]):
    df_cv_train, df_valid = df_train.iloc[train_idx], df_train.iloc[valid_idx]
    print(
        df_valid['Weight Capacity (kg)'].isin(df_cv_train['Weight Capacity (kg)'].unique()).mean()
    )

0.8373753229586012
0.8379173426265297
0.8382428047827916
0.8369632840674809
0.8375240811002638


In [10]:
tgt = TargetEncoder(smooth = 40, random_state = 123)
for train_idx, valid_idx in kf.split(df_train, df_train[target]):
    df_cv_train, df_valid = df_train.iloc[train_idx], df_train.iloc[valid_idx]
    bidx = df_valid['Weight Capacity (kg)'].isin(df_cv_train['Weight Capacity (kg)'].unique()) & df_valid['Weight Capacity (kg)'].notna() &\
            df_valid['Weight Capacity (kg)'].between(5, 30, inclusive = 'neither')
    df_valid, df_valid2 = df_valid.loc[bidx], df_valid.loc[~bidx]
    tgt.fit(df_cv_train[['wc_i2']], df_cv_train[target])
    print(
        root_mean_squared_error(df_valid[target], tgt.transform(df_valid[['wc_i2']])[:, 0]),
        root_mean_squared_error(df_valid2[target], tgt.transform(df_valid2[['wc_i2']])[:, 0]),
        df_valid2[target].std()
    )

38.63043598905003 39.29107587483704 39.316704
38.64637651763216 39.209742728006184 39.23759
38.682307868237835 39.19897770956672 39.218143
38.64663618232884 39.185949273047996 39.207096
38.65162401978249 39.239394528912044 39.262196


In [12]:
hparams = {
    'model_params' : {'n_estimators': 500, 'learning_rate': 0.1},
    'X_num': X_num, 'X_cat': X_cat,
    #'validation_fraction': 0.1
}
tgt = TargetEncoder(smooth = 40, random_state = 123)
for train_idx, valid_idx in ss.split(df_train, df_train[target]):
    df_cv_train, df_valid = df_train.iloc[train_idx], df_train.iloc[valid_idx]
    bidx = df_valid['Weight Capacity (kg)'].isin(df_cv_train['Weight Capacity (kg)'].unique()) & df_valid['Weight Capacity (kg)'].notna() &\
            df_valid['Weight Capacity (kg)'].between(5, 30, inclusive = 'neither')
    df_valid1, df_valid2 = df_valid.loc[bidx], df_valid.loc[~bidx]
    reg_cb = sc.cache_result(
        'cb_ss',
        lambda : sgml.train(df_cv_train, hparams, config, cb_adapter, task_type = 'GPU'), rerun = 0
    )
    tgt.fit(df_cv_train[['wc_i2']], df_cv_train[target])
    print(
        root_mean_squared_error(df_valid1[target], tgt.transform(df_valid1[['wc_i2']])[:, 0]),
        root_mean_squared_error(df_valid2[target], tgt.transform(df_valid2[['wc_i2']])[:, 0]),
        root_mean_squared_error(
            df_valid2[target], make_pipeline(reg_cb[0]['preprocessor'], reg_cb[0]['model']).predict(df_valid2[reg_cb[1]])
        ), df_valid2[target].std(), bidx.mean()
    )


38.6304360034108 39.29107590447395 39.22264989813489 39.316704 0.8217593983456508


In [16]:
for train_idx, valid_idx in ss.split(df_train, df_train[target]):
    df_cv_train, df_valid = df_train.iloc[train_idx], df_train.iloc[valid_idx]
    bidx = df_valid['Weight Capacity (kg)'].isin(df_cv_train['Weight Capacity (kg)'].unique()) & df_valid['Weight Capacity (kg)'].notna() &\
            df_valid['Weight Capacity (kg)'].between(5, 30, inclusive = 'neither')
    df_valid1, df_valid2 = df_valid.loc[bidx], df_valid.loc[~bidx]
    reg_cb = sc.cache_result(
        'cb_ss',
        lambda : sgml.train(df_cv_train, {
                'model_params' : {'n_estimators': 500, 'learning_rate': 0.1},
                'X_num': X_num, 'X_cat': X_cat,
                #'validation_fraction': 0.1
            }, config, cb_adapter, task_type = 'GPU'), rerun = 0
    )
    reg_cb2 = sc.cache_result(
        'cb2_ss',
        lambda : sgml.train(df_cv_train, {
                'model_params' : {'n_estimators': 500, 'learning_rate': 0.1},
                'X_cat': X_cat, 'X_tgt': ['Weight Capacity (kg)'], 'tgt': {'target_type': 'continuous', 'smooth': 40}
                #'validation_fraction': 0.1
            }, config, cb_adapter, task_type = 'GPU'), rerun = 0
    )

    s_merge = pd.concat([
        pd.Series(
            make_pipeline(reg_cb2[0]['preprocessor'], reg_cb2[0]['model']).predict(df_valid1[reg_cb2[1]]), index = df_valid1.index
        ),
        pd.Series(
            make_pipeline(reg_cb[0]['preprocessor'], reg_cb[0]['model']).predict(df_valid2[reg_cb[1]]), index = df_valid2.index
        )
    ], axis = 0)
    print(
        root_mean_squared_error(df_valid[target].sort_index(), s_merge.sort_index()),
        root_mean_squared_error(
            df_valid1[target], make_pipeline(reg_cb2[0]['preprocessor'], reg_cb2[0]['model']).predict(df_valid1[reg_cb2[1]])
        ),
        root_mean_squared_error(
            df_valid2[target], make_pipeline(reg_cb[0]['preprocessor'], reg_cb[0]['model']).predict(df_valid2[reg_cb[1]])
        ), df_valid2[target].std(), bidx.mean()
    )


38.70587429517391 38.59287181782381 39.22264989813489 39.316704 0.8217593983456508


In [17]:
root_mean_squared_error(
    df_cv_train[target], make_pipeline(reg_cb2[0]['preprocessor'], reg_cb2[0]['model']).predict(df_cv_train[reg_cb2[1]])
)

38.21544175895192

In [54]:
reg_cb = sc.cache_result(
    'cb',
    lambda : sgml.train(df_train, {
            'model_params' : {'n_estimators': 500, 'learning_rate': 0.1},
            'X_num': X_num, 'X_cat': X_cat,
            #'validation_fraction': 0.1
        }, config, cb_adapter, task_type = 'GPU')
)
reg_cb2 = sc.cache_result(
    'cb2',
    lambda : sgml.train(df_train, {
            'model_params' : {'n_estimators': 500, 'learning_rate': 0.1},
            'X_cat': X_cat, 'X_tgt': ['Weight Capacity (kg)'], 'tgt': {'target_type': 'continuous', 'smooth': 40}
            #'validation_fraction': 0.1
        }, config, cb_adapter, task_type = 'GPU')
)

In [55]:
bidx = df_test['Weight Capacity (kg)'].isin(df_train['Weight Capacity (kg)'].unique()) & df_test['Weight Capacity (kg)'].notna() &\
            df_test['Weight Capacity (kg)'].between(5, 30, inclusive = 'neither')
df_test1, df_test2 = df_test.loc[bidx], df_test.loc[~bidx]
pd.concat([
    pd.Series(
        make_pipeline(reg_cb2[0]['preprocessor'], reg_cb2[0]['model']).predict(df_test1[reg_cb2[1]]), index = df_test1.index
    ),
    pd.Series(
        make_pipeline(reg_cb[0]['preprocessor'], reg_cb[0]['model']).predict(df_test2[reg_cb[1]]), index = df_test2.index
    )
], axis = 0).rename(target).sort_index().to_frame().to_csv(os.path.join('result', 'submission1.csv'))

# Submission 1

In [58]:
# LB 38.907
#!kaggle competitions submit -c playground-series-s5e2 -f result/submission1.csv -m "1"

100%|██████████████████████████████████████| 4.74M/4.74M [00:02<00:00, 1.98MB/s]
Successfully submitted to Backpack Prediction Challenge

In [59]:
tgt = TargetEncoder(smooth = 40, random_state = 123)
tgt.fit(df_cv_train[['wc_i2']], df_cv_train[target])

# Submission 2

In [61]:
pd.concat([
    pd.Series(
        tgt.transform(df_test1[['wc_i2']])[:, 0], index = df_test1.index
    ),
    pd.Series(
        make_pipeline(reg_cb[0]['preprocessor'], reg_cb[0]['model']).predict(df_test2[reg_cb[1]]), index = df_test2.index
    )
], axis = 0).rename(target).sort_index().to_frame().to_csv(os.path.join('result', 'submission2.csv'))

In [62]:
# LB: 38.955
#!kaggle competitions submit -c playground-series-s5e2 -f result/submission2.csv -m "2"

100%|██████████████████████████████████████| 4.74M/4.74M [00:02<00:00, 1.93MB/s]
Successfully submitted to Backpack Prediction Challenge

# Submission 3

In [63]:
pd.Series(
    make_pipeline(reg_cb[0]['preprocessor'], reg_cb[0]['model']).predict(df_test[reg_cb[1]]), index = df_test.index
).rename(target).sort_index().to_frame().to_csv(os.path.join('result', 'submission3.csv'))

In [64]:
# LB: 38.105
#!kaggle competitions submit -c playground-series-s5e2 -f result/submission3.csv -m "3"

100%|██████████████████████████████████████| 4.74M/4.74M [00:02<00:00, 1.70MB/s]
Successfully submitted to Backpack Prediction Challenge