In [1]:
import os, sys

import pandas as pd
import polars as pl

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import scipy

import sklearn
import lightgbm as lgb
import xgboost as xgb
import catboost as cb

import dproc, sgml, sgnn, sgpp, sgutil, custpp

# Record the interpreter and library versions this notebook was run with.
print(sys.version)
for lib in [pd, pl, mpl, sns, np, scipy, sklearn, lgb, xgb, cb]:
    try:
        print(lib.__name__, lib.__version__)
    except AttributeError:
        # Best effort: a module that exposes no `__version__` is skipped silently.
        # Only AttributeError is caught so interrupts and real errors still surface.
        pass

2025-02-09 00:26:26.623273: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-09 00:26:26.901948: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


3.12.3 (main, May  1 2024, 17:33:23) [GCC 11.4.0]
pandas 2.2.2
polars 1.12.0
matplotlib 3.8.4
seaborn 0.13.2
numpy 1.26.4
scipy 1.13.0
sklearn 1.4.2
lightgbm 4.3.0
xgboost 2.1.2
catboost 1.2.5


In [2]:
from itertools import combinations

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, KFold, ShuffleSplit, train_test_split
from sklearn.preprocessing import TargetEncoder
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression

In [3]:
# Cache object used for expensive intermediate results throughout the notebook.
# NOTE(review): the meaning of the two SGCache arguments is defined in sgutil, which is not shown here.
sc = sgutil.SGCache('img', 'result')

# Splitters, both seeded with 123: one 80/20 hold-out split and a shuffled 5-fold CV.
ss = ShuffleSplit(n_splits = 1, train_size = 0.8, random_state = 123)
kf = KFold(n_splits = 5, shuffle = True, random_state = 123)

# Input CSV locations, all under ./data.
files = {
    key: os.path.join('data', filename)
    for key, filename in [
        ('train', 'train.csv'),
        ('train_extra', 'training_extra.csv'),
        ('test', 'test.csv'),
    ]
}

# Raw categorical columns handled by the category arranger inside the pipeline.
cat_source_cols = ['Brand', 'Material', 'Size', 'Laptop Compartment', 'Waterproof', 'Style', 'Color']


def fit_preprocessing_pipeline():
    """Build the preprocessing pipeline and fit it on the training CSV."""
    steps = [
        sgpp.PolarsProcessor(),
        # Categorical copy of the `Compartments` column.
        sgpp.ExprProcessor({
            'Compartments_c' : pl.col('Compartments').cast(pl.String).cast(pl.Categorical)
        }),
        sgpp.PandasCoverter(index_col = 'id'),
        sgpp.ApplyWrapper(
            sgpp.CatArrangerFreq(1, na_value = 'Unknown'),
            list(cat_source_cols)
        ),
        custpp.WeightCapacityProcessor(),
    ]
    return make_pipeline(*steps).fit(files['train'])


# NOTE(review): `rerun = 1` presumably forces the pipeline to be refit instead of
# loaded from the cache; confirm against sgutil.SGCache.cache_result.
t = sc.cache_result('pipeline_2', fit_preprocessing_pipeline, rerun = 1)

# Training data is the main train file stacked on top of the extra training file.
df_train = pd.concat(
    [t.transform(files[key]) for key in ('train', 'train_extra')], axis = 0
)
df_test = t.transform(files['test'])

target = 'Price'
X_cat = cat_source_cols + ['Compartments_c']
X_num = ['Weight Capacity (kg)']

In [4]:
def get_validation_splitter(validation_fraction, random_state = 123):
    """Return a callable that splits a dataset into train and validation parts.

    Parameters
    ----------
    validation_fraction
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state
        Seed forwarded to ``train_test_split``. Defaults to 123, the seed used
        by the other splitters and encoders in this notebook, so repeated runs
        produce the same validation split. Pass ``None`` for an unseeded split.

    Returns
    -------
    callable
        ``splitter(x)`` returning whatever ``train_test_split`` returns for ``x``.
    """
    def splitter(x):
        return train_test_split(x, test_size = validation_fraction, random_state = random_state)

    return splitter

# Shared settings passed to every sgml.train call in this notebook.
# NOTE(review): how sgml interprets each key is defined in the sgml module, which is not shown here.
config = {
    # Predict with model `m` on columns `X` of `df`, keep df's index, and clip the predictions to [15, 150].
    'predict_func': lambda m, df, X: pd.Series(m.predict(df[X]), index = df.index).clip(15, 150),
    # RMSE between the true target and the predictions; both are sorted by index first so rows line up.
    'score_func': lambda df, prds: root_mean_squared_error(df[target].sort_index(), prds.sort_index()),
    # Factory that turns a validation fraction into a splitting callable.
    'validation_splitter': get_validation_splitter,
    'progress_callback': sgml.ProgressCallBack(), 
    'return_train_scores': True,
    # Name of the target column.
    'y': target,
}

# One sgml adapter per model family, each wrapping the corresponding regressor class.
cb_adapter = sgml.CBAdapter(cb.CatBoostRegressor)
lr_adapter = sgml.SklearnAdapter(LinearRegression)
lgb_adapter = sgml.LGBMAdapter(lgb.LGBMRegressor)
xgb_adapter = sgml.XGBAdapter(xgb.XGBRegressor)

In [5]:
# Baseline: how well does a target encoding of `wc_i2` alone predict the price?
# NOTE(review): `wc_i2` is not created in this notebook; presumably it comes from
# custpp.WeightCapacityProcessor. Confirm.
tgt = TargetEncoder(smooth = 35, random_state = 123)
for train_idx, valid_idx in kf.split(df_train, df_train[target]):
    df_cv_train = df_train.iloc[train_idx]
    df_valid = df_train.iloc[valid_idx]

    # Validation rows whose weight capacity is non-null, strictly inside (5, 30),
    # and also occurs in the training fold.
    wc_valid = df_valid['Weight Capacity (kg)']
    bidx = (
        wc_valid.isin(df_cv_train['Weight Capacity (kg)'].unique())
        & wc_valid.notna()
        & wc_valid.between(5, 30, inclusive = 'neither')
    )
    df_valid, df_valid2 = df_valid.loc[bidx], df_valid.loc[~bidx]

    # Fit the encoder against the target clipped to [1.8% quantile, 150].
    y_fit = df_cv_train[target]
    tgt.fit(df_cv_train[['wc_i2']], y_fit.clip(y_fit.quantile(0.018), 150))

    # Per fold: RMSE on the selected rows, RMSE on the remaining rows, and the
    # target standard deviation of the remaining rows for reference.
    rmse_selected = root_mean_squared_error(df_valid[target], tgt.transform(df_valid[['wc_i2']])[:, 0])
    rmse_rest = root_mean_squared_error(df_valid2[target], tgt.transform(df_valid2[['wc_i2']])[:, 0])
    print(rmse_selected, rmse_rest, df_valid2[target].std())

38.629543809953795 39.29101486311971 39.316704
38.64572975855934 39.20963300258326 39.23759
38.681409998918504 39.19875053182947 39.218143
38.64570263604047 39.18592044966053 39.207096
38.651380676657986 39.23936759986119 39.262196


In [9]:
# Five most frequent target values in the last training fold.
df_cv_train[target].value_counts().head(5)

Price
150.000000    57556
15.000000      7211
39.834351       157
27.485460       154
38.375099       154
Name: count, dtype: int64

In [14]:
# Hold-out experiment: route validation rows to one of two CatBoost models.
#   * rows with a weight capacity that is non-null, strictly inside (5, 30) and
#     present in the training part -> model trained on the target-encoded `tgt`
#   * all other rows                -> model trained on the raw numeric feature
tgt = TargetEncoder(cv = 5, smooth = 35, target_type='continuous', random_state = 123)
for train_idx, valid_idx in ss.split(df_train, df_train[target]):
    df_cv_train, df_valid = df_train.iloc[train_idx], df_train.iloc[valid_idx]
    # Both lambdas run inside `assign`, before `df_cv_train` is rebound, so
    # `df_cv_train[target]` below is still the unclipped target.
    # NOTE(review): the encoder is therefore fit on the unclipped target, while
    # the stored `Price` column is clipped. Confirm this is intended.
    df_cv_train = df_cv_train.assign(
        Price = lambda x: x[target].clip(df_cv_train[target].quantile(0.018), 150),
        tgt = lambda x: tgt.fit_transform(x[['wc_i2']], df_cv_train[target])[:, 0]
    )
    bidx = df_valid['Weight Capacity (kg)'].isin(df_cv_train['Weight Capacity (kg)'].unique()) & df_valid['Weight Capacity (kg)'].notna() &\
            df_valid['Weight Capacity (kg)'].between(5, 30, inclusive = 'neither')
    df_valid1, df_valid2 = df_valid.loc[bidx], df_valid.loc[~bidx]
    df_valid1 = df_valid1.assign(
        tgt = lambda x: tgt.transform(x[['wc_i2']])[:, 0]
    )
    # NOTE(review): `rerun` presumably controls whether the cached result is
    # recomputed; confirm against sgutil.SGCache.cache_result.
    reg_cb = sc.cache_result(
        'cb_ss_2',
        lambda : sgml.train(df_cv_train, {
                'model_params' : {'n_estimators': 500, 'learning_rate': 0.1},
                'X_num': X_num, 'X_cat': X_cat,
                #'validation_fraction': 0.1
            }, config, cb_adapter, task_type = 'GPU'), rerun = 0
    )
    reg_cb2 = sc.cache_result(
        'cb2_ss_2',
        lambda : sgml.train(df_cv_train, {
                'model_params' : {'n_estimators': 500, 'learning_rate': 0.1},
                'X_cat': X_cat, 'X_num': ['tgt'],
                # 'validation_fraction': 0.1
            }, config, cb_adapter, task_type = 'GPU'), rerun = 1
    )

    # Predict each subset once and reuse the arrays for the merged score and
    # for the per-subset scores.
    pred_valid1 = make_pipeline(reg_cb2[0]['preprocessor'], reg_cb2[0]['model']).predict(df_valid1[reg_cb2[1]])
    pred_valid2 = make_pipeline(reg_cb[0]['preprocessor'], reg_cb[0]['model']).predict(df_valid2[reg_cb[1]])

    s_merge = pd.concat([
        pd.Series(pred_valid1, index = df_valid1.index),
        pd.Series(pred_valid2, index = df_valid2.index)
    ], axis = 0)
    # Printed, in order: merged RMSE on the whole hold-out, target-encoded model
    # RMSE on subset 1, encoding-only RMSE on the training part, encoding-only
    # RMSE on subset 1, raw-feature model RMSE on subset 2, target std of
    # subset 2, and the share of hold-out rows that fall in subset 1.
    print(
        root_mean_squared_error(df_valid[target].sort_index(), s_merge.sort_index()),
        root_mean_squared_error(df_valid1[target], pred_valid1),
        root_mean_squared_error(df_cv_train[target], df_cv_train['tgt']),
        root_mean_squared_error(df_valid1[target], df_valid1['tgt']),
        root_mean_squared_error(df_valid2[target], pred_valid2),
        df_valid2[target].std(), bidx.mean()
    )


38.7053771769684 38.59229795035566 38.73538496482012 38.62967410652954 39.22250088744373 39.316704 0.8217593983456508


In [19]:
def clipped_train_frame():
    """Return df_train with Price clipped to [its 1.8% quantile, 150]."""
    return df_train.assign(Price = lambda x: x['Price'].clip(x['Price'].quantile(0.018), 150))


# Final model on the raw numeric feature plus the categorical features.
reg_cb = sc.cache_result(
    'cb_2',
    lambda : sgml.train(
        clipped_train_frame(),
        {
            'model_params' : {'n_estimators': 500, 'learning_rate': 0.1},
            'X_num': X_num,
            'X_cat': X_cat,
            #'validation_fraction': 0.1
        },
        config, cb_adapter, task_type = 'GPU'
    )
)

# Final model on the categorical features plus a target-encoded weight capacity.
# NOTE(review): the 'X_tgt' / 'tgt' keys are interpreted by sgml; presumably they
# configure an internal target encoder. Confirm against sgml.train.
reg_cb2 = sc.cache_result(
    'cb2_2',
    lambda : sgml.train(
        clipped_train_frame(),
        {
            'model_params' : {'n_estimators': 500, 'learning_rate': 0.1},
            'X_cat': X_cat,
            'X_tgt': ['Weight Capacity (kg)'],
            'tgt': {'cv': 5, 'smooth': 35, 'random_state': 123}
            #'validation_fraction': 0.1
        },
        config, cb_adapter, task_type = 'GPU'
    )
)

In [20]:
# Route test rows: non-null weight capacity, strictly inside (5, 30) and present
# in the training data -> reg_cb2; everything else -> reg_cb.
wc_test = df_test['Weight Capacity (kg)']
bidx = (
    wc_test.isin(df_train['Weight Capacity (kg)'].unique())
    & wc_test.notna()
    & wc_test.between(5, 30, inclusive = 'neither')
)
df_test1, df_test2 = df_test.loc[bidx], df_test.loc[~bidx]

pipe_tgt = make_pipeline(reg_cb2[0]['preprocessor'], reg_cb2[0]['model'])
pipe_raw = make_pipeline(reg_cb[0]['preprocessor'], reg_cb[0]['model'])
s_test1 = pd.Series(pipe_tgt.predict(df_test1[reg_cb2[1]]), index = df_test1.index)
s_test2 = pd.Series(pipe_raw.predict(df_test2[reg_cb[1]]), index = df_test2.index)

# Stitch both parts together, sort by index and write the submission file.
submission = pd.concat([s_test1, s_test2], axis = 0).rename(target).sort_index().to_frame()
submission.to_csv(os.path.join('result', 'submission4.csv'))

In [22]:
# LB: 38.91003
#!kaggle competitions submit -c playground-series-s5e2 -f result/submission4.csv -m "4"