In [1]:
import os, sys

import pandas as pd
import polars as pl

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import scipy

import sklearn
import lightgbm as lgb
import xgboost as xgb
import catboost as cb

import dproc, sgml, sgnn, sgpp, sgutil, custpp

# Record the interpreter and library versions this notebook was run with.
print(sys.version)
for i in [pd, pl, mpl, sns, np, scipy, sklearn, lgb, xgb, cb]:
    try:
        print(i.__name__, i.__version__)
    except Exception as exc:
        # Stay best-effort, but report the failure instead of hiding it, and do
        # not swallow KeyboardInterrupt/SystemExit as a bare `except:` would.
        print(i.__name__, f'version unavailable ({exc!r})')

2025-02-06 07:06:41.590240: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1738825601.716419   54298 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1738825601.752614   54298 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-06 07:06:42.088142: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


3.12.6 (main, Sep 30 2024, 02:19:13) [GCC 9.4.0]
pandas 2.2.3
polars 1.12.0
matplotlib 3.8.4
seaborn 0.13.2
numpy 1.26.4
scipy 1.12.0
sklearn 1.5.2
xgboost 2.1.2
catboost 1.2.5


In [55]:
from itertools import combinations

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, KFold, ShuffleSplit, train_test_split
from sklearn.preprocessing import TargetEncoder
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression

In [3]:
# Shared notebook state: result cache, CV splitters (fixed seed 123) and
# the locations of the input CSV files.
sc = sgutil.SGCache('img', 'result')
ss = ShuffleSplit(n_splits = 1, random_state = 123)
kf = KFold(n_splits = 5, shuffle = True, random_state = 123)

files = {
    key: os.path.join('data', filename)
    for key, filename in (
        ('train', 'train.csv'),
        ('train_extra', 'training_extra.csv'),
        ('test', 'test.csv'),
    )
}

In [4]:
def _fit_pipeline_2():
    """Assemble the preprocessing pipeline and fit it on the training CSV."""
    # Columns handed to sgpp.CatArrangerFreq through sgpp.ApplyWrapper.
    arranged_cols = ['Brand', 'Material', 'Size', 'Laptop Compartment', 'Waterproof', 'Style', 'Color']
    steps = [
        sgpp.PolarsProcessor(),
        sgpp.ExprProcessor({
            # Categorical copy of the numeric 'Compartments' column.
            'Compartments_c' : pl.col('Compartments').cast(pl.String).cast(pl.Categorical)
        }),
        sgpp.PandasCoverter(index_col = 'id'),
        sgpp.ApplyWrapper(sgpp.CatArrangerFreq(1, na_value = 'Unknown'), arranged_cols),
        custpp.WeightCapacityProcessor(),
    ]
    return make_pipeline(*steps).fit(files['train'])


# NOTE(review): rerun = 1 presumably forces a refit even if a cached result exists - confirm against sgutil.
t = sc.cache_result('pipeline_2', _fit_pipeline_2, rerun = 1)

# Training data is the original train file followed by the extra training file.
df_train = pd.concat(
    [t.transform(files[part]) for part in ('train', 'train_extra')], axis = 0
)
df_test = t.transform(files['test'])

In [5]:
# Name of the regression target column.
target = 'Price'
# Columns treated as categorical features.
X_cat = [
    'Brand',
    'Material',
    'Size',
    'Laptop Compartment',
    'Waterproof',
    'Style',
    'Color',
    'Compartments_c',
    'wc_i',
]
# Columns treated as numeric features.
X_num = [
    'Compartments',
    'Weight Capacity (kg)',
]

# Target Encoding

In [53]:
def get_tgt_enc_cat(X, cv = kf, params = {'random_state': 123}):
    """Out-of-fold target encoding of one categorical feature (or a combination).

    For every split produced by ``cv`` a ``TargetEncoder`` is fitted on the
    training part of ``df_train`` and used to encode the validation part only.

    Parameters
    ----------
    X : list of str
        Column names of ``df_train``. With more than one name the columns are
        first merged into a single feature via ``dproc.combine_cat``.
    cv : splitter with a ``split(X, y)`` method
        Cross-validation scheme; defaults to the notebook-level ``kf``.
    params : dict
        Extra keyword arguments for ``TargetEncoder``. Only read, never mutated.

    Returns
    -------
    pandas.Series
        Encoded values of every validation fold, concatenated in fold order
        (not sorted by index) and named ``'_'.join(X)``.

    Raises
    ------
    ValueError
        If ``X`` is empty. The previous code path for that case referenced an
        undefined ``rmse_list`` and could not produce a result.
    """
    if len(X) == 0:
        raise ValueError('X must contain at least one column name')

    name = '_'.join(X)

    def _features(df):
        # Several columns are merged into one composite feature; a single
        # column is passed through as a one-column frame.
        if len(X) > 1:
            return dproc.combine_cat(df[X]).to_frame()
        return df[X]

    tgt = TargetEncoder(target_type  = 'continuous', **params)
    tgt_list = list()
    for train_idx, valid_idx in cv.split(df_train[X], df_train[target]):
        df_cv_train, df_valid = df_train.iloc[train_idx], df_train.iloc[valid_idx]
        tgt.fit(_features(df_cv_train), df_cv_train[target])
        tgt_list.append(
            pd.Series(tgt.transform(_features(df_valid))[:, 0], index = df_valid.index, name = name)
        )
    return pd.concat(tgt_list)

def get_comb_cat_tgt(n, cv = kf, X_cat = X_cat):
    """Target-encode every ``n``-way combination of the columns in ``X_cat``.

    Each combination is encoded with ``get_tgt_enc_cat`` and the resulting
    Series are placed side by side as the columns of one DataFrame.
    """
    encoded_columns = []
    for combo in combinations(X_cat, n):
        encoded_columns.append(get_tgt_enc_cat(list(combo), cv))
    return pd.concat(encoded_columns, axis = 1)

In [7]:
def _cached_comb_tgt(cache_key, n_way):
    """Fetch (or compute) the n-way target encodings via the cache, sorted by index."""
    encoded = sc.cache_result(cache_key, lambda : get_comb_cat_tgt(n_way), rerun = 0)
    return encoded.sort_index()


# Out-of-fold target encodings for all 3-way and 4-way column combinations.
df_tgt3 = _cached_comb_tgt('tgt_3', 3)
df_tgt4 = _cached_comb_tgt('tgt_4', 4)

In [8]:
def _rmse_by_column(df_encoded):
    """RMSE of each encoded column against the target, sorted ascending."""
    y_true = df_train[target]
    scores = df_encoded.apply(lambda col: root_mean_squared_error(y_true, col))
    return scores.sort_values()


s_rmse_tgt_3 = _rmse_by_column(df_tgt3)
s_rmse_tgt_4 = _rmse_by_column(df_tgt4)

In [9]:
# Ten 3-way combinations with the lowest RMSE.
s_rmse_tgt_3.head(10)

Size_Waterproof_wc_i                  38.915715
Material_Waterproof_wc_i              38.915882
Waterproof_Color_wc_i                 38.916800
Laptop Compartment_Waterproof_wc_i    38.917450
Material_Size_wc_i                    38.917671
Brand_Size_wc_i                       38.918317
Brand_Waterproof_wc_i                 38.918337
Size_Color_wc_i                       38.918495
Material_Color_wc_i                   38.919004
Material_Laptop Compartment_wc_i      38.919357
dtype: float64

In [10]:
# Ten 4-way combinations with the lowest RMSE.
s_rmse_tgt_4.head(10)

Size_Laptop Compartment_Waterproof_wc_i        38.915788
Material_Size_Waterproof_wc_i                  38.916880
Material_Laptop Compartment_Waterproof_wc_i    38.917673
Brand_Size_Waterproof_wc_i                     38.919630
Laptop Compartment_Waterproof_Color_wc_i       38.920032
Laptop Compartment_Waterproof_Style_wc_i       38.920300
Size_Waterproof_Style_wc_i                     38.920473
Brand_Laptop Compartment_Waterproof_wc_i       38.920523
Size_Waterproof_Color_wc_i                     38.920787
Material_Waterproof_Style_wc_i                 38.920846
dtype: float64

In [11]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_val_score

def _lasso_cv_scores():
    """Cross-validated negative RMSE of a default Lasso on the 3-way encodings."""
    return cross_val_score(
        Lasso(), df_tgt3, df_train[target], cv = kf, scoring = 'neg_root_mean_squared_error'
    )


# Mean of the per-fold scores (sign is negative because of the scorer used).
sc.cache_result('tgt_lasso_rmse', _lasso_cv_scores, rerun = 0).mean()

-38.915641678692246

In [12]:
# Fit (or load from cache) a default Lasso on all 3-way encodings.
reg_ls = sc.cache_result('lasso_tgt3', lambda : Lasso().fit(df_tgt3, df_train[target]))

# Keep the columns that received a strictly positive coefficient.
positive_coef_3 = reg_ls.coef_ > 0
X_lasso_3 = df_tgt3.columns[positive_coef_3].tolist()

# RMSE of the plain row-wise mean of the kept columns.
mean_lasso_3 = df_tgt3[X_lasso_3].mean(axis=1)
root_mean_squared_error(df_train[target], mean_lasso_3)

38.90882819875456

In [13]:
# Fit (or load from cache) a default Lasso on all 4-way encodings.
reg_ls = sc.cache_result('lasso_tgt4', lambda : Lasso().fit(df_tgt4, df_train[target]))

# Keep the columns that received a strictly positive coefficient.
positive_coef_4 = reg_ls.coef_ > 0
X_lasso_4 = df_tgt4.columns[positive_coef_4].tolist()

# RMSE of the plain row-wise mean of the kept columns.
mean_lasso_4 = df_tgt4[X_lasso_4].mean(axis=1)
root_mean_squared_error(df_train[target], mean_lasso_4)

38.902266681790344

In [14]:
# RMSE of the row-wise mean over the Lasso-selected 3-way and 4-way columns together.
lasso_selected_34 = pd.concat([df_tgt3[X_lasso_3], df_tgt4[X_lasso_4]], axis=1)
root_mean_squared_error(df_train[target], lasso_selected_34.mean(axis=1))

38.903925210474405

In [16]:
def _fit_lasso_34():
    """Fit a default Lasso on the selected 3-way and 4-way columns side by side."""
    features_34 = pd.concat([df_tgt3[X_lasso_3], df_tgt4[X_lasso_4]], axis=1)
    return Lasso().fit(features_34, df_train[target])


reg_ls = sc.cache_result('lasso_tgt34', _fit_lasso_34)

In [23]:
# Number of selected 3-way columns next to the number of zero coefficients in
# `reg_ls`. `reg_ls` is rebound by several cells, so this shows whichever model
# was assigned last; in the recorded run order that is the 'lasso_tgt34' fit.
len(X_lasso_3), sum(reg_ls.coef_ == 0)

(8, 8)

In [39]:
# Average the k best 3-way encodings (ranked by individual RMSE) for growing k,
# stopping at the first k that makes the RMSE worse than the previous one.
rmse_mean = []
ranked_cols_3 = s_rmse_tgt_3.index
for k in range(1, len(ranked_cols_3) + 1):
    prefix_mean = df_tgt3[ranked_cols_3[:k]].mean(axis=1).sort_index()
    score = root_mean_squared_error(df_train[target], prefix_mean)
    if rmse_mean and score > rmse_mean[-1]:
        break
    rmse_mean.append(score)
rmse_mean

[38.91571485912435, 38.91215758926544, 38.9109039485939]

In [42]:
# Average the k best 4-way encodings (ranked by individual RMSE) for growing k,
# stopping at the first k that makes the RMSE worse than the previous one.
rmse_mean = []
ranked_cols_4 = s_rmse_tgt_4.index
for k in range(1, len(ranked_cols_4) + 1):
    prefix_mean = df_tgt4[ranked_cols_4[:k]].mean(axis=1).sort_index()
    score = root_mean_squared_error(df_train[target], prefix_mean)
    if rmse_mean and score > rmse_mean[-1]:
        break
    rmse_mean.append(score)
rmse_mean

[38.91578764645321,
 38.91048087298257,
 38.909067033883204,
 38.90719419120311,
 38.905932464100836]

In [56]:
# RMSE of the row-wise mean over the 3 best 3-way and the 5 best 4-way encodings.
best_3way = df_tgt3[s_rmse_tgt_3.index[:3]]
best_4way = df_tgt4[s_rmse_tgt_4.index[:5]]
best_34_mean = pd.concat([best_3way, best_4way], axis=1).mean(axis=1)
root_mean_squared_error(df_train[target], best_34_mean)

38.90694229799814

from [Two Baseline Models - \[LB 38.91\]](https://www.kaggle.com/code/cdeotte/two-baseline-models-lb-38-91)

In [65]:
from cuml.preprocessing import TargetEncoder as TargetEncoder_cu
def get_tgt_enc_cat_cu(X, cv = kf, params = {}):
    """Out-of-fold target encoding using cuML's ``TargetEncoder``.

    For every split produced by ``cv`` the encoder is fitted on the training
    part of ``df_train`` and used to encode the validation part only.

    Parameters
    ----------
    X : list of str
        Column names of ``df_train``. With more than one name the columns are
        first merged into a single feature via ``dproc.combine_cat``.
    cv : splitter with a ``split(X, y)`` method
        Cross-validation scheme; defaults to the notebook-level ``kf``.
    params : dict
        Keyword arguments for the cuML ``TargetEncoder``. Only read, never mutated.

    Returns
    -------
    pandas.Series
        Encoded values of every validation fold, concatenated in fold order
        (not sorted by index) and named ``'_'.join(X)``.

    Raises
    ------
    ValueError
        If ``X`` is empty. The previous code path for that case referenced an
        undefined ``rmse_list`` and could not produce a result.
    """
    if len(X) == 0:
        raise ValueError('X must contain at least one column name')

    name = '_'.join(X)

    def _features(df):
        # Several columns are merged into one composite feature; a single
        # column is passed through as a one-column frame.
        if len(X) > 1:
            return dproc.combine_cat(df[X]).to_frame()
        return df[X]

    tgt = TargetEncoder_cu(**params)
    tgt_list = list()
    for train_idx, valid_idx in cv.split(df_train[X], df_train[target]):
        df_cv_train, df_valid = df_train.iloc[train_idx], df_train.iloc[valid_idx]
        tgt.fit(_features(df_cv_train), df_cv_train[target])
        tgt_list.append(
            pd.Series(tgt.transform(_features(df_valid)), index = df_valid.index, name = name)
        )
    return pd.concat(tgt_list)

def get_comb_cat_tgt_cu(n, cv = kf, X_cat = X_cat):
    """Target-encode every ``n``-way combination of ``X_cat`` with the cuML encoder.

    Each combination is encoded with ``get_tgt_enc_cat_cu`` and the resulting
    Series are placed side by side as the columns of one DataFrame.
    """
    encoded_columns = []
    for combo in combinations(X_cat, n):
        encoded_columns.append(get_tgt_enc_cat_cu(list(combo), cv))
    return pd.concat(encoded_columns, axis = 1)

In [49]:
# Encoder settings used for the raw weight-capacity column.
me_wc_params = {'n_folds': 25, 'smooth': 20, 'split_method': 'random', 'stat': 'mean'}

# Out-of-fold mean encoding of 'Weight Capacity (kg)', recomputed through the cache (rerun = 1).
s_me_wc = sc.cache_result(
    'me_wc',
    lambda : get_tgt_enc_cat_cu(['Weight Capacity (kg)'], params=me_wc_params).sort_index(), rerun = 1
)
root_mean_squared_error(df_train[target], s_me_wc)

38.76850901376839