In [1]:
import os, sys

import pandas as pd
import polars as pl

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import scipy

import sklearn
import lightgbm as lgb
import xgboost as xgb
import catboost as cb

import dproc, sgml, sgnn, sgpp, sgutil, custpp

print(sys.version)
# Report the version of every core library so the environment can be reproduced.
for lib in [pd, pl, mpl, sns, np, scipy, sklearn, lgb, xgb, cb]:
    try:
        print(lib.__name__, lib.__version__)
    except Exception as exc:
        # Stay best-effort, but say which library failed instead of silently
        # dropping it from the report (a bare except also hid KeyboardInterrupt).
        print(lib.__name__, f'<version unavailable: {exc!r}>')

2025-02-06 10:17:25.674644: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1738837045.686250   82158 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1738837045.689789   82158 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-06 10:17:25.701931: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


3.12.6 (main, Sep 30 2024, 02:19:13) [GCC 9.4.0]
pandas 2.2.3
polars 1.12.0
matplotlib 3.8.4
seaborn 0.13.2
numpy 1.26.4
scipy 1.12.0
sklearn 1.5.2
xgboost 2.1.2
catboost 1.2.5


In [2]:
from itertools import combinations

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, KFold, ShuffleSplit, train_test_split
from cuml.preprocessing import TargetEncoder
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Lasso

In [3]:
# Result cache used throughout the notebook.
# NOTE(review): 'img' and 'result' look like output locations — confirm in sgutil.
sc = sgutil.SGCache('img', 'result')

# Cross-validation splitters, both seeded with 123.
kf = KFold(n_splits = 5, shuffle = True, random_state = 123)
ss = ShuffleSplit(n_splits = 1, random_state = 123)

# Input CSV files, all located under the 'data' directory.
files = {
    key: os.path.join('data', file_name)
    for key, file_name in [
        ('train', 'train.csv'),
        ('train_extra', 'training_extra.csv'),
        ('test', 'test.csv'),
    ]
}

In [4]:
def _fit_pipeline_2():
    """Assemble the preprocessing pipeline and fit it on the training CSV."""
    steps = [
        sgpp.PolarsProcessor(),
        # Adds a categorical copy of the 'Compartments' column.
        sgpp.ExprProcessor({
            'Compartments_c' : pl.col('Compartments').cast(pl.String).cast(pl.Categorical)
        }),
        sgpp.PandasCoverter(index_col = 'id'),
        sgpp.ApplyWrapper(
            sgpp.CatArrangerFreq(1, na_value = 'Unknown'),
            ['Brand', 'Material', 'Size', 'Laptop Compartment', 'Waterproof', 'Style', 'Color']
        ),
        custpp.WeightCapacityProcessor(),
    ]
    return make_pipeline(*steps).fit(files['train'])

# NOTE(review): rerun = 1 presumably forces a refit instead of a cache hit — confirm in sgutil.
t = sc.cache_result('pipeline_2', _fit_pipeline_2, rerun = 1)

# Training frame = original training file followed by the extra training file.
df_train = pd.concat(
    [t.transform(files[name]) for name in ('train', 'train_extra')], axis = 0
)
df_test = t.transform(files['test'])

In [5]:
# Column to predict.
target = 'Price'

# Columns treated as categorical features.
X_cat = [
    'Brand', 'Material', 'Size', 'Laptop Compartment', 'Waterproof', 'Style', 'Color',
    'Compartments_c', 'wc_i',
]

# Columns treated as numeric features.
X_num = ['Compartments', 'Weight Capacity (kg)']

# Target Encoding Smooth: 20

In [6]:
def get_tgt_enc_cat(X, cv = kf, params = {}):
    """Return out-of-fold target encodings of the columns ``X`` of ``df_train``.

    For every split produced by ``cv`` a ``TargetEncoder`` is fitted on the
    training part and applied to the validation part. When ``X`` names more
    than one column, the columns are first merged into a single combined
    column with ``dproc.combine_cat``.

    Args:
        X: list of ``df_train`` column names to encode jointly. An empty list
            yields the training-fold target mean as a constant baseline
            (the original code raised NameError on ``rmse_list`` here).
        cv: splitter exposing ``split(X, y)``; defaults to ``kf``.
        params: keyword arguments forwarded to ``TargetEncoder``; only read,
            never mutated.

    Returns:
        pd.Series named ``'_'.join(X)`` with one value per validation row,
        labelled with the ``df_train`` index but ordered fold by fold.
        Realign it (e.g. ``.sort_index()``) before comparing it positionally
        with ``df_train``.
    """
    name = '_'.join(X)

    def as_single_column(df):
        # The encoder always receives one column: the raw column itself, or
        # the combination of all requested columns.
        if len(X) > 1:
            return dproc.combine_cat(df[X]).rename(name).to_frame()
        return df[X]

    tgt = TargetEncoder(**params) # cuml TargetEncoding faster than sklearn TargetEncoding
    tgt_list = list()
    for train_idx, valid_idx in cv.split(df_train[X], df_train[target]):
        df_cv_train, df_valid = df_train.iloc[train_idx], df_train.iloc[valid_idx]
        if len(X) == 0:
            # Nothing to encode: the best constant guess is the fold's target mean.
            tgt_list.append(
                pd.Series(df_cv_train[target].mean(), index = df_valid.index, name = name)
            )
            continue
        tgt.fit(as_single_column(df_cv_train), df_cv_train[target])
        tgt_list.append(
            pd.Series(tgt.transform(as_single_column(df_valid)), index = df_valid.index, name = name)
        )
    return pd.concat(tgt_list)

def get_comb_cat_tgt(n, cv = kf, X_cat = X_cat, params = {}):
    """Target-encode every ``n``-column combination of ``X_cat``.

    Each combination is encoded by ``get_tgt_enc_cat`` with the given ``cv``
    and ``params``; the resulting series are joined column-wise.

    Returns:
        pd.DataFrame with one column per combination.
    """
    encoded_columns = []
    for column_combo in combinations(X_cat, n):
        encoded_columns.append(
            get_tgt_enc_cat(list(column_combo), cv, params = params)
        )
    return pd.concat(encoded_columns, axis = 1)

In [7]:
def tgt4_lasso_mean():
    """Score the mean of the Lasso-selected 4-column target encodings.

    Builds out-of-fold target encodings for every 4-column combination of the
    categorical features, fits a Lasso on them, keeps the encodings whose
    coefficient is positive and averages those row-wise.

    Returns:
        float: RMSE between ``df_train[target]`` and the averaged encodings.
    """
    y = df_train[target]
    # The encodings come back ordered fold by fold, whereas sklearn estimators
    # and metrics pair rows by position: realign them to df_train first.
    # reindex raises on duplicate index labels rather than misaligning silently.
    df_tgt4 = get_comb_cat_tgt(
        4, params = {'smooth': 20, 'split_method': 'random', 'stat': 'mean'}
    ).reindex(y.index)
    # The fitted model gets its own cache key. The cell that calls this
    # function caches the returned RMSE under 'lasso_tgt4'; assuming SGCache
    # stores results by key, reusing that key here would let the model and the
    # score overwrite each other.
    reg_ls = sc.cache_result(
        'lasso_tgt4_model',
        lambda : Lasso().fit(df_tgt4, y)
    )
    X_lasso_4 = df_tgt4.columns[reg_ls.coef_ > 0].tolist()
    return root_mean_squared_error(y, df_tgt4[X_lasso_4].mean(axis=1))
# Run tgt4_lasso_mean through the notebook cache under the key 'lasso_tgt4'.
sc.cache_result('lasso_tgt4', tgt4_lasso_mean)

In [8]:
# Out-of-fold target encoding of all categorical columns combined into one key.
cat_all_tgt = sc.cache_result(
    'cat_all_tgt',
    lambda : get_tgt_enc_cat(X_cat, params = {'smooth': 20, 'split_method': 'random', 'stat': 'mean'})
)
# The encoding is ordered fold by fold; sort it by index before scoring.
root_mean_squared_error(df_train[target], cat_all_tgt.sort_index())

39.00885141272867

In [9]:
def _cat_m1_rmse():
    """RMSE of each target encoding built from all but one categorical column."""
    encodings = get_comb_cat_tgt(
        len(X_cat) - 1, params = {'smooth': 20, 'split_method': 'random', 'stat': 'mean'}
    ).sort_index()
    return encodings.apply(
        lambda col: root_mean_squared_error(df_train[target], col)
    )

# Best (lowest RMSE) combinations first.
sc.cache_result('cat_m1_tgt', _cat_m1_rmse).sort_values()

Brand_Material_Size_Laptop Compartment_Style_Color_Compartments_c_wc_i          39.055075
Brand_Material_Size_Waterproof_Style_Color_Compartments_c_wc_i                  39.057344
Brand_Material_Size_Laptop Compartment_Waterproof_Color_Compartments_c_wc_i     39.084873
Brand_Material_Laptop Compartment_Waterproof_Style_Color_Compartments_c_wc_i    39.085773
Brand_Material_Size_Laptop Compartment_Waterproof_Style_Color_Compartments_c    39.098990
Brand_Size_Laptop Compartment_Waterproof_Style_Color_Compartments_c_wc_i        39.109608
Material_Size_Laptop Compartment_Waterproof_Style_Color_Compartments_c_wc_i     39.115133
Brand_Material_Size_Laptop Compartment_Waterproof_Style_Color_wc_i              39.127523
Brand_Material_Size_Laptop Compartment_Waterproof_Style_Compartments_c_wc_i     39.131239
dtype: float64

In [10]:
def _cat_m5_rmse():
    """RMSE of each target encoding built from a 5-column combination."""
    encodings = get_comb_cat_tgt(
        5, params = {'smooth': 20, 'split_method': 'random', 'stat': 'mean'}
    ).sort_index()
    return encodings.apply(
        lambda col: root_mean_squared_error(df_train[target], col)
    )

# Best (lowest RMSE) combinations first.
sc.cache_result('cat_m5_tgt', _cat_m5_rmse).sort_values()

Material_Size_Laptop Compartment_Waterproof_wc_i     38.924221
Brand_Material_Size_Laptop Compartment_Waterproof    38.924672
Material_Size_Laptop Compartment_Waterproof_Color    38.925496
Size_Laptop Compartment_Waterproof_Style_wc_i        38.925695
Brand_Size_Laptop Compartment_Waterproof_Color       38.926433
                                                       ...    
Material_Size_Color_Compartments_c_wc_i              39.014720
Material_Style_Color_Compartments_c_wc_i             39.018171
Brand_Size_Color_Compartments_c_wc_i                 39.030616
Brand_Style_Color_Compartments_c_wc_i                39.033693
Brand_Material_Color_Compartments_c_wc_i             39.054523
Length: 126, dtype: float64

In [11]:
# First count the rows per distinct weight value, then count how many distinct
# values share each of those row counts.
rows_per_weight_value = df_train['Weight Capacity (kg)'].value_counts()
rows_per_weight_value.value_counts()

count
1        581744
2        146612
3         57418
4         29975
5         17687
          ...  
454           1
453           1
447           1
440           1
58087         1
Name: count, Length: 535, dtype: int64

The "smooth = 20" setting is assumed to matter only for Weight Capacity (kg), which has high cardinality when treated as a categorical variable.

Because Weight Capacity (kg) contains many low-frequency values, smoothing appears to improve performance by preventing overfitting in target encoding.

By consolidating the low-frequency categories of Weight Capacity (kg), we attempt to make it more effective as a categorical variable.

In [12]:
# Leave-one-out neighbourhood mean: order the rows by weight capacity, then
# predict each price from the other prices inside a centred rolling window.
WINDOW_ROWS = 300
price_by_weight = df_train.sort_values('Weight Capacity (kg)')[target]
# One rolling object serves both aggregates; min_periods = 1 keeps the
# truncated windows at both ends of the ordering.
price_window = price_by_weight.rolling(window = WINDOW_ROWS, min_periods = 1, center = True)
# Remove each row's own price from the window sum and count before averaging.
loo_mean = (price_window.sum() - price_by_weight) / (price_window.count() - 1)
# Scored against `target` throughout (the original mixed it with a literal 'Price').
root_mean_squared_error(price_by_weight, loo_mean)

38.85714510144255

In [13]:
X = ['Weight Capacity (kg)']
cv = kf
q = np.linspace(0, 1, 301)  # 301 quantile levels -> at most 300 bins
tgt = TargetEncoder()
tgt_list = list()

def _quantile_codes(values, edges):
    """Bin ``values`` on ``edges`` and return the bin codes as a one-column frame named 'qt'."""
    return pd.cut(values, edges, duplicates = 'drop').cat.codes.rename('qt').to_frame()

for train_idx, valid_idx in cv.split(df_train[X], df_train[target]):
    df_cv_train = df_train.iloc[train_idx]
    df_valid = df_train.iloc[valid_idx]
    # Bin edges are taken from the training fold only; the outermost edges are
    # widened to +/- infinity so values beyond the training range still get a bin.
    qt = df_cv_train['Weight Capacity (kg)'].quantile(q)
    qt.iloc[[0, -1]] = [-np.inf, np.inf]
    tgt.fit(_quantile_codes(df_cv_train[X[0]], qt), df_cv_train[target])
    valid_encoding = tgt.transform(_quantile_codes(df_valid[X[0]], qt))
    tgt_list.append(
        pd.Series(valid_encoding, index = df_valid.index, name = '_'.join(X))
    )

prd = pd.concat(tgt_list)
# Predictions are ordered fold by fold; sort them by index before scoring.
root_mean_squared_error(df_train[target], prd.sort_index())

38.9131642322593

In [17]:
X = ['Weight Capacity (kg)']
cv = kf
tgt = TargetEncoder()
tgt_list = list()

def _as_category(frame):
    """Cast every column to string and then to a pandas categorical."""
    return frame.astype('str').astype('category')

for train_idx, valid_idx in cv.split(df_train[X], df_train[target]):
    df_cv_train = df_train.iloc[train_idx]
    df_valid = df_train.iloc[valid_idx]
    # A fresh arranger per fold, fitted on the training part only.
    caf = sgpp.CatArrangerFreq(min_frequency = 30, unknown_value = "Unknown", na_value =  "Unknown")
    train_categories = caf.fit_transform(_as_category(df_cv_train[X]))
    tgt.fit(train_categories, df_cv_train[target])
    valid_categories = caf.transform(_as_category(df_valid[X]))
    tgt_list.append(
        pd.Series(tgt.transform(valid_categories), index = df_valid.index, name = '_'.join(X))
    )

prd = pd.concat(tgt_list)
# Predictions are ordered fold by fold; sort them by index before scoring.
root_mean_squared_error(df_train[target], prd.sort_index())

38.83270096867579

In [117]:
# Weight values that occur in at least 30 rows of the training data.
weight_value_counts = df_train['Weight Capacity (kg)'].value_counts()
weight_value_counts[weight_value_counts >= 30]

Weight Capacity (kg)
5.000000     58087
30.000000     2588
11.898250     2120
14.908437     1627
22.898382     1479
             ...  
19.073242       30
19.601625       30
8.032797        30
7.042813        30
28.002974       30
Name: count, Length: 24765, dtype: int64