In [1]:
import os, sys

import pandas as pd
import polars as pl

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import scipy

import sklearn
import lightgbm as lgb
import xgboost as xgb
import catboost as cb

import dproc, sgml, sgnn, sgpp, sgutil, custpp

print(sys.version)
for i in [pd, pl, mpl, sns, np, scipy, sklearn, lgb, xgb, cb]:
    try:
        print(i.__name__, i.__version__)
    except:
        pass

2025-02-07 02:20:16.893382: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1738894817.019331   13180 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1738894817.069786   13180 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-07 02:20:17.410784: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


3.12.6 (main, Sep 30 2024, 02:19:13) [GCC 9.4.0]
pandas 2.2.3
polars 1.12.0
matplotlib 3.8.4
seaborn 0.13.2
numpy 1.26.4
scipy 1.12.0
sklearn 1.5.2
xgboost 2.1.2
catboost 1.2.5


In [2]:
from itertools import combinations

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, KFold, ShuffleSplit, train_test_split
from sklearn.preprocessing import TargetEncoder
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression

In [4]:
sc = sgutil.SGCache('img', 'result')
ss = ShuffleSplit(n_splits = 1, train_size = 0.8, random_state = 123)
kf = KFold(5, random_state = 123, shuffle=True)

files = {
    'train': os.path.join('data', 'train.csv'),
    'train_extra': os.path.join('data', 'training_extra.csv'),
    'test': os.path.join('data', 'test.csv'),
}

t = sc.cache_result(
    'pipeline_2',
    lambda : make_pipeline(
        sgpp.PolarsProcessor(), 
        sgpp.ExprProcessor({
            'Compartments_c' : pl.col('Compartments').cast(pl.String).cast(pl.Categorical)
        }),
        sgpp.PandasCoverter(index_col = 'id'),
        sgpp.ApplyWrapper(
            sgpp.CatArrangerFreq(1, na_value = 'Unknown'),
            ['Brand', 'Material', 'Size', 'Laptop Compartment', 'Waterproof', 'Style', 'Color']
        ), 
        custpp.WeightCapacityProcessor()
    ).fit(files['train']),
    rerun = 1
)
df_train = pd.concat(
    [t.transform(files['train']), t.transform(files['train_extra'])], axis = 0
)
df_test = t.transform(files['test'])

target = 'Price'
X_cat = ['Brand', 'Material', 'Size', 'Laptop Compartment', 'Waterproof', 'Style', 'Color', 'Compartments_c']
X_num = ['Weight Capacity (kg)']

In [34]:
def get_validation_splitter(validation_fraction):
    return lambda x: train_test_split(x, test_size = validation_fraction)

config = {
    'predict_func': lambda m, df, X: pd.Series(m.predict(df[X]), index = df.index).clip(15, 150),
    'score_func': lambda df, prds: root_mean_squared_error(df[target].sort_index(), prds.sort_index()),
    'validation_splitter': get_validation_splitter,
    'progress_callback': sgml.ProgressCallBack(), 
    'return_train_scores': True,
    'y': target,
}

cb_adapter = sgml.CBAdapter(cb.CatBoostRegressor)
lr_adapter = sgml.SklearnAdapter(LinearRegression)
lgb_adapter = sgml.LGBMAdapter(lgb.LGBMRegressor)
xgb_adapter = sgml.XGBAdapter(xgb.XGBRegressor)

In [29]:
tgt = TargetEncoder(smooth = 35, random_state = 123)
for train_idx, valid_idx in kf.split(df_train, df_train[target]):
    df_cv_train, df_valid = df_train.iloc[train_idx], df_train.iloc[valid_idx]
    bidx = df_valid['Weight Capacity (kg)'].isin(df_cv_train['Weight Capacity (kg)'].unique()) & df_valid['Weight Capacity (kg)'].notna() &\
            df_valid['Weight Capacity (kg)'].between(5, 30, inclusive = 'neither')
    df_valid, df_valid2 = df_valid.loc[bidx], df_valid.loc[~bidx]
    tgt.fit(df_cv_train[['wc_i2']], df_cv_train[target].clip(df_cv_train[target].quantile(0.018), 150))
    print(
        root_mean_squared_error(df_valid[target], tgt.transform(df_valid[['wc_i2']])[:, 0]),
        root_mean_squared_error(df_valid2[target], tgt.transform(df_valid2[['wc_i2']])[:, 0]),
        df_valid2[target].std()
    )

38.629543809953795 39.29101486311971 39.316704
38.64572975855934 39.20963300258326 39.23759
38.681409998918504 39.19875053182947 39.218143
38.64570263604047 39.18592044966053 39.207096
38.651380676657986 39.23936759986119 39.262196


In [38]:
df_cv_train[target].clip(df_cv_train[target].quantile(0.018), 150).value_counts()

Price
150.000000    57556
17.466961     57531
39.834351       157
38.375099       154
27.485460       154
              ...  
24.717649        33
81.629303        32
94.077904        32
117.152290       32
141.197388       31
Name: count, Length: 47595, dtype: int64

In [67]:
tgt = TargetEncoder(smooth = 35, random_state = 123, target_type='continuous')
for train_idx, valid_idx in ss.split(df_train, df_train[target]):
    df_cv_train, df_valid = df_train.iloc[train_idx], df_train.iloc[valid_idx]
    tgt.fit(df_cv_train[['Weight Capacity (kg)']], df_cv_train[target].clip(df_cv_train[target].quantile(0.018), 150))
    df_cv_train = df_cv_train.assign(
        tgt = lambda x: tgt.transform(x[['wc_i2']])[:, 0]
    )
    df_valid = df_valid
    bidx = df_valid['Weight Capacity (kg)'].isin(df_cv_train['Weight Capacity (kg)'].unique()) & df_valid['Weight Capacity (kg)'].notna() &\
            df_valid['Weight Capacity (kg)'].between(5, 30, inclusive = 'neither')
    df_valid1, df_valid2 = df_valid.loc[bidx], df_valid.loc[~bidx]
    df_valid1 = df_valid1.assign(
        tgt = lambda x: tgt.transform(x[['wc_i2']])[:, 0]
    )
    reg_cb = sc.cache_result(
        'cb_ss_2',
        lambda : sgml.train(df_cv_train, {
                'model_params' : {'n_estimators': 500, 'learning_rate': 0.1},
                'X_num': X_num, 'X_cat': X_cat,
                #'validation_fraction': 0.1
            }, config, cb_adapter, task_type = 'GPU'), rerun = 1
    )
    reg_cb2 = sc.cache_result(
        'cb2_ss_2',
        lambda : sgml.train(df_cv_train, {
                'model_params' : {'n_estimators': 500, 'learning_rate': 0.1},
                'X_cat': X_cat, 'X_num': ['tgt'],
                # 'validation_fraction': 0.1
            }, config, cb_adapter, task_type = 'GPU'), rerun = 1
    )

    s_merge = pd.concat([
        pd.Series(
            make_pipeline(reg_cb2[0]['preprocessor'], reg_cb2[0]['model']).predict(df_valid1[reg_cb2[1]]), index = df_valid1.index
        ),
        pd.Series(
            make_pipeline(reg_cb[0]['preprocessor'], reg_cb[0]['model']).predict(df_valid2[reg_cb[1]]), index = df_valid2.index
        )
    ], axis = 0)
    print(
        root_mean_squared_error(df_valid[target].sort_index(), s_merge.sort_index()),
        root_mean_squared_error(
            df_valid1[target], make_pipeline(reg_cb2[0]['preprocessor'], reg_cb2[0]['model']).predict(df_valid1[reg_cb2[1]])
        ),
        root_mean_squared_error(df_cv_train[target], df_cv_train['tgt']),
        root_mean_squared_error(df_valid1[target], df_valid1['tgt']),
        root_mean_squared_error(
            df_valid2[target], make_pipeline(reg_cb[0]['preprocessor'], reg_cb[0]['model']).predict(df_valid2[reg_cb[1]])
        ), df_valid2[target].std(), bidx.mean()
    )


SyntaxError: invalid syntax. Perhaps you forgot a comma? (1480560068.py, line 27)

In [70]:
result = sgml.train(df_cv_train, {
    'model_params' : {'n_estimators': 500, 'learning_rate': 0.1},
    'X_cat': X_cat, 'X_num': ['tgt'],
    'validation_fraction': 0.1
}, config, cb_adapter)# , task_type = 'GPU')

Round:   0%|          | 0/500 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [69]:
result

({'variables': array(['cat__Brand', 'cat__Material', 'cat__Size',
         'cat__Laptop Compartment', 'cat__Waterproof', 'cat__Style',
         'cat__Color', 'cat__Compartments_c', 'pt__tgt'], dtype=object),
  'valid_shape': (319546, 9),
  'train_shape': (2875908, 9),
  'target': 'Price',
  'model': <catboost.core.CatBoostRegressor at 0x7f9f0b7e9730>,
  'preprocessor': ColumnTransformer(transformers=[('cat', 'passthrough',
                                   ['Brand', 'Material', 'Size',
                                    'Laptop Compartment', 'Waterproof', 'Style',
                                    'Color', 'Compartments_c']),
                                  ('pt', 'passthrough', ['tgt'])])},
 ['Color',
  'Size',
  'Brand',
  'Style',
  'tgt',
  'Waterproof',
  'Laptop Compartment',
  'Compartments_c',
  'Material'])

In [65]:
root_mean_squared_error(
    df_cv_train[target], make_pipeline(reg_cb2[0]['preprocessor'], reg_cb2[0]['model']).predict(df_cv_train[reg_cb2[1]])
)

35.834233403576626

In [62]:
df_valid1.assign(
    tgt = lambda x: tgt.transform(x[['wc_i2']])[:, 0]
)

Unnamed: 0_level_0,Compartments,Weight Capacity (kg),Price,Compartments_c,Brand,Material,Size,Laptop Compartment,Waterproof,Style,Color,wc_i,wc_i2,tgt
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
3270974,1.0,28.777822,50.810810,1.0,Under Armour,Polyester,Medium,Yes,Yes,Tote,Green,29,28.777822,80.817437
1454687,8.0,22.868895,99.373962,8.0,Under Armour,Polyester,Small,Yes,No,Tote,Unknown,23,22.868895,78.494267
2243079,2.0,27.759537,64.908417,2.0,Adidas,Polyester,Small,No,Yes,Messenger,Unknown,28,27.759537,73.829811
3993541,10.0,27.205019,122.736153,10.0,Adidas,Polyester,Small,No,Yes,Tote,Blue,27,27.205019,78.928558
1796832,4.0,20.890936,69.983040,4.0,Jansport,Canvas,Medium,No,No,Unknown,Black,21,20.890936,81.817118
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1965759,8.0,24.566753,44.687130,8.0,Jansport,Nylon,Small,No,No,Tote,Pink,25,24.566753,81.660871
1649933,2.0,16.627954,27.586510,2.0,Nike,Leather,Medium,Yes,No,Tote,Pink,17,16.627954,83.571869
748619,4.0,20.869415,111.622261,4.0,Puma,Polyester,Small,Yes,No,Backpack,Gray,21,20.869415,79.942915
1599911,4.0,28.922705,105.210213,4.0,Adidas,Leather,Medium,No,Yes,Tote,Gray,29,28.922705,79.913353


In [60]:
tgt.encodings_[0].shape

(793111,)