In [None]:
# install fastkaggle if not available
try: import fastkaggle
except ModuleNotFoundError:
    !pip install -Uq fastkaggle

from fastkaggle import *

## Getting set up

In [None]:
# Kaggle competition slug; reused further down when submitting and when
# pushing the notebook.
comp = 'playground-series-s3e25'
# `setup_comp` returns the path to the competition data (see the `path`
# output below). NOTE(review): presumably it downloads the data when it is
# missing, and install='' means no extra packages are installed — confirm
# against the fastkaggle docs.
path = setup_comp(comp, install='')

In [None]:
# Display the data directory returned by setup_comp.
path

Path('playground-series-s3e25')

In [None]:
# Location of the training table inside the competition data directory.
trn_path = path.joinpath('train.csv')

In [None]:
# Load the training table into a DataFrame; every later cell derives its
# features and target from `df`.
import pandas as pd
df = pd.read_csv(trn_path)

In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, FunctionTransformer
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import HistGradientBoostingRegressor, HistGradientBoostingClassifier
from sklearn.metrics import make_scorer, mean_absolute_error, median_absolute_error
from scipy.stats import loguniform
from sklearn.impute import SimpleImputer

In [None]:
# Features: every column except the first and the last.
# NOTE(review): the first column is presumably the row id — confirm.
X = df.iloc[:, 1:-1]
# Target: the last column (shown as `Hardness` in the outputs below).
y = df.iloc[:, -1]
# Hold out 20% as a dev set. The seed is fixed so the split, and therefore
# every score printed below, is reproducible across Restart & Run All.
X_tr, X_dev, y_tr, y_dev = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Compare feature transforms in front of a gradient-boosted classifier.
# np.log1p replaces the hand-written log(1 + x) lambda: same transform,
# more accurate near zero, and no anonymous function in the pipeline.
tfs = [FunctionTransformer(np.log1p), StandardScaler(), RobustScaler(), MinMaxScaler()]

for scaler in tfs:
    model = HistGradientBoostingClassifier()
    # Impute missing values first so every scaler sees complete data.
    pipe = make_pipeline(SimpleImputer(), scaler, model)
    # The continuous target is truncated to integers and used as class labels.
    pipe.fit(X_tr, y_tr.astype(int))
    y_pred = pipe.predict(X_dev)
    # Median absolute error on the dev set, one line per transform.
    print(median_absolute_error(y_dev, y_pred))

0.5
0.5
0.5
0.5


In [None]:
# How often each distinct target value occurs, ordered by the value itself.
(
    y.value_counts()
     .reset_index()
     .sort_values(by='Hardness')
)

Unnamed: 0,Hardness,count
36,1.0,2
28,1.3,18
15,1.5,174
16,1.8,158
6,2.0,388
37,2.1,1
43,2.2,1
11,2.3,292
2,2.5,1089
46,2.6,1


In [None]:
# Bin edges used to discretise the target: 0, 0.25, ..., 10 (41 edges, 40
# right-closed bins). `targets` was not defined anywhere in the notebook, so
# this cell failed on a fresh kernel; the edges match the 40 categories
# (0.25 ... 10.00) shown in the recorded outputs.
targets = np.linspace(0, 10, 41)
# Sanity check: count targets that fall outside every bin (NaN after the cut).
pd.cut(y, targets).isna().sum()

0

In [None]:
# Replace each target by the right (upper) edge of the bin it falls into.
(
    pd.cut(y, bins=targets)
      .apply(lambda interval: interval.right)
)

0        6.0
1        6.5
2        2.5
3        6.0
4        6.0
        ... 
10402    4.0
10403    5.0
10404    2.0
10405    6.0
10406    6.5
Name: Hardness, Length: 10407, dtype: category
Categories (40, float64): [0.25 < 0.50 < 0.75 < 1.00 ... 9.25 < 9.50 < 9.75 < 10.00]

In [None]:
# Side by side: the edge in `targets` selected by each training target's bin
# code (the lower edge of its bin) and the original training targets.
(
    targets[pd.cut(y_tr, bins=targets).cat.codes],
    y_tr,
)

(array([2.75, 5.25, 1.75, ..., 4.75, 2.75, 2.25]),
 6787    3.0
 183     5.5
 3908    2.0
 9369    2.5
 7527    2.0
        ... 
 6152    6.0
 8830    2.3
 8048    5.0
 9885    2.8
 4895    2.5
 Name: Hardness, Length: 8325, dtype: float64)

In [None]:
# Fit a classifier whose labels are the integer bin codes of the training target.
HistGradientBoostingClassifier().fit(
    X_tr,
    pd.cut(y_tr, bins=targets).cat.codes,
)

In [None]:
# Same scaler comparison, but with finer class labels: the target is
# multiplied by `label_scale` before truncation to int, and predictions are
# divided by the same factor to return to the Hardness scale.
tfs = [FunctionTransformer(np.log1p), StandardScaler(), RobustScaler(), MinMaxScaler()]
label_scale = 3.0  # one named factor so fit and predict cannot drift apart

for scaler in tfs:
    model = HistGradientBoostingClassifier()
    pipe = make_pipeline(scaler, model)
    pipe.fit(X_tr, (y_tr * label_scale).astype(int))
    y_pred = pipe.predict(X_dev) / label_scale
    # Median absolute error on the dev set, one line per transform.
    print(median_absolute_error(y_dev, y_pred))

# After the loop `pipe` is the last pipeline fitted (MinMaxScaler); the
# submission cells below use it.

1.0
1.1666666666666665
1.3666666666666667
1.0


## Submitting to Kaggle

In [None]:
# Load the sample submission; it provides the ids and the column layout
# that the submission file is built from.
ss = pd.read_csv(path.joinpath('sample_submission.csv'))
ss

Unnamed: 0,id,Hardness
0,10407,4.647
1,10408,4.647
2,10409,4.647
3,10410,4.647
4,10411,4.647
...,...,...
6934,17341,4.647
6935,17342,4.647
6936,17343,4.647
6937,17344,4.647


In [None]:
# Load the test table that predictions are made for.
tst = pd.read_csv(path.joinpath('test.csv'))

In [None]:
# Preview predictions for the test features (first column dropped).
# `pipe` is the last pipeline fitted in the scaler loop above, which was
# trained on (y_tr*3.0).astype(int), so these are raw class labels on the
# 3x scale, not Hardness values.
pipe.predict(tst.iloc[:,1:])

array([2, 2, 6, ..., 5, 5, 3])

In [None]:
# `pipe` was fitted on labels (y_tr*3.0).astype(int), so its predictions are
# class labels on the 3x scale. Divide by 3.0, exactly as the dev-set
# evaluation does, to put Hardness values into the submission.
ss['Hardness'] = pipe.predict(tst.iloc[:, 1:]) / 3.0

In [None]:
# Write the submission without the DataFrame index, then show the first
# lines of the file as a quick check of its format.
ss.to_csv('subm.csv', index=False)
!head subm.csv

id,Hardness
10407,2
10408,2
10409,6
10410,2
10411,6
10412,6
10413,3
10414,6
10415,1


In [None]:
# Submit through the Kaggle API only when `iskaggle` is falsy.
# NOTE(review): `iskaggle` presumably comes from the fastkaggle wildcard
# import and is true when running on Kaggle itself — confirm.
if not iskaggle:
    # Imported here so the kaggle package is only needed on this branch.
    from kaggle import api
    # NOTE(review): the message says "log tf HistGBR", but the `pipe` used for
    # subm.csv is the last one fitted above (MinMaxScaler + classifier) —
    # confirm the description matches what is actually submitted.
    api.competition_submit_cli('subm.csv', 'initial log tf HistGBR', comp)

100%|█████████████████████████████| 54.2k/54.2k [00:00<00:00, 76.0kB/s]


## Conclusion

## Addendum

In [None]:
# Publish this notebook as a Kaggle kernel when not running on Kaggle.
if not iskaggle:
    # The kernel id must be the slug of the title. The previous id
    # ('histgbr-minmax-transform') did not match, which triggered Kaggle's
    # "title does not resolve to the specified id" warning, and the kernel
    # was created under the title slug instead. Using that slug as the id
    # keeps later pushes on the same kernel.
    push_notebook('xy', 'minmax-transform-and-histgbr-model',
                  title='Minmax transform and HistGBR model',
                  file='01-histgbr-minmax-transform.ipynb',
                  competition=comp, private=False, gpu=False)

Your kernel title does not resolve to the specified id. This may result in surprising behavior. We suggest making your title something that resolves to the specified id. See https://en.wikipedia.org/wiki/Clean_URL#Slug for more information on how slugs are determined.
Kernel version 1 successfully pushed.  Please check progress at https://www.kaggle.com/code/xiaochuanyang/minmax-transform-and-histgbr-model
