Based on Faron's script, which was adapted in the following notebook: https://www.kaggle.com/tunguz/another-xgb-allstate-starter/

In [1]:
%load_ext watermark

In [2]:
%watermark

Last updated: 2023-05-24T10:51:21.587398-04:00

Python implementation: CPython
Python version       : 3.9.6
IPython version      : 8.10.0

Compiler    : Clang 14.0.3 (clang-1403.0.22.14.1)
OS          : Darwin
Release     : 22.4.0
Machine     : arm64
Processor   : arm
CPU cores   : 20
Architecture: 64bit



In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
import gc
import xgboost as xgb

In [4]:
%watermark --iversions

numpy  : 1.23.5
pandas : 2.0.0
xgboost: 1.7.5



In [5]:
# Notebook-wide configuration: column names, random seed and file locations.
ID = 'id'          # identifier column name
TARGET = 'loss'    # target column name
SEED = 0           # random seed constant
DATA_DIR = "../input"

# All three archives live in DATA_DIR and share the ".csv.zip" suffix,
# so build their paths from one template.
TRAIN_FILE, TEST_FILE, SUBMISSION_FILE = (
    f"{DATA_DIR}/{stem}.csv.zip"
    for stem in ("train", "test", "sample_submission")
)


In [6]:
# Read the raw train and test tables.
train = pd.read_csv(TRAIN_FILE)
test = pd.read_csv(TEST_FILE)

# Extract the target as a NumPy array. Series.to_numpy() is used instead of
# Series.ravel(), which is deprecated in newer pandas releases.
y_train = train[TARGET].to_numpy()

# Rebind to new frames rather than dropping in place; drop() returns a copy
# either way, and this form is the one recommended by pandas.
train = train.drop(columns=[ID, TARGET])
test = test.drop(columns=[ID])

# The two frames are concatenated row-wise in a later cell; mismatched columns
# would silently introduce NaN-filled columns there, so fail early instead.
assert list(train.columns) == list(test.columns), "train/test feature columns differ"

print(f"{train.shape},{test.shape}")

# Number of training rows, used later to split the stacked frame back apart.
ntrain = train.shape[0]

(188318, 130),(125546, 130)


In [7]:
# Stack train and test so categorical codes are assigned consistently across both.
train_test = pd.concat((train, test), ignore_index=True)

features = train.columns

# Columns whose name contains 'cat' are integer-encoded; sort=True makes the
# codes follow the sorted order of the category labels.
cats = [feat for feat in features if 'cat' in feat]
for feat in cats:
    train_test[feat] = pd.factorize(train_test[feat], sort=True)[0]

print(train_test.head())

# Split the stacked frame back into train/test matrices at row `ntrain`.
x_train = train_test.iloc[:ntrain, :].to_numpy()
x_test = train_test.iloc[ntrain:, :].to_numpy()

# Report the shapes of the matrices built above (the original re-printed the
# shapes of `train`/`test`, which this cell never modifies).
print(f"{x_train.shape},{x_test.shape}")


   cat1  cat2  cat3  cat4  cat5  cat6  cat7  cat8  cat9  cat10  ...     cont5   
0     0     1     0     1     0     0     0     0     1      0  ...  0.310061  \
1     0     1     0     0     0     0     0     0     1      1  ...  0.885834   
2     0     1     0     0     1     0     0     0     1      1  ...  0.397069   
3     1     1     0     1     0     0     0     0     1      0  ...  0.422268   
4     0     1     0     1     0     0     0     0     1      1  ...  0.704268   

      cont6     cont7    cont8    cont9   cont10    cont11    cont12   
0  0.718367  0.335060  0.30260  0.67135  0.83510  0.569745  0.594646  \
1  0.438917  0.436585  0.60087  0.35127  0.43919  0.338312  0.366307   
2  0.289648  0.315545  0.27320  0.26076  0.32446  0.381398  0.373424   
3  0.440945  0.391128  0.31796  0.32128  0.44467  0.327915  0.321570   
4  0.178193  0.247408  0.24564  0.22089  0.21230  0.204687  0.202213   

     cont13    cont14  
0  0.822493  0.714843  
1  0.611431  0.304496  
2  0.195

In [8]:
%%time
# Fit one gradient-boosting model on the full training set.
# random_state is passed explicitly: per the scikit-learn docs it controls the
# binning subsample and the train/validation split used for early stopping,
# so without it repeated runs can give different models.
gbdt = HistGradientBoostingRegressor(random_state=SEED)
gbdt.fit(x_train, y_train)

CPU times: user 31.8 s, sys: 1min 31s, total: 2min 3s
Wall time: 8.14 s


In [9]:
# Load the sample submission as a template and fill in the model predictions.
submission = pd.read_csv(SUBMISSION_FILE)
# Address the prediction column by name rather than by position so the write
# does not depend on the column order of the template file.
submission[TARGET] = gbdt.predict(x_test)
# index=False is the documented way to omit the row index from the CSV.
submission.to_csv('../submissions/hgbr_starter_0_sub.csv.zip', index=False, compression='zip')
submission.head()

Unnamed: 0,id,loss
0,4,1872.537859
1,6,2211.400116
2,9,10059.858767
3,12,6102.662251
4,15,1018.931559


Private score: 1194.93, Public Score: 1184.62

In [10]:
# Accumulators for cross-validation:
#   test_preds - running average of the per-fold test predictions
#   train_oof  - one out-of-fold prediction slot per training row
test_preds = 0
train_oof = np.zeros(len(x_train))
train_oof.shape

(188318,)

In [11]:
%%time
n_splits = 5
kf = KFold(n_splits=n_splits, random_state=137, shuffle=True)

# Reset the accumulators inside this cell: test_preds is built up with `+=`,
# so re-running the cell without a reset would add a second round of fold
# predictions on top of the first.
train_oof = np.zeros(x_train.shape[0])
test_preds = np.zeros(x_test.shape[0])

for jj, (train_index, val_index) in enumerate(kf.split(x_train)):
    print("Fitting fold", jj+1)
    train_features = x_train[train_index]
    train_target = y_train[train_index]

    val_features = x_train[val_index]
    val_target = y_train[val_index]

    # Seed the model so its internal early-stopping validation split (and
    # therefore the fold scores) is reproducible across runs.
    model = HistGradientBoostingRegressor(random_state=SEED)
    model.fit(train_features, train_target)

    # Out-of-fold predictions: each training row is predicted by the one
    # model that did not see it during fitting.
    val_pred = model.predict(val_features)
    train_oof[val_index] = val_pred
    print("Fold mean absolute error:", mean_absolute_error(val_target, val_pred))

    # Each fold contributes 1/n_splits, so the final value is the fold average.
    test_preds += model.predict(x_test)/n_splits

    # Fancy indexing above created copies; release them before the next fold.
    del train_features, train_target, val_features, val_target
    gc.collect()

Fitting fold 1
Fold mean absolute error: 1202.673847790443
Fitting fold 2
Fold mean absolute error: 1198.2230176878957
Fitting fold 3
Fold mean absolute error: 1201.7906920162657
Fitting fold 4
Fold mean absolute error: 1207.4997106095213
Fitting fold 5
Fold mean absolute error: 1201.4401438164473
CPU times: user 2min 45s, sys: 8min 23s, total: 11min 9s
Wall time: 43.4 s


In [12]:
# Score the pooled out-of-fold predictions against the full training target.
print(
    "Total mean absolute error:",
    mean_absolute_error(y_true=y_train, y_pred=train_oof),
)

Total mean absolute error: 1202.325459609395


In [13]:
# Write the fold-averaged test predictions into the submission frame.
# The column is addressed by name rather than by position so the write does
# not depend on the column order of the template file.
submission[TARGET] = test_preds
# index=False is the documented way to omit the row index from the CSV.
submission.to_csv(f'../submissions/hgbr_starter_{n_splits}fold_0_sub.csv.zip', index=False, compression='zip')
submission.head()

Unnamed: 0,id,loss
0,4,1864.759539
1,6,2140.348246
2,9,9586.81799
3,12,5984.247765
4,15,1108.202886


Private score: 1191.10, Public Score: 1182.04

In [14]:
%%time

# Same 5-fold procedure as before, but with validation_fraction=None so the
# model does not hold out part of each training fold for early stopping.
# Accumulators are reset here so the cell is safe to re-run on its own.
train_oof = np.zeros(x_train.shape[0])
test_preds = np.zeros(x_test.shape[0])

n_splits = 5
kf = KFold(n_splits=n_splits, random_state=137, shuffle=True)

for jj, (train_index, val_index) in enumerate(kf.split(x_train)):
    print("Fitting fold", jj+1)
    train_features = x_train[train_index]
    train_target = y_train[train_index]

    val_features = x_train[val_index]
    val_target = y_train[val_index]

    # random_state is set explicitly so repeated runs give the same model.
    model = HistGradientBoostingRegressor(validation_fraction=None, random_state=SEED)
    model.fit(train_features, train_target)

    # Out-of-fold predictions for the rows this fold's model did not train on.
    val_pred = model.predict(val_features)
    train_oof[val_index] = val_pred
    print("Fold mean absolute error:", mean_absolute_error(val_target, val_pred))

    # Each fold contributes 1/n_splits, so the final value is the fold average.
    test_preds += model.predict(x_test)/n_splits

    # Fancy indexing above created copies; release them before the next fold.
    del train_features, train_target, val_features, val_target
    gc.collect()

print("Total mean absolute error:", mean_absolute_error(y_train, train_oof))

# Write the averaged predictions by column name (not position) and omit the
# row index with the documented index=False.
submission[TARGET] = test_preds
submission.to_csv(f'../submissions/hgbr_starter_no_val_fraction_{n_splits}fold_0_sub.csv.zip', index=False, compression='zip')
submission.head()

Fitting fold 1
Fold mean absolute error: 1197.5765785634965
Fitting fold 2
Fold mean absolute error: 1196.8647165779423
Fitting fold 3
Fold mean absolute error: 1199.4986048485966
Fitting fold 4
Fold mean absolute error: 1206.2649664021365
Fitting fold 5
Fold mean absolute error: 1199.6789125600642
Total mean absolute error: 1199.9767239805967
CPU times: user 3min 33s, sys: 9min 47s, total: 13min 20s
Wall time: 50.9 s


Unnamed: 0,id,loss
0,4,1859.107804
1,6,2121.987592
2,9,9647.282584
3,12,5925.574101
4,15,1128.111563


Private score: 1190.65, Public Score: 1182.07