Based on Faron's script, as adapted in the following notebook: https://www.kaggle.com/tunguz/another-xgb-allstate-starter/

In [1]:
# Load the `watermark` IPython extension so the %watermark magic can be used in later cells.
%load_ext watermark

In [2]:
# Print the run timestamp together with Python, OS and hardware details of this environment.
%watermark

Last updated: 2023-05-24T11:21:01.523643-04:00

Python implementation: CPython
Python version       : 3.9.6
IPython version      : 8.10.0

Compiler    : Clang 14.0.3 (clang-1403.0.22.14.1)
OS          : Darwin
Release     : 22.4.0
Machine     : arm64
Processor   : arm
CPU cores   : 20
Architecture: 64bit



In [3]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
import gc
import catboost as cb

In [4]:
# Print the versions of the packages imported in this notebook.
%watermark --iversions

catboost: 1.1.1
pandas  : 2.0.0
numpy   : 1.23.5



In [5]:
# Notebook-wide configuration: column names, random seed and input file locations.
ID, TARGET = 'id', 'loss'
SEED = 0
DATA_DIR = "../input"

# All input files live directly under DATA_DIR as zipped CSVs.
TRAIN_FILE = DATA_DIR + "/train.csv.zip"
TEST_FILE = DATA_DIR + "/test.csv.zip"
SUBMISSION_FILE = DATA_DIR + "/sample_submission.csv.zip"


In [6]:
# Read the raw training and test tables.
train = pd.read_csv(TRAIN_FILE)
test = pd.read_csv(TEST_FILE)

# Target values as a 1-D NumPy array. `Series.ravel()` is deprecated in newer
# pandas releases; `to_numpy()` is the supported way to get the same array.
y_train = train[TARGET].to_numpy()

# Keep only the feature columns: the id is dropped from both frames and the
# target is dropped from the training frame. Reassigning (rather than
# `inplace=True`) keeps the same variable names that later cells rely on.
train = train.drop(columns=[ID, TARGET])
test = test.drop(columns=[ID])

print(f"{train.shape},{test.shape}")

# Number of training rows; used later to split the stacked train+test frame.
ntrain = train.shape[0]

(188318, 130),(125546, 130)


In [7]:
# Stack train and test so each categorical column gets one shared set of
# integer codes across both tables.
train_test = pd.concat((train, test)).reset_index(drop=True)

features = train.columns

# Every column whose name contains 'cat' is treated as categorical.
cats = [feat for feat in features if 'cat' in feat]
for feat in cats:
    # factorize(...)[0] is the integer code array; sort=True orders the codes
    # by the sorted unique labels rather than by order of first appearance.
    train_test[feat] = pd.factorize(train_test[feat], sort=True)[0]

print(train_test.head())

# Split the encoded frame back into train / test arrays by row position.
x_train = train_test.iloc[:ntrain, :].to_numpy()
x_test = train_test.iloc[ntrain:, :].to_numpy()

# Report the shapes of the arrays just built. The original printed
# train.shape / test.shape again, which this cell never modifies.
print(f"{x_train.shape},{x_test.shape}")


   cat1  cat2  cat3  cat4  ...    cont11    cont12    cont13    cont14
0     0     1     0     1  ...  0.569745  0.594646  0.822493  0.714843
1     0     1     0     0  ...  0.338312  0.366307  0.611431  0.304496
2     0     1     0     0  ...  0.381398  0.373424  0.195709  0.774425
3     1     1     0     1  ...  0.327915  0.321570  0.605077  0.602642
4     0     1     0     1  ...  0.204687  0.202213  0.246011  0.432606

[5 rows x 130 columns]
(188318, 130),(125546, 130)


In [8]:
%%time
# Baseline: one CatBoost regressor with default hyper-parameters, fitted on
# every training row. The seed is passed explicitly from the SEED constant
# (otherwise unused in this notebook) so the run does not depend on the
# library's implicit default.
gbdt = CatBoostRegressor(silent=True, random_seed=SEED)
gbdt.fit(x_train, y_train)

CPU times: user 1min 12s, sys: 37.8 s, total: 1min 50s
Wall time: 9.21 s


<catboost.core.CatBoostRegressor at 0x2f33df9a0>

In [9]:
# Use the sample submission as a template and fill in the baseline predictions.
submission = pd.read_csv(SUBMISSION_FILE)
# Assign by column name instead of by position so the write does not depend on
# the template's column order or on the dtype of its placeholder values.
submission[TARGET] = gbdt.predict(x_test)
# index=False: the row index is not part of the submission format.
submission.to_csv('../submissions/cb_starter_0_sub.csv.zip', index=False, compression='zip')
submission.head()

Unnamed: 0,id,loss
0,4,1658.947695
1,6,2014.277471
2,9,9645.376449
3,12,6634.372559
4,15,1066.327247


Leaderboard result for the single-model submission above — private score: 1172.56, public score: 1165.85

In [10]:
# Accumulators for the cross-validation run: one out-of-fold prediction slot
# per training row, and a running total for the averaged test predictions.
n_train_rows = len(x_train)
train_oof = np.zeros(n_train_rows)
test_preds = 0
train_oof.shape

(188318,)

In [11]:
%%time
n_splits = 5
kf = KFold(n_splits=n_splits, random_state=137, shuffle=True)

# Reset the accumulators inside this cell. Otherwise re-running the cell adds
# another round of fold predictions onto an already-filled test_preds.
train_oof = np.zeros(x_train.shape[0])
test_preds = np.zeros(x_test.shape[0])

for jj, (train_index, val_index) in enumerate(kf.split(x_train)):
    print("Fitting fold", jj+1)
    # Rows used for fitting in this fold.
    train_features = x_train[train_index]
    train_target = y_train[train_index]

    # Held-out rows used only for scoring this fold.
    val_features = x_train[val_index]
    val_target = y_train[val_index]

    # Pin the seed explicitly so every fold's model is reproducible.
    model = CatBoostRegressor(silent=True, random_seed=SEED)
    model.fit(train_features, train_target)
    val_pred = model.predict(val_features)
    # Each training row falls in exactly one validation fold, so this fills
    # train_oof once per row over the whole loop.
    train_oof[val_index] = val_pred
    print("Fold mean absolute error:", mean_absolute_error(val_target, val_pred))
    # Average the test predictions of the per-fold models.
    test_preds += model.predict(x_test)/n_splits
    # Drop the per-fold copies before the next iteration to limit memory use.
    del train_features, train_target, val_features, val_target
    gc.collect()

Fitting fold 1
Fold mean absolute error: 1181.4253904936747
Fitting fold 2
Fold mean absolute error: 1178.0230747538558
Fitting fold 3
Fold mean absolute error: 1182.6513232712928
Fitting fold 4
Fold mean absolute error: 1188.1975169152397
Fitting fold 5
Fold mean absolute error: 1181.058680690606
CPU times: user 5min 28s, sys: 2min 55s, total: 8min 23s
Wall time: 1min 5s


In [12]:
# Overall error of the out-of-fold predictions against the full training target.
oof_mae = mean_absolute_error(y_train, train_oof)
print("Total mean absolute error:", oof_mae)

Total mean absolute error: 1182.2711721938529


In [13]:
# Write the fold-averaged test predictions into the submission frame.
# Assign by column name instead of by position so the write does not depend on
# the frame's column order or on the dtype currently stored in that column.
submission[TARGET] = test_preds
# index=False: the row index is not part of the submission format.
submission.to_csv(f'../submissions/cb_starter_{n_splits}fold_0_sub.csv.zip', index=False, compression='zip')
submission.head()

Unnamed: 0,id,loss
0,4,1698.928861
1,6,2036.730995
2,9,9840.936116
3,12,6798.009651
4,15,1029.869226


Leaderboard result for the 5-fold averaged submission above — private score: 1169.86, public score: 1162.58

In [28]:
%%time

# Same 5-fold procedure as before, but fitted on the raw DataFrames with the
# categorical columns handed to CatBoost via `cat_features` instead of being
# integer-encoded beforehand.
train_oof = np.zeros(len(train))
test_preds = np.zeros(len(test))


n_splits = 5
kf = KFold(n_splits=n_splits, random_state=137, shuffle=True)

for jj, (train_index, val_index) in enumerate(kf.split(train)):
    print("Fitting fold", jj+1)
    # KFold yields positional row indices, so select with .iloc. The original
    # .loc lookup was label-based and only correct while `train` kept a
    # default 0..n-1 index.
    train_features = train.iloc[train_index]
    train_target = y_train[train_index]

    val_features = train.iloc[val_index]
    val_target = y_train[val_index]

    # Pin the seed explicitly so every fold's model is reproducible.
    model = CatBoostRegressor(silent=True, cat_features=cats, random_seed=SEED)
    model.fit(train_features, train_target)
    val_pred = model.predict(val_features)
    # Each training row falls in exactly one validation fold.
    train_oof[val_index] = val_pred
    print("Fold mean absolute error:", mean_absolute_error(val_target, val_pred))
    # Average the test predictions of the per-fold models.
    test_preds += model.predict(test)/n_splits
    # Drop the per-fold copies before the next iteration to limit memory use.
    del train_features, train_target, val_features, val_target
    gc.collect()

print("Total mean absolute error:", mean_absolute_error(y_train, train_oof))

# Assign by column name instead of by position so the write does not depend on
# the submission frame's column order or stored dtype.
submission[TARGET] = test_preds
submission.to_csv(f'../submissions/cb_starter_w_cats_{n_splits}fold_0_sub.csv.zip', index=False, compression='zip')
submission.head()

Fitting fold 1
Fold mean absolute error: 1181.5677766937297
Fitting fold 2
Fold mean absolute error: 1176.2118915805302
Fitting fold 3
Fold mean absolute error: 1183.841229618414
Fitting fold 4
Fold mean absolute error: 1189.8671553424153
Fitting fold 5
Fold mean absolute error: 1182.8334069610928
Total mean absolute error: 1182.864255016869
CPU times: user 1h 41min 43s, sys: 4min 59s, total: 1h 46min 42s
Wall time: 7min 39s


Unnamed: 0,id,loss
0,4,1732.661042
1,6,2079.361713
2,9,9693.19347
3,12,6509.881707
4,15,954.322373


Leaderboard result for the 5-fold submission trained with native categorical features — private score: 1172.97, public score: 1164.06