Based on Faron's script, as adapted in the following notebook: https://www.kaggle.com/tunguz/another-xgb-allstate-starter/. This notebook trains LightGBM's `LGBMRegressor` rather than XGBoost.

In [1]:
# Register the `watermark` IPython extension so the %watermark magic can be used in later cells.
%load_ext watermark

In [2]:
# Record when and where this notebook was run: timestamp, Python/IPython versions, compiler, OS and CPU details.
%watermark

Last updated: 2023-05-24T11:10:26.795551-04:00

Python implementation: CPython
Python version       : 3.9.6
IPython version      : 8.10.0

Compiler    : Clang 14.0.3 (clang-1403.0.22.14.1)
OS          : Darwin
Release     : 22.4.0
Machine     : arm64
Processor   : arm
CPU cores   : 20
Architecture: 64bit



In [3]:
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
import gc
import xgboost as xgb

In [4]:
# Print the versions of the imported packages that watermark detects.
# NOTE(review): lightgbm is absent from the recorded output, presumably because only
# LGBMRegressor is imported by name rather than the module itself — confirm.
%watermark --iversions

pandas : 2.0.0
numpy  : 1.23.5
xgboost: 1.7.5



In [5]:
# Column names used throughout the notebook.
ID = 'id'        # row identifier column; dropped before modelling
TARGET = 'loss'  # regression target column

# Seed constant; no cell visible in this notebook references it.
SEED = 0

# Directory holding the zipped competition CSV files.
DATA_DIR = "../input"

# Build the three input paths from DATA_DIR in a single pass.
TRAIN_FILE, TEST_FILE, SUBMISSION_FILE = (
    f"{DATA_DIR}/{stem}.csv.zip"
    for stem in ("train", "test", "sample_submission")
)


In [6]:
# Read the raw training and test tables from the zipped CSV files.
train = pd.read_csv(TRAIN_FILE)
test = pd.read_csv(TEST_FILE)

# Extract the target as a NumPy array before the column is removed from the features.
# Series.to_numpy() is used because Series.ravel() is deprecated in newer pandas releases.
y_train = train[TARGET].to_numpy()

# Remove the non-feature columns by reassignment instead of inplace=True.
train = train.drop(columns=[ID, TARGET])
test = test.drop(columns=[ID])

print(f"{train.shape},{test.shape}")

# Number of training rows; used later to split the stacked train+test frame apart again.
ntrain = train.shape[0]

(188318, 130),(125546, 130)


In [7]:
# Stack train and test so every categorical column gets a single, shared integer coding.
train_test = pd.concat((train, test)).reset_index(drop=True)

features = train.columns

# Columns whose name contains 'cat' are treated as categorical.
cats = [feat for feat in features if 'cat' in feat]
for feat in cats:
    # factorize(...)[0] holds the integer codes; sort=True orders the codes by the sorted unique values.
    train_test[feat] = pd.factorize(train_test[feat], sort=True)[0]

print(train_test.head())

# Split the stacked frame back into the train and test feature matrices.
x_train = np.array(train_test.iloc[:ntrain, :])
x_test = np.array(train_test.iloc[ntrain:, :])

# Report the shapes of the matrices that were just built, so a bad split is visible here.
print(f"{x_train.shape},{x_test.shape}")


   cat1  cat2  cat3  cat4  ...    cont11    cont12    cont13    cont14
0     0     1     0     1  ...  0.569745  0.594646  0.822493  0.714843
1     0     1     0     0  ...  0.338312  0.366307  0.611431  0.304496
2     0     1     0     0  ...  0.381398  0.373424  0.195709  0.774425
3     1     1     0     1  ...  0.327915  0.321570  0.605077  0.602642
4     0     1     0     1  ...  0.204687  0.202213  0.246011  0.432606

[5 rows x 130 columns]
(188318, 130),(125546, 130)


In [8]:
%%time
gbdt = LGBMRegressor()
gbdt.fit(x_train, y_train)

CPU times: user 4.24 s, sys: 7.62 s, total: 11.9 s
Wall time: 2.17 s


In [9]:
# Start from the sample submission so the id column and row order come from the provided template.
submission = pd.read_csv(SUBMISSION_FILE)

# Assign by column name rather than by position: this replaces the column outright, so the
# result does not depend on the column order or on the dtype the template column was parsed as.
submission[TARGET] = gbdt.predict(x_test)

# index=False keeps the row index out of the file.
# NOTE(review): the ../submissions directory is assumed to exist already.
submission.to_csv('../submissions/lgbm_starter_0_sub.csv.zip', index=False, compression='zip')
submission.head()

Unnamed: 0,id,loss
0,4,1881.360763
1,6,2103.158188
2,9,9701.287385
3,12,5840.425772
4,15,1177.210988


Private score: 1194.53, Public score: 1184.95

In [10]:
# Out-of-fold predictions: one slot per training row, written by the cross-validation loop.
train_oof = np.zeros(len(x_train))
# Accumulator for the fold-averaged test predictions; starts as a scalar and is added to per fold.
test_preds = 0
train_oof.shape

(188318,)

In [11]:
%%time
n_splits = 5
kf = KFold(n_splits=n_splits, random_state=137, shuffle=True)

for jj, (train_index, val_index) in enumerate(kf.split(x_train)):
    print("Fitting fold", jj+1)
    train_features = x_train[train_index]
    train_target = y_train[train_index]
    
    val_features = x_train[val_index]
    val_target = y_train[val_index]
    
    model = LGBMRegressor()
    model.fit(train_features, train_target)
    val_pred = model.predict(val_features)
    train_oof[val_index] = val_pred
    print("Fold mean absolute error:", mean_absolute_error(val_target, val_pred))
    test_preds += model.predict(x_test)/n_splits
    del train_features, train_target, val_features, val_target
    gc.collect()

Fitting fold 1
Fold mean absolute error: 1198.9661411160635
Fitting fold 2
Fold mean absolute error: 1196.9711425629957
Fitting fold 3
Fold mean absolute error: 1201.9087446655756
Fitting fold 4
Fold mean absolute error: 1207.4961376984218
Fitting fold 5
Fold mean absolute error: 1203.311136477671
CPU times: user 25.3 s, sys: 37.7 s, total: 1min 3s
Wall time: 11.3 s


In [12]:
# Mean absolute error of the out-of-fold predictions over all training rows.
oof_mae = mean_absolute_error(y_train, train_oof)
print("Total mean absolute error:", oof_mae)

Total mean absolute error: 1201.7306214959087


In [13]:
# Reuse the submission frame loaded earlier and overwrite its target column with the
# fold-averaged test predictions. Assigning by column name replaces the column outright,
# so the result does not depend on the column order or on the existing column dtype.
submission[TARGET] = test_preds

# index=False keeps the row index out of the file.
# NOTE(review): the ../submissions directory is assumed to exist already.
submission.to_csv(f'../submissions/lgbm_starter_{n_splits}fold_0_sub.csv.zip', index=False, compression='zip')
submission.head()

Unnamed: 0,id,loss
0,4,1861.688305
1,6,2138.976906
2,9,9679.755663
3,12,5952.069958
4,15,1075.352652


Private score: 1190.48, Public score: 1181.30