In [13]:
!pip install --upgrade --force-reinstall scikit-learn==1.0.0 --quiet
!pip install --upgrade lightgbm==3.2.1 --quiet

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.6.0 requires numpy~=1.19.2, but you have numpy 1.21.2 which is incompatible.
datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.
albumentations 0.1.12 requires imgaug<0.2.7,>=0.2.5, but you have imgaug 0.2.9 which is incompatible.[0m


In [14]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, RepeatedKFold
from sklearn.metrics import mean_squared_error

import lightgbm as lgb

from tqdm.auto import tqdm

In [42]:
class Config:
    SEED = 3655

    N_SPLITS = 5
    N_REPEATS = 2

    EARLY_STOPPING_ROUNDS = 50
    VERBOSE_EVAL = 100

    MODEL_PARAMS = {
        'metric': 'rmse',

        'boosting': 'gbdt',

        "linear_tree": True,

        'num_leaves': 32,
        'learning_rate': 0.1,
        'max_depth': 8,
        'tree_learner': 'voting',
        'verbose': 1,
        'n_jobs': -1,
        }

    TRAIN_PARAMS = {
        'num_boost_round': 1_000
        }

In [40]:
data = fetch_california_housing()

X = pd.DataFrame(data=data['data'], columns=data['feature_names'])
y = data['target']
feature_names = data['feature_names']

X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=.8, test_size=.2, random_state=Config.SEED)

train_dataset = lgb.Dataset(
    data=X_train,
    label=y_train,
    feature_name=feature_names
)

valid_dataset = lgb.Dataset(
    data=X_valid,
    label=y_valid,
    feature_name=feature_names
)

In [44]:
es = lgb.callback.early_stopping(stopping_rounds=Config.EARLY_STOPPING_ROUNDS)
log_eval = lgb.callback.print_evaluation(period=Config.VERBOSE_EVAL)

model = lgb.train(
    params=Config.MODEL_PARAMS, 
    train_set=train_dataset, 
    valid_sets=[train_dataset, valid_dataset], 
    valid_names=['train', 'valid'], 
    callbacks=[
               es,
               log_eval
               ],
    **Config.TRAIN_PARAMS
    )

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1838
[LightGBM] [Info] Number of data points in the train set: 16512, number of used features: 8
[LightGBM] [Info] Start training from score 2.064059
[1]	train's rmse: 1.07415	valid's rmse: 1.09765
Training until validation scores don't improve for 50 rounds
[2]	train's rmse: 1.00337	valid's rmse: 1.02533
[3]	train's rmse: 0.939754	valid's rmse: 0.96051
[4]	train's rmse: 0.880569	valid's rmse: 0.900256
[5]	train's rmse: 0.831001	valid's rmse: 0.850012
[6]	train's rmse: 0.787256	valid's rmse: 0.805869
[7]	train's rmse: 0.749011	valid's rmse: 0.767588
[8]	train's rmse: 0.716512	valid's rmse: 0.735781
[9]	train's rmse: 0.688826	valid's rmse: 0.708487
[10]	train's rmse: 0.664918	valid's rmse: 0.685564
[11]	train's rmse: 0.642954	valid's rmse: 0.664714
[12]	train's rmse: 0.622934	valid's rmse: 0.645134
[13]	train's rmse: 0.605953	valid's rmse: 0.629209
[14]	train's rmse: 0.590942	valid's rmse: 0.615041
[1