In [22]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from collections import defaultdict, namedtuple
from tqdm.auto import tqdm

sns.set()

# Libraries

In [3]:
import catboost
import xgboost
import lightgbm

# Datasets

In [5]:
from catboost.datasets import (
    adult,
    amazon,
    higgs
)

In [178]:
DataSource = namedtuple('DataSource', ['name', 'source', 'target'])
datasets = [
    DataSource('adult', adult(), 'income'),
    DataSource('higgs', higgs(), 0),
]

catboost_params = {
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'iterations': 100,
    'learning_rate': 0.03,
    'random_seed': 42,
    'l2_leaf_reg': 3.0,
    'depth': 6,
    'max_leaves': 31,
    'thread_count': 5
}
xgboost_params = {}
lightgbm_params = {
    'objective': 'cross_entropy',
    'num_iterations': 100,
    'learning_rate': 0.03,
    'num_leaves': 31,
    'max_depth': 6,
    'lambda_l2': 3.0,
    'num_threads': 5,
    'seed': 42
}

In [56]:
from sklearn.metrics import roc_auc_score

# Catboost

In [150]:
def prepare_pools(dataset):
    train_df, test_df = dataset.source
    train_df.fillna(-999, inplace=True)
    test_df.fillna(-999, inplace=True)

    if dataset.name == 'adult':
        train_df[dataset.target] = train_df[dataset.target].map({'<=50K': 0, '>50K': 1}, na_action='ignore')

    cat_features = [
        col for col in train_df.columns[train_df.dtypes == object]
        if col != dataset.target
    ]
    train_pool = catboost.Pool(train_df.drop(columns=[dataset.target]), train_df[dataset.target], cat_features=cat_features)
    test_pool = catboost.Pool(test_df.drop(columns=[dataset.target]), test_df[dataset.target], cat_features=cat_features)
    
    return train_pool, test_pool

In [106]:
catboost_results = defaultdict(list)

for dataset in tqdm(datasets):
    train_pool, test_pool = prepare_pools(dataset)
    
    cv_results, cv_models = catboost.cv(
        train_pool,
        catboost_params,
        nfold=5,
        partition_random_seed=42,
        verbose=False,
        return_models=True
    )

    single_model = catboost.CatBoost(catboost_params).fit(train_pool, verbose=False)
    single_model_prediction = single_model.predict(test_pool)
    cv_models_prediction = [model.predict(test_pool) for model in cv_models]
    test_df = dataset.source[1]
    catboost_results['name'].append(dataset.name)
    catboost_results['cv_roc_auc_ensemble'].append(roc_auc_score(
        test_df[dataset.target], np.mean(cv_models_prediction, axis=0)
    ))
    catboost_results['cv_roc_auc_mean'].append(
        np.mean([roc_auc_score(test_df[dataset.target], prediction)
                for prediction in cv_models_prediction], 
                axis=0)
    )
    catboost_results['cv_roc_auc_std'].append(
        np.std([roc_auc_score(test_df[dataset.target], prediction)
                for prediction in cv_models_prediction], 
                axis=0)
    )
    catboost_results['single_roc_auc'].append(
        roc_auc_score(test_df[dataset.target], single_model_prediction)
    )

  0%|          | 0/1 [00:00<?, ?it/s]

In [107]:
pd.DataFrame(catboost_results)

Unnamed: 0,name,cv_roc_auc_ensemble,cv_roc_auc_mean,cv_roc_auc_std,single_roc_auc
0,adult,0.916009,0.915398,0.000537,0.911577
1,higgs,0.778608,0.778434,0.000271,0.778236


# Lightgbm

In [134]:
from sklearn.preprocessing import LabelEncoder

In [179]:
def prepare_lightgbm_dataset(dataset):
    train_df, test_df = dataset.source

    if dataset.name == 'adult':
        train_df[dataset.target] = train_df[dataset.target].map({'<=50K': 0, '>50K': 1}, na_action='ignore')

    cat_features = [
        col for col in train_df.columns[train_df.dtypes == object]
        if col != dataset.target
    ]
    encoders = [LabelEncoder() for _ in cat_features]
    
    for feature, encoder in zip(cat_features, encoders):
        train_df[feature] = encoder.fit_transform(train_df[feature])
        test_df[feature] = encoder.transform(test_df[feature])
    train_df.fillna(-999, inplace=True)
    test_df.fillna(-999, inplace=True)
    
    cat_features = [feature for feature in cat_features if feature != dataset.target]
    
    train_pool = lightgbm.Dataset(train_df.drop(columns=[dataset.target]), train_df[dataset.target], 
                                  categorical_feature=cat_features, free_raw_data=False)
    test_pool = lightgbm.Dataset(test_df.drop(columns=[dataset.target]), test_df[dataset.target], 
                                 categorical_feature=cat_features, free_raw_data=False)
    
    return train_pool, test_pool, cat_features

In [180]:
lightgbm_results = defaultdict(list)

for dataset in tqdm(datasets):
    train_pool, test_pool, cat_features = prepare_lightgbm_dataset(dataset)
    
    cv_results = lightgbm.cv(
        lightgbm_params,
        train_pool,
        nfold=5,
        seed=42,
        verbose_eval=False,
        return_cvbooster=True
    )
    
    cv_models = cv_results['cvbooster'].boosters

    single_model = lightgbm.train(lightgbm_params, train_pool)
    test_df = dataset.source[1]
    single_model_prediction = single_model.predict(test_df.drop(columns=[dataset.target]), categorical_feature=cat_features)
    cv_models_prediction = [model.predict(test_df.drop(columns=[dataset.target])) for model in cv_models]
    lightgbm_results['name'].append(dataset.name)
    lightgbm_results['cv_roc_auc_ensemble'].append(roc_auc_score(
        test_df[dataset.target], np.mean(cv_models_prediction, axis=0)
    ))
    lightgbm_results['cv_roc_auc_mean'].append(
        np.mean([roc_auc_score(test_df[dataset.target], prediction)
                for prediction in cv_models_prediction], 
                axis=0)
    )
    lightgbm_results['cv_roc_auc_std'].append(
        np.std([roc_auc_score(test_df[dataset.target], prediction)
                for prediction in cv_models_prediction], 
                axis=0)
    )
    lightgbm_results['single_roc_auc'].append(
        roc_auc_score(test_df[dataset.target], single_model_prediction)
    )

  0%|          | 0/2 [00:00<?, ?it/s]



[LightGBM] [Info] [cross_entropy:Init]: (objective) labels passed interval [0, 1] check
[LightGBM] [Info] [cross_entropy:Init]: (metric) labels passed interval [0, 1] check
[LightGBM] [Info] [cross_entropy:Init]: sum-of-weights = 26048.000000
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 705
[LightGBM] [Info] Number of data points in the train set: 26048, number of used features: 14
[LightGBM] [Info] [cross_entropy:Init]: (metric) labels passed interval [0, 1] check
[LightGBM] [Info] [cross_entropy:Init]: sum-of-weights = 6513.000000
[LightGBM] [Info] [cross_entropy:Init]: (objective) labels passed interval [0, 1] check
[LightGBM] [Info] [cross_entropy:Init]: (metric) labels passed interval [0, 1] check
[LightGBM] [Info] [cross_entropy:Init]: sum-of-weights = 26049.000000
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wi



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 705
[LightGBM] [Info] Number of data points in the train set: 32561, number of used features: 14
[LightGBM] [Info] [cross_entropy:BoostFromScore]: pavg = 0.240810 -> initscore = -1.148246
[LightGBM] [Info] Start training from score -1.148246




[LightGBM] [Info] [cross_entropy:Init]: (objective) labels passed interval [0, 1] check
[LightGBM] [Info] [cross_entropy:Init]: (metric) labels passed interval [0, 1] check
[LightGBM] [Info] [cross_entropy:Init]: sum-of-weights = 8400000.000000
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6132
[LightGBM] [Info] Number of data points in the train set: 8400000, number of used features: 28
[LightGBM] [Info] [cross_entropy:Init]: (metric) labels passed interval [0, 1] check
[LightGBM] [Info] [cross_entropy:Init]: sum-of-weights = 2100000.000000
[LightGBM] [Info] [cross_entropy:Init]: (objective) labels passed interval [0, 1] check
[LightGBM] [Info] [cross_entropy:Init]: (metric) labels passed interval [0, 1] check
[LightGBM] [Info] [cross_entropy:Init]: sum-of-weights = 8400000.000000
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `fo



[LightGBM] [Info] [cross_entropy:Init]: (objective) labels passed interval [0, 1] check
[LightGBM] [Info] [cross_entropy:Init]: (metric) labels passed interval [0, 1] check
[LightGBM] [Info] [cross_entropy:Init]: sum-of-weights = 10500000.000000
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6132
[LightGBM] [Info] Number of data points in the train set: 10500000, number of used features: 28
[LightGBM] [Info] [cross_entropy:BoostFromScore]: pavg = 0.529963 -> initscore = 0.119997
[LightGBM] [Info] Start training from score 0.119997


In [181]:
pd.DataFrame(lightgbm_results)

Unnamed: 0,name,cv_roc_auc_ensemble,cv_roc_auc_mean,cv_roc_auc_std,single_roc_auc
0,adult,0.919005,0.918514,0.000467,0.918703
1,higgs,0.792647,0.7925,9.9e-05,0.792469
