come from:https://www.kaggle.com/code/majiaqi111/home-credit-lgb-cat-ensemble

&

https://www.kaggle.com/code/tamnguyen280/home-credit-lgb-cat-xgb-ensemble

In [1]:
import sys
from pathlib import Path
import subprocess
import os
import gc
from glob import glob

import numpy as np
import pandas as pd
import polars as pl
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

ROOT = '/kaggle/input/home-credit-credit-risk-model-stability'



In [2]:
from sklearn.model_selection import TimeSeriesSplit, GroupKFold, StratifiedGroupKFold
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import KNNImputer

In [3]:
class Pipeline:

    def set_table_dtypes(df):
        for col in df.columns:
            if col in ["case_id", "WEEK_NUM", "num_group1", "num_group2"]:
                df = df.with_columns(pl.col(col).cast(pl.Int64))
            elif col in ["date_decision"]:
                df = df.with_columns(pl.col(col).cast(pl.Date))
            elif col[-1] in ("P", "A"):
                df = df.with_columns(pl.col(col).cast(pl.Float64))
            elif col[-1] in ("M",):
                df = df.with_columns(pl.col(col).cast(pl.String))
            elif col[-1] in ("D",):
                df = df.with_columns(pl.col(col).cast(pl.Date))
        return df

    def handle_dates(df):
        for col in df.columns:
            if col[-1] in ("D",):
                df = df.with_columns(pl.col(col) - pl.col("date_decision"))  #!!?
                df = df.with_columns(pl.col(col).dt.total_days()) # t - t-1
        df = df.drop("date_decision", "MONTH")
        return df

    def filter_cols(df):
        for col in df.columns:
            if col not in ["target", "case_id", "WEEK_NUM"]:
                isnull = df[col].is_null().mean()
                if isnull > 0.7:
                    df = df.drop(col)
        
        for col in df.columns:
            if (col not in ["target", "case_id", "WEEK_NUM"]) & (df[col].dtype == pl.String):
                freq = df[col].n_unique()
                if (freq == 1) | (freq > 200):
                    df = df.drop(col)
        
        return df


In [4]:
class Aggregator:
    #Please add or subtract features yourself, be aware that too many features will take up too much space.
    def num_expr(df):
        cols = [col for col in df.columns if col[-1] in ("P", "A")]
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        
        expr_last = [pl.last(col).alias(f"last_{col}") for col in cols]
        #expr_first = [pl.first(col).alias(f"first_{col}") for col in cols]
        expr_mean = [pl.mean(col).alias(f"mean_{col}") for col in cols]
        return expr_max +expr_last+expr_mean
    
    def date_expr(df):
        cols = [col for col in df.columns if col[-1] in ("D")]
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        #expr_min = [pl.min(col).alias(f"min_{col}") for col in cols]
        expr_last = [pl.last(col).alias(f"last_{col}") for col in cols]
        #expr_first = [pl.first(col).alias(f"first_{col}") for col in cols]
        expr_mean = [pl.mean(col).alias(f"mean_{col}") for col in cols]
        return  expr_max +expr_last+expr_mean
    
    def str_expr(df):
        cols = [col for col in df.columns if col[-1] in ("M",)]
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        #expr_min = [pl.min(col).alias(f"min_{col}") for col in cols]
        expr_last = [pl.last(col).alias(f"last_{col}") for col in cols]
        #expr_first = [pl.first(col).alias(f"first_{col}") for col in cols]
        #expr_count = [pl.count(col).alias(f"count_{col}") for col in cols]
        return  expr_max +expr_last#+expr_count
    
    def other_expr(df):
        cols = [col for col in df.columns if col[-1] in ("T", "L")]
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        #expr_min = [pl.min(col).alias(f"min_{col}") for col in cols]
        expr_last = [pl.last(col).alias(f"last_{col}") for col in cols]
        #expr_first = [pl.first(col).alias(f"first_{col}") for col in cols]
        return  expr_max +expr_last
    
    def count_expr(df):
        cols = [col for col in df.columns if "num_group" in col]
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols] 
        #expr_min = [pl.min(col).alias(f"min_{col}") for col in cols]
        expr_last = [pl.last(col).alias(f"last_{col}") for col in cols]
        #expr_first = [pl.first(col).alias(f"first_{col}") for col in cols]
        return  expr_max +expr_last
    
    def get_exprs(df):
        exprs = Aggregator.num_expr(df) + \
                Aggregator.date_expr(df) + \
                Aggregator.str_expr(df) + \
                Aggregator.other_expr(df) + \
                Aggregator.count_expr(df)

        return exprs

def read_file(path, depth=None):
    df = pl.read_parquet(path)
    df = df.pipe(Pipeline.set_table_dtypes)
    if depth in [1,2]:
        df = df.group_by("case_id").agg(Aggregator.get_exprs(df)) 
    return df

def read_files(regex_path, depth=None):
    chunks = []
    
    for path in glob(str(regex_path)):
        df = pl.read_parquet(path)
        df = df.pipe(Pipeline.set_table_dtypes)
        if depth in [1, 2]:
            df = df.group_by("case_id").agg(Aggregator.get_exprs(df))
        chunks.append(df)
    
    df = pl.concat(chunks, how="vertical_relaxed")
    df = df.unique(subset=["case_id"])
    return df

def feature_eng(df_base, depth_0, depth_1, depth_2):
    df_base = (
        df_base
        .with_columns(
            month_decision = pl.col("date_decision").dt.month(),
            weekday_decision = pl.col("date_decision").dt.weekday(),
        )
    )
    for i, df in enumerate(depth_0 + depth_1 + depth_2):
        df_base = df_base.join(df, how="left", on="case_id", suffix=f"_{i}")
    df_base = df_base.pipe(Pipeline.handle_dates)
    return df_base



In [5]:
def to_pandas(df_data, cat_cols=None):
    df_data = df_data.to_pandas()
    if cat_cols is None:
        cat_cols = list(df_data.select_dtypes("object").columns)
    df_data[cat_cols] = df_data[cat_cols].astype("category")
    return df_data, cat_cols


In [6]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        if str(col_type)=="category":
            continue
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            continue
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [7]:
ROOT            = Path("/kaggle/input/home-credit-credit-risk-model-stability")

TRAIN_DIR       = ROOT / "parquet_files" / "train"
TEST_DIR        = ROOT / "parquet_files" / "test"

In [8]:
data_store = {
    "df_base": read_file(TRAIN_DIR / "train_base.parquet"),
    "depth_0": [
        read_file(TRAIN_DIR / "train_static_cb_0.parquet"),
        read_files(TRAIN_DIR / "train_static_0_*.parquet"),
    ],
    "depth_1": [
        read_files(TRAIN_DIR / "train_applprev_1_*.parquet", 1),
        read_file(TRAIN_DIR / "train_tax_registry_a_1.parquet", 1),
        read_file(TRAIN_DIR / "train_tax_registry_b_1.parquet", 1),
        read_file(TRAIN_DIR / "train_tax_registry_c_1.parquet", 1),
        read_files(TRAIN_DIR / "train_credit_bureau_a_1_*.parquet", 1),
        read_file(TRAIN_DIR / "train_credit_bureau_b_1.parquet", 1),
        read_file(TRAIN_DIR / "train_other_1.parquet", 1),
        read_file(TRAIN_DIR / "train_person_1.parquet", 1),
        read_file(TRAIN_DIR / "train_deposit_1.parquet", 1),
        read_file(TRAIN_DIR / "train_debitcard_1.parquet", 1),
    ],
    "depth_2": [
        read_file(TRAIN_DIR / "train_credit_bureau_b_2.parquet", 2),
        read_files(TRAIN_DIR / "train_credit_bureau_a_2_*.parquet", 2),
        read_file(TRAIN_DIR / "train_applprev_2.parquet", 2),
        read_file(TRAIN_DIR / "train_person_2.parquet", 2)
    ]
}

In [9]:
df_train = feature_eng(**data_store)
print("train data shape:\t", df_train.shape)
del data_store
gc.collect()
df_train = df_train.pipe(Pipeline.filter_cols)
df_train, cat_cols = to_pandas(df_train)
df_train = reduce_mem_usage(df_train)
print("train data shape:\t", df_train.shape)
nums=df_train.select_dtypes(exclude='category').columns
from itertools import combinations, permutations
#df_train=df_train[nums]
nans_df = df_train[nums].isna()
nans_groups={}
for col in nums:
    cur_group = nans_df[col].sum()
    try:
        nans_groups[cur_group].append(col)
    except:
        nans_groups[cur_group]=[col]
del nans_df; x=gc.collect()

def reduce_group(grps):
    use = []
    for g in grps:
        mx = 0; vx = g[0]
        for gg in g:
            n = df_train[gg].nunique()
            if n>mx:
                mx = n
                vx = gg
            #print(str(gg)+'-'+str(n),', ',end='')
        use.append(vx)
        #print()
    print('Use these',use)
    return use

def group_columns_by_correlation(matrix, threshold=0.8):
    # 计算列之间的相关性
    correlation_matrix = matrix.corr()

    # 分组列
    groups = []
    remaining_cols = list(matrix.columns)
    while remaining_cols:
        col = remaining_cols.pop(0)
        group = [col]
        correlated_cols = [col]
        for c in remaining_cols:
            if correlation_matrix.loc[col, c] >= threshold:
                group.append(c)
                correlated_cols.append(c)
        groups.append(group)
        remaining_cols = [c for c in remaining_cols if c not in correlated_cols]
    
    return groups

uses=[]
for k,v in nans_groups.items():
    if len(v)>1:
            Vs = nans_groups[k]
            #cross_features=list(combinations(Vs, 2))
            #make_corr(Vs)
            grps= group_columns_by_correlation(df_train[Vs], threshold=0.8)
            use=reduce_group(grps)
            uses=uses+use
            #make_corr(use)
    else:
        uses=uses+v
    print('####### NAN count =',k)
print(uses)
print(len(uses))
uses=uses+list(df_train.select_dtypes(include='category').columns)
print(len(uses))
df_train=df_train[uses]

train data shape:	 (1526659, 861)
Memory usage of dataframe is 4322.75 MB
Memory usage after optimization is: 1528.81 MB
Decreased by 64.6%
train data shape:	 (1526659, 472)
Use these ['case_id', 'WEEK_NUM', 'target', 'month_decision', 'weekday_decision', 'credamount_770A', 'applicationcnt_361L', 'applications30d_658L', 'applicationscnt_1086L', 'applicationscnt_464L', 'applicationscnt_867L', 'clientscnt_1022L', 'clientscnt_100L', 'clientscnt_1071L', 'clientscnt_1130L', 'clientscnt_157L', 'clientscnt_257L', 'clientscnt_304L', 'clientscnt_360L', 'clientscnt_493L', 'clientscnt_533L', 'clientscnt_887L', 'clientscnt_946L', 'deferredmnthsnum_166L', 'disbursedcredamount_1113A', 'downpmt_116A', 'homephncnt_628L', 'isbidproduct_1095L', 'mobilephncnt_593L', 'numactivecreds_622L', 'numactivecredschannel_414L', 'numactiverelcontr_750L', 'numcontrs3months_479L', 'numnotactivated_1143L', 'numpmtchanneldd_318L', 'numrejects9m_859L', 'sellerplacecnt_915L', 'max_mainoccupationinc_384A', 'max_birth_259D

In [10]:
sample = pd.read_csv("/kaggle/input/home-credit-credit-risk-model-stability/sample_submission.csv")
device='gpu'
# n_samples=200000
n_est=6000
DRY_RUN = True if sample.shape[0] == 10 else False   
if DRY_RUN:
    device='cpu'
    df_train = df_train.iloc[:50000]
    # n_samples=10000
    # n_est=2400
print(device)

cpu


In [11]:
data_store = {
    "df_base": read_file(TEST_DIR / "test_base.parquet"),
    "depth_0": [
        read_file(TEST_DIR / "test_static_cb_0.parquet"),
        read_files(TEST_DIR / "test_static_0_*.parquet"),
    ],
    "depth_1": [
        read_files(TEST_DIR / "test_applprev_1_*.parquet", 1),
        read_file(TEST_DIR / "test_tax_registry_a_1.parquet", 1),
        read_file(TEST_DIR / "test_tax_registry_b_1.parquet", 1),
        read_file(TEST_DIR / "test_tax_registry_c_1.parquet", 1),
        read_files(TEST_DIR / "test_credit_bureau_a_1_*.parquet", 1),
        read_file(TEST_DIR / "test_credit_bureau_b_1.parquet", 1),
        read_file(TEST_DIR / "test_other_1.parquet", 1),
        read_file(TEST_DIR / "test_person_1.parquet", 1),
        read_file(TEST_DIR / "test_deposit_1.parquet", 1),
        read_file(TEST_DIR / "test_debitcard_1.parquet", 1),
    ],
    "depth_2": [
        read_file(TEST_DIR / "test_credit_bureau_b_2.parquet", 2),
        read_files(TEST_DIR / "test_credit_bureau_a_2_*.parquet", 2),
        read_file(TEST_DIR / "test_applprev_2.parquet", 2),
        read_file(TEST_DIR / "test_person_2.parquet", 2)
    ]
}

In [12]:
df_test = feature_eng(**data_store)
print("test data shape:\t", df_test.shape)
del data_store
gc.collect()
df_test = df_test.select([col for col in df_train.columns if col != "target"])
print("train data shape:\t", df_train.shape)
print("test data shape:\t", df_test.shape)

df_test, cat_cols = to_pandas(df_test, cat_cols)
df_test = reduce_mem_usage(df_test)

gc.collect()

test data shape:	 (10, 860)
train data shape:	 (50000, 389)
test data shape:	 (10, 388)
Memory usage of dataframe is 0.04 MB
Memory usage after optimization is: 0.02 MB
Decreased by 40.3%


0

### Feature Selection

In [13]:
y = df_train["target"]
weeks = df_train["WEEK_NUM"]
df_train= df_train.drop(columns=["target", "case_id", "WEEK_NUM"])
cv = StratifiedGroupKFold(n_splits=5, shuffle=False)


In [14]:
df_train[cat_cols] = df_train[cat_cols].astype(str)
df_test[cat_cols] = df_test[cat_cols].astype(str)

In [15]:
params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 15,  
    "learning_rate": 0.03,
    "n_estimators": 3400,  
    "colsample_bytree": 0.6,
    "colsample_bynode": 0.6,
    "verbose": -1,
    "random_state": 42,
    "reg_alpha": 0.1,
    "reg_lambda": 10,
    "extra_trees":True,
    'num_leaves':32,
    "device": device, 
    "verbose": -1,
}

In [16]:
params2 = {
    "booster": "gbtree",
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "n_estimators" : 2000,
    "bagging_fraction" : 0.9,
    "feature_fraction" : 0.8,
    "gamma" : 0.7,
    "min_child_samples" : 120,
    "num_leaves" : 220,
    "max_depth": 17,
    "learning_rate": 0.01,
    "colsample_bytree": 0.6,
    "colsample_bynode": 0.6,
    "alpha": 0.4,  
    "lambda": 0.2,  
    "subsample" : 0.7,
    "tree_method": 'gpu_hist' if device == 'gpu' else 'auto',
    "random_state": 42,
    "verbosity": 0,
    "enable_categorical":True,
}

In [17]:
from catboost import CatBoostClassifier, Pool
import xgboost as xgb

fitted_models_cat = []
fitted_models_lgb = []
fitted_models_xgb = []

cv_scores_cat = []
cv_scores_lgb = []
cv_scores_xgb = []


for idx_train, idx_valid in cv.split(df_train, y, groups=weeks):
    X_train, y_train = df_train.iloc[idx_train], y.iloc[idx_train]
    X_valid, y_valid = df_train.iloc[idx_valid], y.iloc[idx_valid]
    train_pool = Pool(X_train, y_train,cat_features=cat_cols)
    val_pool = Pool(X_valid, y_valid,cat_features=cat_cols)
    clf = CatBoostClassifier(
    eval_metric='AUC',
    task_type='GPU',
    learning_rate=0.03,
    iterations=n_est)
    random_seed=3107
    clf.fit(train_pool, eval_set=val_pool,verbose=300)
    fitted_models_cat.append(clf)
    y_pred_valid = clf.predict_proba(X_valid)[:,1]
    auc_score = roc_auc_score(y_valid, y_pred_valid)
    cv_scores_cat.append(auc_score)
    
    
    X_train[cat_cols] = X_train[cat_cols].astype("category")
    X_valid[cat_cols] = X_valid[cat_cols].astype("category")
    
    model = lgb.LGBMClassifier(**params)
    model.fit(
        X_train, y_train,
        eval_set = [(X_valid, y_valid)],
        callbacks = [lgb.log_evaluation(200), lgb.early_stopping(100)] )
    
    fitted_models_lgb.append(model)
    y_pred_valid = model.predict_proba(X_valid)[:,1]
    auc_score = roc_auc_score(y_valid, y_pred_valid)
    cv_scores_lgb.append(auc_score)
    
    model2 = xgb.XGBClassifier(**params2)
    model2.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        early_stopping_rounds=100, verbose=False)
    
    fitted_models_xgb.append(model2)
    
    y_pred_valid = model2.predict_proba(X_valid)[:, 1]
    auc_score = roc_auc_score(y_valid, y_pred_valid)
    cv_scores_xgb.append(auc_score)
    
    
    
    
print("CV AUC scores: ", cv_scores_cat)
print("Maximum CV AUC score: ", max(cv_scores_cat))


print("CV AUC scores: ", cv_scores_lgb)
print("Maximum CV AUC score: ", max(cv_scores_lgb))

print("CV AUC scores: ", cv_scores_xgb)
print("Maximum CV AUC score: ", max(cv_scores_xgb))

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5595068	best: 0.5595068 (0)	total: 271ms	remaining: 27m 7s
300:	test: 0.7363657	best: 0.7366523 (290)	total: 28.7s	remaining: 9m 3s
600:	test: 0.7408520	best: 0.7409019 (595)	total: 57.3s	remaining: 8m 34s
900:	test: 0.7423584	best: 0.7429192 (845)	total: 1m 25s	remaining: 8m 4s
1200:	test: 0.7429261	best: 0.7431214 (1045)	total: 1m 54s	remaining: 7m 36s
1500:	test: 0.7406867	best: 0.7431214 (1045)	total: 2m 22s	remaining: 7m 8s
1800:	test: 0.7382868	best: 0.7431214 (1045)	total: 2m 51s	remaining: 6m 40s
2100:	test: 0.7377582	best: 0.7431214 (1045)	total: 3m 20s	remaining: 6m 11s
2400:	test: 0.7370723	best: 0.7431214 (1045)	total: 3m 49s	remaining: 5m 43s
2700:	test: 0.7356693	best: 0.7431214 (1045)	total: 4m 17s	remaining: 5m 15s
3000:	test: 0.7346829	best: 0.7431214 (1045)	total: 4m 46s	remaining: 4m 46s
3300:	test: 0.7334839	best: 0.7431214 (1045)	total: 5m 16s	remaining: 4m 18s
3600:	test: 0.7325352	best: 0.7431214 (1045)	total: 5m 45s	remaining: 3m 50s
3900:	test: 0.732

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5960909	best: 0.5960909 (0)	total: 168ms	remaining: 16m 48s
300:	test: 0.7333127	best: 0.7334326 (285)	total: 29.4s	remaining: 9m 15s
600:	test: 0.7391424	best: 0.7394601 (575)	total: 58.8s	remaining: 8m 48s
900:	test: 0.7408775	best: 0.7408775 (900)	total: 1m 28s	remaining: 8m 19s
1200:	test: 0.7401667	best: 0.7410901 (1045)	total: 1m 57s	remaining: 7m 50s
1500:	test: 0.7396535	best: 0.7410901 (1045)	total: 2m 27s	remaining: 7m 21s
1800:	test: 0.7403391	best: 0.7410901 (1045)	total: 2m 56s	remaining: 6m 51s
2100:	test: 0.7395929	best: 0.7410901 (1045)	total: 3m 26s	remaining: 6m 23s
2400:	test: 0.7381794	best: 0.7410901 (1045)	total: 3m 56s	remaining: 5m 53s
2700:	test: 0.7368171	best: 0.7410901 (1045)	total: 4m 25s	remaining: 5m 24s
3000:	test: 0.7360839	best: 0.7410901 (1045)	total: 4m 55s	remaining: 4m 55s
3300:	test: 0.7359520	best: 0.7410901 (1045)	total: 5m 24s	remaining: 4m 25s
3600:	test: 0.7348905	best: 0.7410901 (1045)	total: 5m 54s	remaining: 3m 56s
3900:	test: 0

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6031942	best: 0.6031942 (0)	total: 143ms	remaining: 14m 16s
300:	test: 0.7661075	best: 0.7661075 (300)	total: 29.4s	remaining: 9m 16s
600:	test: 0.7717116	best: 0.7717116 (600)	total: 58.3s	remaining: 8m 43s
900:	test: 0.7744167	best: 0.7749066 (860)	total: 1m 27s	remaining: 8m 17s
1200:	test: 0.7761962	best: 0.7763113 (1165)	total: 1m 57s	remaining: 7m 49s
1500:	test: 0.7782362	best: 0.7783002 (1490)	total: 2m 27s	remaining: 7m 20s
1800:	test: 0.7776411	best: 0.7784834 (1515)	total: 2m 56s	remaining: 6m 52s
2100:	test: 0.7775435	best: 0.7784834 (1515)	total: 3m 26s	remaining: 6m 23s
2400:	test: 0.7777656	best: 0.7784834 (1515)	total: 3m 56s	remaining: 5m 53s
2700:	test: 0.7777546	best: 0.7784834 (1515)	total: 4m 25s	remaining: 5m 24s
3000:	test: 0.7779122	best: 0.7784834 (1515)	total: 4m 55s	remaining: 4m 55s
3300:	test: 0.7773581	best: 0.7784834 (1515)	total: 5m 25s	remaining: 4m 26s
3600:	test: 0.7772951	best: 0.7784834 (1515)	total: 5m 55s	remaining: 3m 56s
3900:	test: 0

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5775366	best: 0.5775366 (0)	total: 137ms	remaining: 13m 44s
300:	test: 0.7207067	best: 0.7207067 (300)	total: 29.6s	remaining: 9m 19s
600:	test: 0.7270485	best: 0.7273276 (595)	total: 59.1s	remaining: 8m 50s
900:	test: 0.7305225	best: 0.7305360 (895)	total: 1m 28s	remaining: 8m 22s
1200:	test: 0.7294677	best: 0.7311893 (985)	total: 1m 58s	remaining: 7m 52s
1500:	test: 0.7307900	best: 0.7311893 (985)	total: 2m 27s	remaining: 7m 23s
1800:	test: 0.7314157	best: 0.7319365 (1690)	total: 2m 57s	remaining: 6m 53s
2100:	test: 0.7321855	best: 0.7324328 (2085)	total: 3m 26s	remaining: 6m 23s
2400:	test: 0.7314194	best: 0.7324328 (2085)	total: 3m 56s	remaining: 5m 54s
2700:	test: 0.7308815	best: 0.7324328 (2085)	total: 4m 26s	remaining: 5m 25s
3000:	test: 0.7291616	best: 0.7324328 (2085)	total: 4m 56s	remaining: 4m 56s
3300:	test: 0.7287647	best: 0.7324328 (2085)	total: 5m 26s	remaining: 4m 26s
3600:	test: 0.7278693	best: 0.7324328 (2085)	total: 5m 56s	remaining: 3m 57s
3900:	test: 0.7

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6058731	best: 0.6058731 (0)	total: 136ms	remaining: 13m 37s
300:	test: 0.7232507	best: 0.7238935 (245)	total: 29.1s	remaining: 9m 10s
600:	test: 0.7256140	best: 0.7276329 (505)	total: 58.2s	remaining: 8m 42s
900:	test: 0.7268507	best: 0.7276329 (505)	total: 1m 27s	remaining: 8m 13s
1200:	test: 0.7281826	best: 0.7283993 (1175)	total: 1m 56s	remaining: 7m 46s
1500:	test: 0.7286766	best: 0.7290377 (1410)	total: 2m 25s	remaining: 7m 16s
1800:	test: 0.7297472	best: 0.7297472 (1800)	total: 2m 54s	remaining: 6m 47s
2100:	test: 0.7297457	best: 0.7302805 (1870)	total: 3m 23s	remaining: 6m 18s
2400:	test: 0.7291283	best: 0.7302805 (1870)	total: 3m 53s	remaining: 5m 49s
2700:	test: 0.7285143	best: 0.7302805 (1870)	total: 4m 22s	remaining: 5m 20s
3000:	test: 0.7279227	best: 0.7302805 (1870)	total: 4m 52s	remaining: 4m 52s
3300:	test: 0.7265729	best: 0.7302805 (1870)	total: 5m 22s	remaining: 4m 23s
3600:	test: 0.7259925	best: 0.7302805 (1870)	total: 5m 52s	remaining: 3m 54s
3900:	test: 0

In [18]:
class VotingModel(BaseEstimator, RegressorMixin):
    def __init__(self, estimators):
        super().__init__()
        self.estimators = estimators
        
    def fit(self, X, y=None):
        return self
    
    def predict(self, X):
        y_preds = [estimator.predict(X) for estimator in self.estimators]
        return np.mean(y_preds, axis=0)
    
    def predict_proba(self, X):
        
        y_preds = [estimator.predict_proba(X) for estimator in self.estimators[:5]]
        
        X[cat_cols] = X[cat_cols].astype("category")
        y_preds += [estimator.predict_proba(X) for estimator in self.estimators[5:]]
        
        return np.mean(y_preds, axis=0)

model = VotingModel(fitted_models_cat+fitted_models_lgb+fitted_models_xgb)

# Submision

In [19]:
df_test = df_test.drop(columns=["WEEK_NUM"])
df_test = df_test.set_index("case_id")


y_pred = pd.Series(model.predict_proba(df_test)[:, 1], index=df_test.index)
df_subm = pd.read_csv(ROOT / "sample_submission.csv")
df_subm = df_subm.set_index("case_id")

df_subm["score"] = y_pred
df_subm.to_csv("submission.csv")
df_subm

Unnamed: 0_level_0,score
case_id,Unnamed: 1_level_1
57543,0.02103
57549,0.037331
57551,0.020781
57552,0.0489
57569,0.056869
57630,0.027634
57631,0.052421
57632,0.048571
57633,0.034016
57634,0.058846
