## Baseline - Infer

https://www.kaggle.com/code/pxcai666/catboost-lightgbm-ensemble/notebook

### Configuration

In [1]:
import sys
from pathlib import Path
import subprocess
import os
import gc
from glob import glob
import pickle

import numpy as np
import pandas as pd
import polars as pl
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import TimeSeriesSplit, GroupKFold, StratifiedGroupKFold
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier, Pool
import lightgbm as lgb

from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import KNNImputer



In [2]:
ROOT = "/kaggle/input/home-credit-credit-risk-model-stability"

TRAIN_DIR = os.path.join(ROOT, "parquet_files", "train")
TEST_DIR = os.path.join(ROOT, "parquet_files", "test")

In [3]:
device = "gpu"

### Def utility classes

In [4]:
from src.utils.utility import Utility
from src.utils.schema_gen import SchemaGen
from src.utils.pipeline import Pipeline
from src.utils.aggregator import Aggregator

### Read train data

In [5]:
%%time

data_store: dict = {
    "df_base": SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_base.parquet")),
    "depth_0": [
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_static_cb_0.parquet")),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_static_0_*.parquet")),
    ],
    "depth_1": [
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_applprev_1_*.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_tax_registry_a_1.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_tax_registry_b_1.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_tax_registry_c_1.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_credit_bureau_a_1_*.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_credit_bureau_b_1.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_other_1.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_person_1.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_deposit_1.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_debitcard_1.parquet"), 1),
    ],
    "depth_2": [
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_credit_bureau_a_2_*.parquet"), 2),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_credit_bureau_b_2.parquet"), 2),
    ],
}

train_df: pl.LazyFrame = (
    SchemaGen.join_dataframes(**data_store)
    .pipe(Pipeline.filter_cols)
    .pipe(Pipeline.transform_cols)
    .pipe(Pipeline.handle_dates)
    .pipe(Utility.reduce_memory_usage, "train_df")
)

del data_store
gc.collect()

print(f"Train data shape: {train_df.shape}")
display(train_df.head(10))

File train_base loaded into memory.
File train_static_cb_0 loaded into memory.
File train_static_0_1 loaded into memory.
File train_static_0_0 loaded into memory.
File train_applprev_1_0 loaded into memory.
File train_applprev_1_1 loaded into memory.
File train_tax_registry_a_1 loaded into memory.
File train_tax_registry_b_1 loaded into memory.
File train_tax_registry_c_1 loaded into memory.
File train_credit_bureau_a_1_3 loaded into memory.
File train_credit_bureau_a_1_1 loaded into memory.
File train_credit_bureau_a_1_0 loaded into memory.
File train_credit_bureau_a_1_2 loaded into memory.
File train_credit_bureau_b_1 loaded into memory.
File train_other_1 loaded into memory.
File train_person_1 loaded into memory.
File train_deposit_1 loaded into memory.
File train_debitcard_1 loaded into memory.
File train_credit_bureau_a_2_8 loaded into memory.
File train_credit_bureau_a_2_7 loaded into memory.
File train_credit_bureau_a_2_5 loaded into memory.
File train_credit_bureau_a_2_0 loade

case_id,month,week_num,target,assignmentdate_238D,assignmentdate_4527235D,birthdate_574D,contractssum_5085716L,dateofbirth_337D,days120_123L,days180_256L,days30_165L,days360_512L,days90_310L,description_5085714M,education_1103M,education_88M,firstquarter_103L,fourthquarter_440L,maritalst_385M,maritalst_893M,numberofqueries_373L,pmtaverage_3A,pmtaverage_4527227A,pmtcount_4527229L,pmtcount_693L,pmtscount_423L,pmtssum_45A,requesttype_4525192L,responsedate_1012D,responsedate_4527233D,responsedate_4917613D,secondquarter_766L,thirdquarter_1082L,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,…,mean_mainoccupationinc_384A,max_amount_416A,max_num_group1_10,max_openingdate_313D,mean_amount_416A,mean_openingdate_313D,max_num_group1_11,max_openingdate_857D,mean_openingdate_857D,max_collater_typofvalofguarant_298M,max_collater_typofvalofguarant_407M,max_collater_valueofguarantee_1124L,max_collater_valueofguarantee_876L,max_collaterals_typeofguarante_359M,max_collaterals_typeofguarante_669M,max_num_group1_12,max_num_group2,max_pmts_dpd_1073P,max_pmts_dpd_303P,max_pmts_month_158T,max_pmts_month_706T,max_pmts_overdue_1140A,max_pmts_overdue_1152A,max_pmts_year_1139T,max_pmts_year_507T,max_subjectroles_name_541M,max_subjectroles_name_838M,mean_pmts_dpd_1073P,mean_pmts_dpd_303P,mean_pmts_overdue_1140A,mean_pmts_overdue_1152A,var_pmts_dpd_1073P,var_pmts_dpd_303P,var_pmts_overdue_1140A,var_pmts_overdue_1152A,year,day
u32,u32,u8,u8,i16,u8,i16,f32,i32,f32,f32,f32,f32,f32,str,str,str,f32,f32,str,str,f32,f32,f32,f32,f32,f32,f32,str,i8,u8,i8,f32,f32,f32,f32,f32,…,f32,f32,u8,i16,f32,i16,u8,i16,i16,str,str,f32,f32,str,str,u16,u8,f32,f32,f32,f32,f32,f32,f32,f32,str,str,f32,f32,f32,f32,f32,f32,f32,f32,u16,u8
1622258,201911,44,0,,,,,-14345,0.0,0.0,0.0,1.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",2.0,0.0,"""a55475b1""","""a55475b1""",1.0,,,,,,,"""DEDUCTION_6""",,14.0,,1.0,0.0,0.0,12582.0,1671.400024,…,40000.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,144300.0,"""c7a5ad39""","""c7a5ad39""",11,35,32.0,27.0,12.0,12.0,771.247986,6923.57959,2020.0,2019.0,"""ab3c25cf""","""ab3c25cf""",3.216216,0.314815,78.443245,127.621788,68.45195,5.049345,41972.652344,631622.375,2019,10
1792392,202002,59,0,,,,,-9881,3.0,5.0,0.0,10.0,1.0,"""a55475b1""","""a55475b1""","""a55475b1""",5.0,5.0,"""a7fcb6e5""","""a55475b1""",10.0,,,,,,,"""DEDUCTION_6""",,,,5.0,4.0,0.0,43298.816406,2430.800049,…,30000.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,0.0,"""c7a5ad39""","""c7a5ad39""",2,35,1.0,0.0,12.0,12.0,1868.0,0.0,2021.0,2020.0,"""ab3c25cf""","""ab3c25cf""",0.021739,0.0,40.608696,0.0,0.021739,0.0,75857.046875,0.0,2020,21
1803542,202003,60,0,,,,,-13394,0.0,0.0,0.0,1.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",0.0,0.0,"""3439d993""","""a55475b1""",1.0,,,,,,,"""DEDUCTION_6""",,14.0,,4.0,1.0,0.0,107923.804688,7691.200195,…,60000.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,2066000.0,"""c7a5ad39""","""c7a5ad39""",8,35,27.0,23.0,12.0,12.0,9277.0,5932.399902,2021.0,2019.0,"""ab3c25cf""","""ab3c25cf""",0.980392,0.808333,589.07843,335.808411,17.059608,12.37472,4229014.0,1830200.0,2020,2
903928,201912,50,0,,,,,-10092,6.0,11.0,1.0,20.0,3.0,"""a55475b1""","""a55475b1""","""a55475b1""",5.0,2.0,"""a7fcb6e5""","""a55475b1""",20.0,,,,,,,"""DEDUCTION_6""",,14.0,,5.0,11.0,0.0,,7632.800293,…,200000.0,,,,,,,,,"""a55475b1""","""a55475b1""",11401000.0,0.0,"""c7a5ad39""","""c7a5ad39""",4,35,0.0,872.0,12.0,12.0,0.0,68764.125,2020.0,2020.0,"""ab3c25cf""","""ab3c25cf""",0.0,225.432831,0.0,17660.828125,0.0,95010.429688,0.0,602728192.0,2019,18
1296781,201903,8,0,,,-16860.0,,-16860,3.0,7.0,0.0,9.0,2.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",4.0,6.0,"""3439d993""","""a55475b1""",9.0,,,,,6.0,15091.400391,,14.0,,,2.0,2.0,0.0,,3376.199951,…,59400.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,,"""a55475b1""","""c7a5ad39""",1,23,29.0,,12.0,,12270.799805,,2020.0,,"""a55475b1""","""ab3c25cf""",5.588235,,5073.619629,,69.882355,,20693872.0,,2019,1
1262572,201901,3,0,,,-12381.0,,-12381,1.0,2.0,0.0,4.0,1.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",2.0,1.0,"""3439d993""","""a55475b1""",4.0,,,,,6.0,8516.666016,,14.0,,,3.0,4.0,0.0,,4928.800293,…,40000.0,,,,,,,,,"""a55475b1""","""a55475b1""",5129000.0,,"""a55475b1""","""c7a5ad39""",2,23,0.0,,12.0,,0.0,,2020.0,,"""a55475b1""","""daf49a8a""",0.0,,0.0,,0.0,,0.0,,2019,23
135943,201906,23,0,,,-15081.0,,-15081,0.0,1.0,0.0,1.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",0.0,1.0,"""a55475b1""","""a55475b1""",1.0,,,,,6.0,14677.970703,,14.0,,,0.0,0.0,0.0,12916.400391,600.0,…,14000.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,,"""a55475b1""","""c7a5ad39""",1,35,0.0,,12.0,,0.0,,2020.0,,"""a55475b1""","""ab3c25cf""",0.0,,0.0,,0.0,,0.0,,2019,15
1287742,201902,7,0,,,-9851.0,,-9851,7.0,8.0,1.0,13.0,4.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",7.0,7.0,"""a7fcb6e5""","""a55475b1""",13.0,,,,,12.0,58805.0,,14.0,,,5.0,5.0,0.0,,7521.200195,…,200000.0,,,,,,,,,"""a55475b1""","""a55475b1""",28642782.0,,"""a55475b1""","""c7a5ad39""",1,35,1.0,,12.0,,45323.089844,,2020.0,,"""a55475b1""","""ab3c25cf""",0.08,,3623.362305,,0.076667,,157271616.0,,2019,19
1428642,201907,26,0,,,-21123.0,,-21123,1.0,1.0,1.0,3.0,1.0,"""a55475b1""","""a55475b1""","""a55475b1""",0.0,1.0,"""a55475b1""","""a55475b1""",3.0,,,,,8.0,7918.600098,,14.0,,,1.0,4.0,0.0,0.0,10091.200195,…,56000.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,90000.0,"""c7a5ad39""","""c7a5ad39""",6,35,0.0,0.0,12.0,12.0,7762.484375,0.0,2020.0,2017.0,"""ab3c25cf""","""ab3c25cf""",0.0,0.0,323.436859,0.0,0.0,0.0,2510673.5,0.0,2019,2
219563,202003,63,0,,,,,-13444,8.0,8.0,1.0,10.0,8.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",9.0,0.0,"""3439d993""","""a55475b1""",10.0,,,,,,,"""DEDUCTION_6""",,14.0,,1.0,6.0,0.0,10890.570312,5726.399902,…,70000.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,0.0,"""c7a5ad39""","""c7a5ad39""",6,35,33.0,64.0,12.0,12.0,13533.0,13927.200195,2021.0,2019.0,"""ab3c25cf""","""ab3c25cf""",1.354167,6.690266,994.868469,1466.391479,37.04211,227.983566,10297029.0,9969545.0,2020,22


CPU times: user 2min 48s, sys: 34.2 s, total: 3min 22s
Wall time: 18.2 s


### Read test data

In [6]:
data_store = {
    "df_base": SchemaGen.scan_files(os.path.join(TEST_DIR, "test_base.parquet")),
    "depth_0": [
        SchemaGen.scan_files(os.path.join(TEST_DIR, "test_static_cb_0.parquet")),
        SchemaGen.scan_files(os.path.join(TEST_DIR, "test_static_0_*.parquet")),
    ],
    "depth_1": [
        SchemaGen.scan_files(os.path.join(TEST_DIR, "test_applprev_1_*.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TEST_DIR, "test_tax_registry_a_1.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TEST_DIR, "test_tax_registry_b_1.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TEST_DIR, "test_tax_registry_c_1.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TEST_DIR, "test_credit_bureau_a_1_*.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TEST_DIR, "test_credit_bureau_b_1.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TEST_DIR, "test_other_1.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TEST_DIR, "test_person_1.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TEST_DIR, "test_deposit_1.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TEST_DIR, "test_debitcard_1.parquet"), 1),
    ],
    "depth_2": [
        SchemaGen.scan_files(os.path.join(TEST_DIR, "test_credit_bureau_b_2.parquet"), 2),
        SchemaGen.scan_files(os.path.join(TEST_DIR, "test_credit_bureau_a_2_*.parquet"), 2),
        # SchemaGen.scan_files(os.path.join(TEST_DIR, "test_applprev_2.parquet"), 2),
        # SchemaGen.scan_files(os.path.join(TEST_DIR, "test_person_2.parquet"), 2)
    ],
}

test_df: pl.DataFrame = (
    SchemaGen.join_dataframes(**data_store)
    .pipe(Pipeline.transform_cols)
    .pipe(Pipeline.handle_dates)
    .select([col for col in train_df.columns if col != "target"])
    .pipe(Utility.reduce_memory_usage, "test_df")
)

del data_store
gc.collect()

print(f"Test data shape: {test_df.shape}")

File test_base loaded into memory.
File test_static_cb_0 loaded into memory.
File test_static_0_1 loaded into memory.
File test_static_0_2 loaded into memory.
File test_static_0_0 loaded into memory.
File test_applprev_1_0 loaded into memory.
File test_applprev_1_1 loaded into memory.
File test_applprev_1_2 loaded into memory.
File test_tax_registry_a_1 loaded into memory.
File test_tax_registry_b_1 loaded into memory.
File test_tax_registry_c_1 loaded into memory.
File test_credit_bureau_a_1_3 loaded into memory.
File test_credit_bureau_a_1_2 loaded into memory.
File test_credit_bureau_a_1_0 loaded into memory.
File test_credit_bureau_a_1_4 loaded into memory.
File test_credit_bureau_a_1_1 loaded into memory.
File test_credit_bureau_b_1 loaded into memory.
File test_other_1 loaded into memory.
File test_person_1 loaded into memory.
File test_deposit_1 loaded into memory.
File test_debitcard_1 loaded into memory.
File test_credit_bureau_b_2 loaded into memory.
File test_credit_bureau_a

In [7]:
train_df, cat_cols = Utility.to_pandas(train_df)
test_df, cat_cols = Utility.to_pandas(test_df, cat_cols)

### Inference

In [8]:
with open('voting_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

In [9]:
test_df = test_df.drop(columns=["week_num"]).set_index("case_id")
test_df[cat_cols] = test_df[cat_cols].astype("category")

preds = pd.Series(loaded_model.predict_proba(test_df)[:, 1], index=test_df.index)

submission_df = pd.DataFrame(index=test_df.index)
submission_df["score"] = preds

submission_df

Unnamed: 0_level_0,score
case_id,Unnamed: 1_level_1
57543,0.298113
57630,0.035641
57631,0.057045
57633,0.045478
57634,0.094021
57552,0.043232
57549,0.136894
57551,0.099039
57569,0.033847
57632,0.035788


In [10]:
submission_df.to_csv("submission.csv")