## Baseline - Infer

※ Run this only in a Kaggle notebook.

https://www.kaggle.com/code/pxcai666/catboost-lightgbm-ensemble/notebook

### Configuration

In [1]:
import sys
from pathlib import Path
import subprocess
import os
import gc
from glob import glob
import pickle

import numpy as np
import pandas as pd
import polars as pl
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import TimeSeriesSplit, GroupKFold, StratifiedGroupKFold
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier, Pool
import lightgbm as lgb

from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import KNNImputer

In [2]:
# Kaggle: extend the import path with the attached repository dataset
# (presumably the location of the `src` package imported further down — confirm).
REPO_DIR = '/kaggle/input/home-credit-crms-repo'
if REPO_DIR not in sys.path:  # guard keeps the cell idempotent when it is re-run
    sys.path.append(REPO_DIR)

In [3]:
# Root of the competition data and the per-split parquet directories below it.
ROOT = "/kaggle/input/home-credit-credit-risk-model-stability"

TRAIN_DIR, TEST_DIR = (
    os.path.join(ROOT, "parquet_files", split_name)
    for split_name in ("train", "test")
)

### Import utility classes

In [4]:
from src.utils.utility import Utility
from src.utils.schema_gen import SchemaGen
from src.utils.pipeline import Pipeline
from src.utils.aggregator import Aggregator

### Read train data

In [5]:
%%time

# Register every training parquet source under the keyword names that
# `SchemaGen.join_dataframes` receives (`df_base`, `depth_0`, `depth_1`,
# `depth_2`). Patterns containing `*` are glob-style names for tables that are
# split across several files.
# NOTE(review): the second positional argument of `scan_files` (1 or 2) mirrors
# the depth key; presumably it selects depth-specific handling — confirm in
# `src.utils.schema_gen`.
data_store: dict = {
    "df_base": SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_base.parquet")),
    "depth_0": [
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, file_pattern))
        for file_pattern in (
            "train_static_cb_0.parquet",
            "train_static_0_*.parquet",
        )
    ],
    "depth_1": [
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, file_pattern), 1)
        for file_pattern in (
            "train_applprev_1_*.parquet",
            "train_tax_registry_a_1.parquet",
            "train_tax_registry_b_1.parquet",
            "train_tax_registry_c_1.parquet",
            "train_credit_bureau_a_1_*.parquet",
            "train_credit_bureau_b_1.parquet",
            "train_other_1.parquet",
            "train_person_1.parquet",
            "train_deposit_1.parquet",
            "train_debitcard_1.parquet",
        )
    ],
    "depth_2": [
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, file_pattern), 2)
        for file_pattern in (
            "train_credit_bureau_a_2_*.parquet",
            "train_credit_bureau_b_2.parquet",
        )
    ],
}

# Join all sources, then run the column filter, column transform, date handling
# and memory-reduction steps in that order.
# The result is used eagerly below (`.shape`, `.head(10)`), so it is annotated as
# `pl.DataFrame`, matching the annotation used for the test frame.
train_df: pl.DataFrame = (
    SchemaGen.join_dataframes(**data_store)
    .pipe(Pipeline.filter_cols)
    .pipe(Pipeline.transform_cols)
    .pipe(Pipeline.handle_dates)
    .pipe(Utility.reduce_memory_usage, "train_df")
)

# The individual source frames are no longer needed once joined.
del data_store
gc.collect()

print(f"Train data shape: {train_df.shape}")
display(train_df.head(10))

File train_base loaded into memory.
File train_static_cb_0 loaded into memory.
File train_static_0_0 loaded into memory.
File train_static_0_1 loaded into memory.
File train_applprev_1_1 loaded into memory.
File train_applprev_1_0 loaded into memory.
File train_tax_registry_a_1 loaded into memory.
File train_tax_registry_b_1 loaded into memory.
File train_tax_registry_c_1 loaded into memory.
File train_credit_bureau_a_1_3 loaded into memory.
File train_credit_bureau_a_1_2 loaded into memory.
File train_credit_bureau_a_1_0 loaded into memory.
File train_credit_bureau_a_1_1 loaded into memory.
File train_credit_bureau_b_1 loaded into memory.
File train_other_1 loaded into memory.
File train_person_1 loaded into memory.
File train_deposit_1 loaded into memory.
File train_debitcard_1 loaded into memory.
File train_credit_bureau_a_2_6 loaded into memory.
File train_credit_bureau_a_2_1 loaded into memory.
File train_credit_bureau_a_2_0 loaded into memory.
File train_credit_bureau_a_2_7 loade

case_id,month,week_num,target,assignmentdate_238D,assignmentdate_4527235D,birthdate_574D,contractssum_5085716L,dateofbirth_337D,days120_123L,days180_256L,days30_165L,days360_512L,days90_310L,description_5085714M,education_1103M,education_88M,firstquarter_103L,fourthquarter_440L,maritalst_385M,maritalst_893M,numberofqueries_373L,pmtaverage_3A,pmtaverage_4527227A,pmtcount_4527229L,pmtcount_693L,pmtscount_423L,pmtssum_45A,requesttype_4525192L,responsedate_1012D,responsedate_4527233D,responsedate_4917613D,secondquarter_766L,thirdquarter_1082L,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,…,mean_mainoccupationinc_384A,max_amount_416A,max_num_group1_10,max_openingdate_313D,mean_amount_416A,mean_openingdate_313D,max_num_group1_11,max_openingdate_857D,mean_openingdate_857D,max_collater_typofvalofguarant_298M,max_collater_typofvalofguarant_407M,max_collater_valueofguarantee_1124L,max_collater_valueofguarantee_876L,max_collaterals_typeofguarante_359M,max_collaterals_typeofguarante_669M,max_num_group1_12,max_num_group2,max_pmts_dpd_1073P,max_pmts_dpd_303P,max_pmts_month_158T,max_pmts_month_706T,max_pmts_overdue_1140A,max_pmts_overdue_1152A,max_pmts_year_1139T,max_pmts_year_507T,max_subjectroles_name_541M,max_subjectroles_name_838M,mean_pmts_dpd_1073P,mean_pmts_dpd_303P,mean_pmts_overdue_1140A,mean_pmts_overdue_1152A,var_pmts_dpd_1073P,var_pmts_dpd_303P,var_pmts_overdue_1140A,var_pmts_overdue_1152A,year,day
u32,u32,u8,u8,i16,u8,i16,f32,i32,f32,f32,f32,f32,f32,str,str,str,f32,f32,str,str,f32,f32,f32,f32,f32,f32,f32,str,i8,u8,i8,f32,f32,f32,f32,f32,…,f32,f32,u8,i16,f32,i16,u8,i16,i16,str,str,f32,f32,str,str,u16,u8,f32,f32,f32,f32,f32,f32,f32,f32,str,str,f32,f32,f32,f32,f32,f32,f32,f32,u16,u8
996492,202006,77,0,,,,58543.890625,-16675.0,3.0,10.0,0.0,16.0,2.0,"""2fc785b2""","""6b2ae0fa""","""a55475b1""",9.0,8.0,"""3439d993""","""a55475b1""",16.0,,,,,,,,,,14.0,7.0,10.0,0.0,,7229.600098,…,120000.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,220000.0,"""c7a5ad39""","""c7a5ad39""",9.0,35.0,2.0,773.0,12.0,12.0,2501.083984,73150.75,2021.0,2021.0,"""ab3c25cf""","""ab3c25cf""",0.046512,189.512192,58.164745,13512.788086,0.093023,48364.976562,145474.90625,449108448.0,2020,27
657960,201903,11,0,,,-17707.0,,-17707.0,3.0,3.0,0.0,3.0,3.0,"""a55475b1""","""a55475b1""","""a55475b1""",3.0,0.0,"""a55475b1""","""a55475b1""",3.0,,,,,10.0,5660.0,,14.0,,,0.0,0.0,,,3728.0,…,52000.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,,"""a55475b1""","""c7a5ad39""",0.0,35.0,0.0,,12.0,,0.0,,2020.0,,"""a55475b1""","""ab3c25cf""",0.0,,0.0,,0.0,,0.0,,2019,25
221606,202004,68,0,,,,,-9644.0,4.0,4.0,3.0,6.0,3.0,"""a55475b1""","""a55475b1""","""a55475b1""",3.0,4.0,"""a7fcb6e5""","""a55475b1""",6.0,,,,,,,,,,13.0,5.0,1.0,0.0,41887.402344,1771.400024,…,30000.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,0.0,"""c7a5ad39""","""c7a5ad39""",4.0,35.0,0.0,619.0,12.0,12.0,0.0,92360.335938,2021.0,2021.0,"""ab3c25cf""","""ab3c25cf""",0.0,140.888885,0.0,29128.238281,0.0,41156.136719,0.0,1490900000.0,2020,27
104033,201901,1,0,-550.0,,-23568.0,,-23568.0,1.0,1.0,0.0,4.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",2.0,2.0,"""a55475b1""","""a55475b1""",4.0,6827.766602,,,6.0,,,,14.0,,,1.0,2.0,0.0,,2589.199951,…,40000.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,,"""a55475b1""","""c7a5ad39""",1.0,23.0,22.0,,12.0,,2051.852051,,2019.0,,"""a55475b1""","""ab3c25cf""",3.166667,,228.218109,,54.14706,,440159.28125,,2019,9
1840794,202005,71,0,,,,,-11579.0,1.0,2.0,0.0,5.0,1.0,"""a55475b1""","""a55475b1""","""a55475b1""",4.0,8.0,"""a55475b1""","""a55475b1""",5.0,,,,,,,,,,14.0,6.0,5.0,0.0,0.0,4812.0,…,30000.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,0.0,"""c7a5ad39""","""c7a5ad39""",0.0,35.0,0.0,0.0,12.0,12.0,0.0,0.0,2021.0,2018.0,"""ab3c25cf""","""ab3c25cf""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2020,15
716413,201906,23,0,,,-16420.0,,-16420.0,0.0,0.0,0.0,5.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",3.0,2.0,"""a55475b1""","""a55475b1""",5.0,,,,,10.0,8432.400391,,14.0,,,2.0,5.0,,,1314.400024,…,30000.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,,"""a55475b1""","""c7a5ad39""",0.0,35.0,0.0,,12.0,,0.0,,2020.0,,"""a55475b1""","""ab3c25cf""",0.0,,0.0,,0.0,,0.0,,2019,15
634785,201902,6,0,,,-17944.0,,-17944.0,3.0,3.0,1.0,3.0,3.0,"""a55475b1""","""a55475b1""","""a55475b1""",1.0,2.0,"""a55475b1""","""a55475b1""",3.0,,,,,1.0,20.0,,14.0,,,0.0,0.0,,,3205.400146,…,24000.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,,"""a55475b1""","""c7a5ad39""",0.0,23.0,1.0,,12.0,,2812.332031,,2020.0,,"""a55475b1""","""ab3c25cf""",0.1,,281.233185,,0.1,,790921.125,,2019,17
1423432,201906,25,0,,,-13176.0,,-13176.0,1.0,2.0,0.0,6.0,1.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",5.0,4.0,"""38c061ee""","""a55475b1""",6.0,,,,,10.0,14081.600586,,14.0,,,3.0,5.0,0.0,137179.828125,7569.0,…,54000.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,0.0,"""c7a5ad39""","""c7a5ad39""",10.0,35.0,7.0,321.0,12.0,12.0,10848.306641,51535.964844,2020.0,2020.0,"""ab3c25cf""","""ab3c25cf""",0.590909,13.94375,964.147522,3524.454102,2.919914,3008.065918,9250528.0,157700144.0,2019,28
1649474,201911,47,0,,14.0,,,-25227.0,0.0,0.0,0.0,0.0,0.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",0.0,2.0,"""3439d993""","""a55475b1""",0.0,,8549.600586,6.0,,,,"""PENSION_6""",,14.0,,0.0,1.0,0.0,90345.382812,2000.0,…,44000.0,,,,,,,,,"""a55475b1""","""a55475b1""",,0.0,"""c7a5ad39""","""a55475b1""",7.0,35.0,,50.0,,12.0,,3935.199951,,2020.0,"""ab3c25cf""","""a55475b1""",,1.571429,,140.666138,,51.416466,,496101.09375,2019,26
670509,201904,14,0,,,-10117.0,,,,,,,,"""a55475b1""","""a55475b1""","""a55475b1""",,,"""a55475b1""","""a55475b1""",,,,,,6.0,2244.803955,,14.0,,,,,,,827.400024,…,32000.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2019,13


CPU times: user 5min 46s, sys: 1min 43s, total: 7min 29s
Wall time: 2min 16s


In [6]:
# Convert the training frame to pandas. `Utility.to_pandas` also returns
# `cat_cols`, which is later passed back in for the test frame and used to cast
# the model inputs to the "category" dtype.
train_df, cat_cols = Utility.to_pandas(train_df)
# Only the column names are kept: they drive the column selection of the test frame.
cols = train_df.columns

# The training frame itself is not used again in this notebook; drop it and collect.
del train_df
gc.collect()

0

### Read test data

In [7]:
# Test-split counterpart of the training sources: same tables, same grouping by
# depth key, read from TEST_DIR. `test_applprev_2` and `test_person_2` are not
# loaded, in line with the training cell.
data_store = {
    "df_base": SchemaGen.scan_files(os.path.join(TEST_DIR, "test_base.parquet")),
    "depth_0": [
        SchemaGen.scan_files(os.path.join(TEST_DIR, file_pattern))
        for file_pattern in (
            "test_static_cb_0.parquet",
            "test_static_0_*.parquet",
        )
    ],
    "depth_1": [
        SchemaGen.scan_files(os.path.join(TEST_DIR, file_pattern), 1)
        for file_pattern in (
            "test_applprev_1_*.parquet",
            "test_tax_registry_a_1.parquet",
            "test_tax_registry_b_1.parquet",
            "test_tax_registry_c_1.parquet",
            "test_credit_bureau_a_1_*.parquet",
            "test_credit_bureau_b_1.parquet",
            "test_other_1.parquet",
            "test_person_1.parquet",
            "test_deposit_1.parquet",
            "test_debitcard_1.parquet",
        )
    ],
    "depth_2": [
        SchemaGen.scan_files(os.path.join(TEST_DIR, file_pattern), 2)
        for file_pattern in (
            "test_credit_bureau_a_2_*.parquet",
            "test_credit_bureau_b_2.parquet",
        )
    ],
}

# No `Pipeline.filter_cols` step here: the frame is instead restricted to the
# columns recorded from the training frame (`cols`), minus `target`.
test_df: pl.DataFrame = (
    SchemaGen.join_dataframes(**data_store)
    .pipe(Pipeline.transform_cols)
    .pipe(Pipeline.handle_dates)
    .select([col_name for col_name in cols if col_name != "target"])
    .pipe(Utility.reduce_memory_usage, "test_df")
)

# The individual source frames are no longer needed once joined.
del data_store
gc.collect()

print(f"Test data shape: {test_df.shape}")

File test_base loaded into memory.
File test_static_cb_0 loaded into memory.
File test_static_0_0 loaded into memory.
File test_static_0_2 loaded into memory.
File test_static_0_1 loaded into memory.
File test_applprev_1_2 loaded into memory.
File test_applprev_1_0 loaded into memory.
File test_applprev_1_1 loaded into memory.
File test_tax_registry_a_1 loaded into memory.
File test_tax_registry_b_1 loaded into memory.
File test_tax_registry_c_1 loaded into memory.
File test_credit_bureau_a_1_3 loaded into memory.
File test_credit_bureau_a_1_2 loaded into memory.
File test_credit_bureau_a_1_1 loaded into memory.
File test_credit_bureau_a_1_4 loaded into memory.
File test_credit_bureau_a_1_0 loaded into memory.
File test_credit_bureau_b_1 loaded into memory.
File test_other_1 loaded into memory.
File test_person_1 loaded into memory.
File test_deposit_1 loaded into memory.
File test_debitcard_1 loaded into memory.
File test_credit_bureau_a_2_3 loaded into memory.
File test_credit_bureau

In [8]:
# Convert the test frame to pandas, passing in the `cat_cols` obtained from the
# training frame (presumably so both splits share the same categorical columns —
# confirm in `Utility.to_pandas`).
test_df, cat_cols = Utility.to_pandas(test_df, cat_cols)

In [9]:
# Preview only the first rows rather than rendering the whole frame, in line
# with the `head(10)` preview used for the training data.
test_df.head(10)

Unnamed: 0,case_id,month,week_num,assignmentdate_238D,assignmentdate_4527235D,birthdate_574D,contractssum_5085716L,dateofbirth_337D,days120_123L,days180_256L,...,mean_pmts_dpd_1073P,mean_pmts_dpd_303P,mean_pmts_overdue_1140A,mean_pmts_overdue_1152A,var_pmts_dpd_1073P,var_pmts_dpd_303P,var_pmts_overdue_1140A,var_pmts_overdue_1152A,year,day
0,57630,202201,100,,,,499975.0,-19767,1.0,2.0,...,0.0,0.0,0.0,0.0,,,,,2021,16
1,57632,202201,100,,,,17677.0,-23107,1.0,2.0,...,0.0,,0.0,,0.0,,0.0,,2022,5
2,57633,202201,100,,,,6373008.0,-10496,3.0,3.0,...,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,2022,25
3,57549,202201,100,,,,1563078.0,-22723,6.0,9.0,...,,,,,,,,,2022,17
4,57552,202201,100,,,,747031.8,-23768,2.0,2.0,...,,,,,,,,,2020,27
5,57634,202201,100,,,,15263.65,-16281,2.0,2.0,...,,,,,,,,,2021,27
6,57631,202201,100,,,,480334.5,-12999,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022,4
7,57551,202201,100,,,,2926195.0,-14090,1.0,3.0,...,,,,,,,,,2020,27
8,57569,202201,100,,,,,-26408,4.0,4.0,...,,2328.571533,,33346.402344,,3341.619141,,0.0,2021,20
9,57543,202201,100,,,,151364.0,-14804,2.0,4.0,...,0.0,,0.0,,,,,,2021,14


### Inference

In [10]:
# Load the serialized model used for prediction in the next cell.
# NOTE(security): unpickling can execute arbitrary code — only load files from a
# trusted source.
MODEL_PATH = '/kaggle/input/home-credit-crms-models/voting_model.pkl'
with open(MODEL_PATH, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [11]:
# Start from the sample submission, indexed by case_id so that scores can be
# attached by label rather than by row position.
submission_df: pd.DataFrame = pd.read_csv(
    os.path.join(ROOT, "sample_submission.csv")
).set_index("case_id")

# Model input: the test frame without `week_num`, keyed by case_id, with the
# categorical columns cast to the pandas "category" dtype.
X_test = test_df.drop(columns=["week_num"]).set_index("case_id")
X_test[cat_cols] = X_test[cat_cols].astype("category")

# Column 1 of `predict_proba` is used as the score (presumably the positive
# class — confirm against the model's class order).
positive_proba = loaded_model.predict_proba(X_test)[:, 1]
y_pred = pd.Series(positive_proba, index=X_test.index)

# `assign` aligns the Series on the case_id index; rows are then ordered by case_id.
submission_df = submission_df.assign(score=y_pred).sort_index()

# Metric hack (disabled), from
# https://www.kaggle.com/code/a520hh/fork-of-this-is-the-way/notebook
# It would lower every score below 0.98 by 0.073 and clip the result at 0.

submission_df

Unnamed: 0_level_0,score
case_id,Unnamed: 1_level_1
57543,0.008495
57549,0.05609
57551,0.002271
57552,0.020815
57569,0.170604
57630,0.007312
57631,0.024206
57632,0.006176
57633,0.037787
57634,0.022884


In [12]:
# Write the submission file to the working directory; `case_id` is the index,
# so it is emitted as the first column.
submission_df.to_csv("submission.csv")