## Baseline - Infer

※ Kaggle notebookのみで動かす

https://www.kaggle.com/code/pxcai666/catboost-lightgbm-ensemble/notebook

### Configuration

In [1]:
import sys
from pathlib import Path
import subprocess
import os
import gc
from glob import glob
import pickle

import numpy as np
import pandas as pd
import polars as pl
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import TimeSeriesSplit, GroupKFold, StratifiedGroupKFold
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier, Pool
import lightgbm as lgb

from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import KNNImputer

In [2]:
# Kaggle: put the attached project repo on the import path so that the
# `src.*` imports used by this notebook resolve.
# Guarded so that re-running this cell does not append the same entry twice.
REPO_DIR = '/kaggle/input/home-credit-crms-repo'
if REPO_DIR not in sys.path:
    sys.path.append(REPO_DIR)

In [3]:
# Root of the competition dataset as mounted inside a Kaggle notebook.
ROOT = "/kaggle/input/home-credit-credit-risk-model-stability"

# Each split lives under <ROOT>/parquet_files/<split>.
TRAIN_DIR, TEST_DIR = (
    os.path.join(ROOT, "parquet_files", split) for split in ("train", "test")
)

### Import utility classes

In [4]:
from src.utils.utility import Utility
from src.utils.schema_gen import SchemaGen
from src.utils.pipeline import Pipeline
from src.utils.aggregator import Aggregator

### Read train data

In [5]:
%%time

def _scan_train(pattern: str, *depth: int):
    """Scan the train parquet file(s) matching ``pattern`` under TRAIN_DIR.

    ``depth`` (if given) is forwarded unchanged as the second positional
    argument of ``SchemaGen.scan_files``; depth-0 tables pass nothing.
    """
    return SchemaGen.scan_files(os.path.join(TRAIN_DIR, pattern), *depth)


# Source tables grouped by depth and handed to SchemaGen.join_dataframes as
# keyword arguments.
# NOTE(review): the joined frame contains columns such as max_num_group1_10 /
# _11 / _12, which line up with each table's position in depth_0 + depth_1 +
# depth_2. The suffixes look positional, so keep this order in sync with the
# test cell - confirm in SchemaGen.join_dataframes.
data_store: dict = {
    "df_base": _scan_train("train_base.parquet"),
    "depth_0": [
        _scan_train("train_static_cb_0.parquet"),
        _scan_train("train_static_0_*.parquet"),
    ],
    "depth_1": [
        _scan_train("train_applprev_1_*.parquet", 1),
        _scan_train("train_tax_registry_a_1.parquet", 1),
        _scan_train("train_tax_registry_b_1.parquet", 1),
        _scan_train("train_tax_registry_c_1.parquet", 1),
        _scan_train("train_credit_bureau_a_1_*.parquet", 1),
        _scan_train("train_credit_bureau_b_1.parquet", 1),
        _scan_train("train_other_1.parquet", 1),
        _scan_train("train_person_1.parquet", 1),
        _scan_train("train_deposit_1.parquet", 1),
        _scan_train("train_debitcard_1.parquet", 1),
    ],
    "depth_2": [
        _scan_train("train_credit_bureau_a_2_*.parquet", 2),
        _scan_train("train_credit_bureau_b_2.parquet", 2),
    ],
}

# Join all tables, then filter / transform columns and shrink dtypes.
# The result is a materialised frame (its .shape and .head() are used right
# below), so it is annotated as pl.DataFrame rather than pl.LazyFrame.
# Pipeline.filter_cols is applied to the train frame only; the test cell
# reuses the surviving train column list instead.
train_df: pl.DataFrame = (
    SchemaGen.join_dataframes(**data_store)
    .pipe(Pipeline.filter_cols)
    .pipe(Pipeline.transform_cols)
    .pipe(Pipeline.handle_dates)
    .pipe(Utility.reduce_memory_usage, "train_df")
)

# The per-table frames are no longer needed once joined.
del data_store
gc.collect()

print(f"Train data shape: {train_df.shape}")
display(train_df.head(10))

File train_base loaded into memory.
File train_static_cb_0 loaded into memory.
File train_static_0_0 loaded into memory.
File train_static_0_1 loaded into memory.
File train_applprev_1_1 loaded into memory.
File train_applprev_1_0 loaded into memory.
File train_tax_registry_a_1 loaded into memory.
File train_tax_registry_b_1 loaded into memory.
File train_tax_registry_c_1 loaded into memory.
File train_credit_bureau_a_1_3 loaded into memory.
File train_credit_bureau_a_1_2 loaded into memory.
File train_credit_bureau_a_1_0 loaded into memory.
File train_credit_bureau_a_1_1 loaded into memory.
File train_credit_bureau_b_1 loaded into memory.
File train_other_1 loaded into memory.
File train_person_1 loaded into memory.
File train_deposit_1 loaded into memory.
File train_debitcard_1 loaded into memory.
File train_credit_bureau_a_2_6 loaded into memory.
File train_credit_bureau_a_2_1 loaded into memory.
File train_credit_bureau_a_2_0 loaded into memory.
File train_credit_bureau_a_2_7 loade

case_id,month,week_num,target,assignmentdate_238D,assignmentdate_4527235D,birthdate_574D,contractssum_5085716L,dateofbirth_337D,days120_123L,days180_256L,days30_165L,days360_512L,days90_310L,description_5085714M,education_1103M,education_88M,firstquarter_103L,fourthquarter_440L,maritalst_385M,maritalst_893M,numberofqueries_373L,pmtaverage_3A,pmtaverage_4527227A,pmtcount_4527229L,pmtcount_693L,pmtscount_423L,pmtssum_45A,requesttype_4525192L,responsedate_1012D,responsedate_4527233D,responsedate_4917613D,secondquarter_766L,thirdquarter_1082L,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,…,mean_mainoccupationinc_384A,max_amount_416A,max_num_group1_10,max_openingdate_313D,mean_amount_416A,mean_openingdate_313D,max_num_group1_11,max_openingdate_857D,mean_openingdate_857D,max_collater_typofvalofguarant_298M,max_collater_typofvalofguarant_407M,max_collater_valueofguarantee_1124L,max_collater_valueofguarantee_876L,max_collaterals_typeofguarante_359M,max_collaterals_typeofguarante_669M,max_num_group1_12,max_num_group2,max_pmts_dpd_1073P,max_pmts_dpd_303P,max_pmts_month_158T,max_pmts_month_706T,max_pmts_overdue_1140A,max_pmts_overdue_1152A,max_pmts_year_1139T,max_pmts_year_507T,max_subjectroles_name_541M,max_subjectroles_name_838M,mean_pmts_dpd_1073P,mean_pmts_dpd_303P,mean_pmts_overdue_1140A,mean_pmts_overdue_1152A,var_pmts_dpd_1073P,var_pmts_dpd_303P,var_pmts_overdue_1140A,var_pmts_overdue_1152A,year,day
u32,u32,u8,u8,i16,u8,i16,f32,i32,f32,f32,f32,f32,f32,str,str,str,f32,f32,str,str,f32,f32,f32,f32,f32,f32,f32,str,i8,u8,i8,f32,f32,f32,f32,f32,…,f32,f32,u8,i16,f32,i16,u8,i16,i16,str,str,f32,f32,str,str,u16,u8,f32,f32,f32,f32,f32,f32,f32,f32,str,str,f32,f32,f32,f32,f32,f32,f32,f32,u16,u8
1561462,201910,39,0,,,,,-15309.0,0.0,0.0,0.0,1.0,0.0,"""a55475b1""","""717ddd49""","""a55475b1""",2.0,2.0,"""3439d993""","""a55475b1""",1.0,,,,,6.0,25813.0,"""DEDUCTION_6""",14.0,14.0,,3.0,2.0,0.0,13681.400391,6163.0,…,100000.0,,,,,,,,,"""a55475b1""","""a55475b1""",,0.0,"""c7a5ad39""","""a55475b1""",6.0,35.0,,0.0,,12.0,,0.0,,2020.0,"""ab3c25cf""","""a55475b1""",,0.0,,0.0,,0.0,,0.0,2019,1
877544,201911,47,0,,,,,-15217.0,0.0,0.0,0.0,0.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",1.0,0.0,"""a55475b1""","""a55475b1""",0.0,,,,,,,"""DEDUCTION_6""",,14.0,,2.0,3.0,,,2979.0,…,40000.0,,,,,,,,,"""a55475b1""","""a55475b1""",3100000.0,0.0,"""c7a5ad39""","""c7a5ad39""",3.0,35.0,0.0,0.0,12.0,12.0,0.0,0.0,2020.0,2018.0,"""ab3c25cf""","""ab3c25cf""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2019,29
1714706,201912,51,1,,,,,-18715.0,1.0,3.0,0.0,4.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",8.0,2.0,"""a55475b1""","""a55475b1""",4.0,,,,,,,"""DEDUCTION_6""",,14.0,,5.0,8.0,0.0,339838.25,3119.600098,…,70000.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,3713700.0,"""c7a5ad39""","""c7a5ad39""",3.0,35.0,0.0,8.0,12.0,12.0,0.0,0.0,2020.0,2020.0,"""ab3c25cf""","""ab3c25cf""",0.0,0.571429,0.0,0.0,0.0,4.571429,0.0,0.0,2019,28
1496334,201908,32,0,,,-12954.0,,-12954.0,0.0,0.0,0.0,4.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",6.0,14.0,"""a55475b1""","""a55475b1""",4.0,,,,,8.0,25562.201172,,14.0,,,2.0,4.0,0.0,0.0,7005.800293,…,34000.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,0.0,"""c7a5ad39""","""c7a5ad39""",1.0,35.0,0.0,20.0,12.0,12.0,0.0,2.6,2020.0,2018.0,"""ab3c25cf""","""ab3c25cf""",0.0,3.222222,0.0,0.577778,0.0,48.444443,0.0,1.314445,2019,19
1832273,202004,65,0,,,,,-18598.0,0.0,0.0,0.0,0.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",1.0,2.0,"""a55475b1""","""a55475b1""",0.0,,,,,,,"""DEDUCTION_6""",,14.0,,0.0,0.0,0.0,0.0,2135.400146,…,66000.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,0.0,"""c7a5ad39""","""c7a5ad39""",2.0,35.0,0.0,0.0,12.0,12.0,0.0,0.0,2021.0,2020.0,"""ab3c25cf""","""ab3c25cf""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2020,1
704048,201906,22,0,,,-17231.0,,-17231.0,0.0,0.0,0.0,0.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",0.0,0.0,"""a55475b1""","""a55475b1""",0.0,,,,,1.0,2400.0,,14.0,,,0.0,0.0,0.0,,1512.200073,…,50000.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,,"""a55475b1""","""c7a5ad39""",0.0,35.0,1042.0,,12.0,,11440.75,,2020.0,,"""a55475b1""","""ab3c25cf""",727.583313,,11440.75,,37433.296875,,0.0,,2019,5
1430116,201907,26,0,,,-17899.0,,-17899.0,1.0,2.0,0.0,3.0,1.0,"""a55475b1""","""717ddd49""","""a55475b1""",1.0,0.0,"""3439d993""","""a55475b1""",3.0,,,,,6.0,3480.0,,14.0,,,4.0,2.0,0.0,17185.0,4177.800293,…,60000.0,,,,,,,,,"""a55475b1""","""a55475b1""",3203000.0,0.0,"""c7a5ad39""","""f4d8a027""",3.0,35.0,0.0,0.0,12.0,12.0,0.0,0.0,2020.0,2020.0,"""ab3c25cf""","""ab3c25cf""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2019,3
2680925,202005,71,0,,,,,-19644.0,3.0,5.0,1.0,6.0,3.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",2.0,4.0,"""3439d993""","""a55475b1""",6.0,,,,,,,,,,,5.0,0.0,0.0,218773.25,3187.800049,…,42000.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,171000.0,"""c7a5ad39""","""c7a5ad39""",13.0,35.0,0.0,24.0,12.0,12.0,0.0,7697.600098,2021.0,2021.0,"""ab3c25cf""","""ab3c25cf""",0.0,1.87931,0.0,321.030029,0.0,26.872263,0.0,1527223.0,2020,13
835254,201910,41,0,,,,,,,,,,,"""a55475b1""","""a55475b1""","""a55475b1""",,,"""a55475b1""","""a55475b1""",,,,,,,,"""DEDUCTION_6""",,14.0,,,,,,984.0,…,30000.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2019,19
1774901,202002,57,0,,14.0,,,-25390.0,2.0,2.0,2.0,4.0,2.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",3.0,0.0,"""b6cabe76""","""a55475b1""",4.0,,19155.201172,6.0,,,,"""PENSION_6""",,14.0,,3.0,0.0,0.0,77976.0,3708.199951,…,26600.0,4916.706055,0.0,-1252.0,4916.706055,-1252.0,0.0,-1252.0,-1252.0,"""a55475b1""","""a55475b1""",,200000.0,"""c7a5ad39""","""a55475b1""",12.0,35.0,,1.0,,12.0,,2128.285889,,2020.0,"""ab3c25cf""","""a55475b1""",,0.010309,,21.941093,,0.010309,,46696.921875,2020,5


CPU times: user 5min 49s, sys: 1min 43s, total: 7min 33s
Wall time: 2min 16s


In [6]:
# Capture the final train column list before the frame is converted and
# dropped; the test pipeline later selects exactly these columns (minus
# "target").
cols = train_df.columns
# Utility.to_pandas returns a (frame, cat_cols) pair. Only cat_cols is kept:
# it is passed to the test conversion and used to cast the test features to
# the category dtype.
train_df, cat_cols = Utility.to_pandas(train_df)

# The train rows are not referenced again in this notebook, so release them.
del train_df
gc.collect()

0

### Read test data

In [7]:
def _scan_test(pattern: str, *depth: int):
    """Scan the test parquet file(s) matching ``pattern`` under TEST_DIR.

    ``depth`` (if given) is forwarded unchanged as the second positional
    argument of ``SchemaGen.scan_files``; depth-0 tables pass nothing.
    """
    return SchemaGen.scan_files(os.path.join(TEST_DIR, pattern), *depth)


# Same tables, in the same order, as the train cell.
# NOTE(review): the train columns max_num_group1_10 / _11 / _12 line up with
# each table's position in depth_0 + depth_1 + depth_2, so the suffixes look
# positional. depth_2 used to list credit_bureau_b_2 before credit_bureau_a_2
# here (the reverse of train). With positional suffixes, `.select(cols)` would
# then pick max_num_group1_12 / max_num_group2 from a different table than in
# train. The order now matches train - confirm the suffix logic in
# SchemaGen.join_dataframes.
# test_applprev_2 and test_person_2 are intentionally not loaded (they are not
# loaded for train either).
data_store = {
    "df_base": _scan_test("test_base.parquet"),
    "depth_0": [
        _scan_test("test_static_cb_0.parquet"),
        _scan_test("test_static_0_*.parquet"),
    ],
    "depth_1": [
        _scan_test("test_applprev_1_*.parquet", 1),
        _scan_test("test_tax_registry_a_1.parquet", 1),
        _scan_test("test_tax_registry_b_1.parquet", 1),
        _scan_test("test_tax_registry_c_1.parquet", 1),
        _scan_test("test_credit_bureau_a_1_*.parquet", 1),
        _scan_test("test_credit_bureau_b_1.parquet", 1),
        _scan_test("test_other_1.parquet", 1),
        _scan_test("test_person_1.parquet", 1),
        _scan_test("test_deposit_1.parquet", 1),
        _scan_test("test_debitcard_1.parquet", 1),
    ],
    "depth_2": [
        _scan_test("test_credit_bureau_a_2_*.parquet", 2),
        _scan_test("test_credit_bureau_b_2.parquet", 2),
    ],
}

# No Pipeline.filter_cols here: the feature set is fixed by the train column
# list `cols`, with the label column removed.
test_df: pl.DataFrame = (
    SchemaGen.join_dataframes(**data_store)
    .pipe(Pipeline.transform_cols)
    .pipe(Pipeline.handle_dates)
    .select([col for col in cols if col != "target"])
    .pipe(Utility.reduce_memory_usage, "test_df")
)

# The per-table frames are no longer needed once joined.
del data_store
gc.collect()

print(f"Test data shape: {test_df.shape}")

File test_base loaded into memory.
File test_static_cb_0 loaded into memory.
File test_static_0_0 loaded into memory.
File test_static_0_2 loaded into memory.
File test_static_0_1 loaded into memory.
File test_applprev_1_2 loaded into memory.
File test_applprev_1_0 loaded into memory.
File test_applprev_1_1 loaded into memory.
File test_tax_registry_a_1 loaded into memory.
File test_tax_registry_b_1 loaded into memory.
File test_tax_registry_c_1 loaded into memory.
File test_credit_bureau_a_1_3 loaded into memory.
File test_credit_bureau_a_1_2 loaded into memory.
File test_credit_bureau_a_1_1 loaded into memory.
File test_credit_bureau_a_1_4 loaded into memory.
File test_credit_bureau_a_1_0 loaded into memory.
File test_credit_bureau_b_1 loaded into memory.
File test_other_1 loaded into memory.
File test_person_1 loaded into memory.
File test_deposit_1 loaded into memory.
File test_debitcard_1 loaded into memory.
File test_credit_bureau_b_2 loaded into memory.
File test_credit_bureau_a

In [8]:
# Convert the test frame to pandas, passing in the cat_cols obtained from the
# train conversion (presumably so both splits treat the same columns as
# categorical - confirm in Utility.to_pandas).
test_df, cat_cols = Utility.to_pandas(test_df, cat_cols)

### Inference

In [9]:
# Location of the pickled, already-trained model.
MODEL_PATH = '/kaggle/input/home-credit-crms-models/voting_model.pkl'

# SECURITY: unpickling executes arbitrary code embedded in the file, so only
# load model artifacts that you produced yourself or otherwise trust.
with open(MODEL_PATH, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [10]:
# Submission template, indexed by case_id so that predictions can be aligned
# to it by index.
df_subm: pd.DataFrame = (
    pd.read_csv(os.path.join(ROOT, "sample_submission.csv"))
    .set_index("case_id")
)

In [11]:
# Placeholder category that stands in for missing categorical values.
MISSING_CATEGORY = 'NaN'


def _as_filled_category(column: pd.Series) -> pd.Series:
    """Cast ``column`` to category dtype and replace missing values by 'NaN'.

    The placeholder is registered only when it is not already a category,
    because ``Series.cat.add_categories`` raises ValueError for a category
    that already exists.
    """
    column = column.astype("category")
    if MISSING_CATEGORY not in column.cat.categories:
        column = column.cat.add_categories([MISSING_CATEGORY])
    return column.fillna(MISSING_CATEGORY)


# Feature matrix: every test column except week_num, indexed by case_id.
X_test: pd.DataFrame = test_df.drop(columns=["week_num"]).set_index("case_id")
X_test[cat_cols] = X_test[cat_cols].apply(_as_filled_category)

# Column 1 of predict_proba is used as the score (probability of the positive
# class for a standard binary classifier).
y_pred: pd.Series = pd.Series(loaded_model.predict_proba(X_test)[:, 1], index=X_test.index)

# Assignment aligns on the case_id index; any submission row without a
# matching prediction would silently become NaN, so fail loudly instead.
df_subm["score"] = y_pred
unscored = df_subm["score"].isna()
if unscored.any():
    raise ValueError(
        f"{int(unscored.sum())} submission rows have no prediction, e.g. case_id "
        f"{df_subm.index[unscored][:5].tolist()}"
    )

display(df_subm)

df_subm.to_csv("submission.csv")

Unnamed: 0_level_0,score
case_id,Unnamed: 1_level_1
57543,0.00867
57549,0.056382
57551,0.002242
57552,0.020815
57569,0.170345
57630,0.007066
57631,0.021817
57632,0.005812
57633,0.036832
57634,0.022884


In [12]:
# test_df = test_df.drop(columns=["week_num"]).set_index("case_id")
# test_df[cat_cols] = test_df[cat_cols].astype("category")

# # preds = pd.Series(loaded_model.predict_proba(test_df)[:, 1], index=test_df.index)

# submission_df = pd.DataFrame(index=test_df.index)
# # submission_df["score"] = preds
# submission_df["score"] = 0.1

# # Metric Hack
# # https://www.kaggle.com/code/a520hh/fork-of-this-is-the-way/notebook
# # condition = preds < 0.98
# # submission_df.loc[condition, "score"] = (submission_df.loc[condition, "score"] - 0.073).clip(0)

# submission_df

In [13]:
# submission_df.to_csv("submission.csv")