## Baseline - Infer

※ Kaggle notebookのみで動かす

https://www.kaggle.com/code/pxcai666/catboost-lightgbm-ensemble/notebook

### Configuration

In [1]:
import sys
from pathlib import Path
import subprocess
import os
import gc
from glob import glob
import pickle

import numpy as np
import pandas as pd
import polars as pl
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import TimeSeriesSplit, GroupKFold, StratifiedGroupKFold
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier, Pool
import lightgbm as lgb

from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import KNNImputer

In [2]:
# Kaggle: make the project's `src` package (attached as a Kaggle dataset) importable.
# The membership check keeps the cell idempotent, so re-running it does not
# pile up duplicate entries in sys.path.
REPO_DIR = '/kaggle/input/home-credit-crms-repo'
if REPO_DIR not in sys.path:
    sys.path.append(REPO_DIR)

In [3]:
# Competition dataset root on Kaggle; the parquet files are split into
# train/ and test/ sub-folders below it.
ROOT = "/kaggle/input/home-credit-credit-risk-model-stability"
PARQUET_DIR = os.path.join(ROOT, "parquet_files")

TRAIN_DIR, TEST_DIR = (os.path.join(PARQUET_DIR, split) for split in ("train", "test"))

### Import utility classes

In [4]:
from src.utils.utility import Utility
from src.utils.schema_gen import SchemaGen
from src.utils.pipeline import Pipeline
from src.utils.aggregator import Aggregator

### Read train data

In [5]:
%%time

def _scan_train(pattern: str, *depth: int):
    """Scan the train parquet file(s) matching ``pattern`` inside TRAIN_DIR.

    ``depth`` is forwarded to ``SchemaGen.scan_files`` only when it is given,
    so the base/depth-0 tables keep relying on that function's own default.
    """
    return SchemaGen.scan_files(os.path.join(TRAIN_DIR, pattern), *depth)


# Source tables grouped by depth. The order inside each group is kept exactly
# as before, since it is the order in which files are scanned and handed to
# SchemaGen.join_dataframes.
data_store: dict = {
    "df_base": _scan_train("train_base.parquet"),
    "depth_0": [
        _scan_train(pattern)
        for pattern in (
            "train_static_cb_0.parquet",
            "train_static_0_*.parquet",
        )
    ],
    "depth_1": [
        _scan_train(pattern, 1)
        for pattern in (
            "train_applprev_1_*.parquet",
            "train_tax_registry_a_1.parquet",
            "train_tax_registry_b_1.parquet",
            "train_tax_registry_c_1.parquet",
            "train_credit_bureau_a_1_*.parquet",
            "train_credit_bureau_b_1.parquet",
            "train_other_1.parquet",
            "train_person_1.parquet",
            "train_deposit_1.parquet",
            "train_debitcard_1.parquet",
        )
    ],
    "depth_2": [
        _scan_train(pattern, 2)
        for pattern in (
            "train_credit_bureau_a_2_*.parquet",
            "train_credit_bureau_b_2.parquet",
        )
    ],
}

# Join every table onto the base frame, then run the column filter, column
# transforms, date handling and memory reduction steps in that order.
# The result is annotated as an eager DataFrame: `.shape` is read below and
# `head(10)` renders as a data table, matching the annotation used for the
# test frame.
train_df: pl.DataFrame = (
    SchemaGen.join_dataframes(**data_store)
    .pipe(Pipeline.filter_cols)
    .pipe(Pipeline.transform_cols)
    .pipe(Pipeline.handle_dates)
    .pipe(Utility.reduce_memory_usage, "train_df")
)

# The per-table frames are no longer needed once joined; release them.
del data_store
gc.collect()

print(f"Train data shape: {train_df.shape}")
display(train_df.head(10))

File train_base loaded into memory.
File train_static_cb_0 loaded into memory.
File train_static_0_0 loaded into memory.
File train_static_0_1 loaded into memory.
File train_applprev_1_1 loaded into memory.
File train_applprev_1_0 loaded into memory.
File train_tax_registry_a_1 loaded into memory.
File train_tax_registry_b_1 loaded into memory.
File train_tax_registry_c_1 loaded into memory.
File train_credit_bureau_a_1_3 loaded into memory.
File train_credit_bureau_a_1_2 loaded into memory.
File train_credit_bureau_a_1_0 loaded into memory.
File train_credit_bureau_a_1_1 loaded into memory.
File train_credit_bureau_b_1 loaded into memory.
File train_other_1 loaded into memory.
File train_person_1 loaded into memory.
File train_deposit_1 loaded into memory.
File train_debitcard_1 loaded into memory.
File train_credit_bureau_a_2_6 loaded into memory.
File train_credit_bureau_a_2_1 loaded into memory.
File train_credit_bureau_a_2_0 loaded into memory.
File train_credit_bureau_a_2_7 loade

case_id,month,week_num,target,assignmentdate_238D,assignmentdate_4527235D,birthdate_574D,contractssum_5085716L,dateofbirth_337D,days120_123L,days180_256L,days30_165L,days360_512L,days90_310L,description_5085714M,education_1103M,education_88M,firstquarter_103L,fourthquarter_440L,maritalst_385M,maritalst_893M,numberofqueries_373L,pmtaverage_3A,pmtaverage_4527227A,pmtcount_4527229L,pmtcount_693L,pmtscount_423L,pmtssum_45A,requesttype_4525192L,responsedate_1012D,responsedate_4527233D,responsedate_4917613D,secondquarter_766L,thirdquarter_1082L,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,…,mean_mainoccupationinc_384A,max_amount_416A,max_num_group1_10,max_openingdate_313D,mean_amount_416A,mean_openingdate_313D,max_num_group1_11,max_openingdate_857D,mean_openingdate_857D,max_collater_typofvalofguarant_298M,max_collater_typofvalofguarant_407M,max_collater_valueofguarantee_1124L,max_collater_valueofguarantee_876L,max_collaterals_typeofguarante_359M,max_collaterals_typeofguarante_669M,max_num_group1_12,max_num_group2,max_pmts_dpd_1073P,max_pmts_dpd_303P,max_pmts_month_158T,max_pmts_month_706T,max_pmts_overdue_1140A,max_pmts_overdue_1152A,max_pmts_year_1139T,max_pmts_year_507T,max_subjectroles_name_541M,max_subjectroles_name_838M,mean_pmts_dpd_1073P,mean_pmts_dpd_303P,mean_pmts_overdue_1140A,mean_pmts_overdue_1152A,var_pmts_dpd_1073P,var_pmts_dpd_303P,var_pmts_overdue_1140A,var_pmts_overdue_1152A,year,day
u32,u32,u8,u8,i16,u8,i16,f32,i32,f32,f32,f32,f32,f32,str,str,str,f32,f32,str,str,f32,f32,f32,f32,f32,f32,f32,str,i8,u8,i8,f32,f32,f32,f32,f32,…,f32,f32,u8,i16,f32,i16,u8,i16,i16,str,str,f32,f32,str,str,u16,u8,f32,f32,f32,f32,f32,f32,f32,f32,str,str,f32,f32,f32,f32,f32,f32,f32,f32,u16,u8
131126,201905,20,0,,,-16398.0,,-16398.0,3.0,5.0,3.0,5.0,3.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",0.0,5.0,"""a7fcb6e5""","""a55475b1""",5.0,,,,,2.0,2258.503906,,12.0,,,5.0,1.0,0.0,0.0,1902.800049,…,36000.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,,"""a55475b1""","""c7a5ad39""",1.0,23.0,5.0,,12.0,,4285.800293,,2020.0,,"""a55475b1""","""ab3c25cf""",0.315789,,276.43158,,1.339181,,991674.375,,2019,24
1458400,201907,29,0,,,-12470.0,,-12470.0,2.0,2.0,1.0,2.0,2.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",0.0,0.0,"""3439d993""","""a55475b1""",2.0,,,,,5.0,5564.0,,2.0,,,2.0,1.0,0.0,13139.400391,2301.0,…,60000.0,,,,,,,,,"""a55475b1""","""a55475b1""",,0.0,"""c7a5ad39""","""a55475b1""",7.0,35.0,,1519.0,,12.0,,14599.981445,,2020.0,"""ab3c25cf""","""a55475b1""",,187.682922,,2954.826416,,230112.09375,,23780864.0,2019,23
608531,201901,1,0,,,-8626.0,,,,,,,,"""a55475b1""","""a55475b1""","""a55475b1""",,,"""a55475b1""","""a55475b1""",,,,,,7.0,16118.807617,,14.0,,,,,,,1174.400024,…,26000.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2019,12
1815133,202003,62,0,,,,,-14165.0,4.0,4.0,2.0,8.0,3.0,"""a55475b1""","""6b2ae0fa""","""6b2ae0fa""",2.0,3.0,"""38c061ee""","""e18430ff""",8.0,,,,,,,"""DEDUCTION_6""",,14.0,14.0,3.0,3.0,0.0,2686.400146,2210.400146,…,76000.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,75386.0,"""c7a5ad39""","""c7a5ad39""",3.0,35.0,25.0,1212.0,12.0,12.0,2659.600098,121354.179688,2021.0,2020.0,"""ab3c25cf""","""ab3c25cf""",5.230769,487.612915,204.798309,41923.726562,96.192307,171961.125,544018.9375,1413200000.0,2020,13
166678,201910,40,0,,,,,-13618.0,2.0,3.0,2.0,8.0,2.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",6.0,6.0,"""3439d993""","""a55475b1""",8.0,,,,,,,"""DEDUCTION_6""",,14.0,,8.0,11.0,0.0,92927.601562,9242.799805,…,100000.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,0.0,"""c7a5ad39""","""c7a5ad39""",9.0,35.0,7.0,298.0,12.0,12.0,13152.408203,23690.824219,2020.0,2020.0,"""ab3c25cf""","""ab3c25cf""",0.432432,12.828359,1009.878723,1508.235107,2.084413,2656.158203,9402565.0,17405034.0,2019,13
186995,201912,50,0,,14.0,,,-23516.0,1.0,2.0,0.0,3.0,1.0,"""a55475b1""","""a55475b1""","""a55475b1""",1.0,2.0,"""3439d993""","""46b968c3""",3.0,,7222.200195,6.0,,,,"""PENSION_6""",,14.0,,1.0,3.0,0.0,143365.265625,1415.200073,…,37564.0,289.613983,0.0,-2145.0,289.613983,-2145.0,0.0,-2145.0,-2145.0,"""a55475b1""","""a55475b1""",0.0,0.0,"""c7a5ad39""","""c7a5ad39""",4.0,35.0,0.0,29.0,12.0,12.0,0.0,2490.600098,2020.0,2020.0,"""ab3c25cf""","""ab3c25cf""",0.0,1.014084,0.0,35.523945,0.0,24.242657,0.0,87348.460938,2019,19
980647,202004,68,0,,,,,-13291.0,0.0,0.0,0.0,0.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",0.0,0.0,"""a55475b1""","""a55475b1""",0.0,,,,,,,,,,14.0,0.0,0.0,,,1509.200073,…,24000.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,,"""a55475b1""","""c7a5ad39""",0.0,11.0,0.0,,12.0,,0.0,,2021.0,,"""a55475b1""","""ab3c25cf""",0.0,,0.0,,0.0,,0.0,,2020,21
2558945,201904,13,0,,,-13792.0,,-13792.0,5.0,5.0,2.0,6.0,4.0,"""a55475b1""","""a55475b1""","""a55475b1""",4.0,2.0,"""a55475b1""","""a55475b1""",6.0,,,,,2.0,2584.400146,,14.0,,,0.0,0.0,0.0,12888.799805,3678.600098,…,56000.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,,"""a55475b1""","""c7a5ad39""",0.0,23.0,0.0,,12.0,,0.0,,2020.0,,"""a55475b1""","""ab3c25cf""",0.0,,0.0,,0.0,,0.0,,2019,5
138215,201906,24,0,-4297.0,,-23703.0,,-23703.0,4.0,4.0,1.0,7.0,4.0,"""a55475b1""","""c8e1a1d0""","""a55475b1""",1.0,4.0,"""3439d993""","""a55475b1""",7.0,7512.0,,,6.0,,,,14.0,,,7.0,3.0,0.0,0.0,4613.600098,…,90000.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,0.0,"""c7a5ad39""","""c7a5ad39""",5.0,35.0,5.0,163.0,12.0,12.0,1632.800049,54583.0,2020.0,2020.0,"""ab3c25cf""","""ab3c25cf""",0.333333,12.5,80.076736,3223.135254,1.195402,932.558838,104806.03125,127985360.0,2019,24
801572,201909,36,0,,,,,-11305.0,3.0,5.0,2.0,17.0,3.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",22.0,13.0,"""a7fcb6e5""","""a55475b1""",17.0,,,,,6.0,12613.600586,"""DEDUCTION_6""",14.0,14.0,,9.0,11.0,0.0,,3282.0,…,46000.0,,,,,,,,,"""a55475b1""","""a55475b1""",5038000.0,0.0,"""c7a5ad39""","""c7a5ad39""",7.0,35.0,0.0,35.0,12.0,12.0,0.0,5217.200195,2020.0,2020.0,"""ab3c25cf""","""ab3c25cf""",0.0,1.814159,0.0,438.676819,0.0,21.349083,0.0,955230.1875,2019,14


CPU times: user 5min 45s, sys: 1min 42s, total: 7min 28s
Wall time: 2min 18s


### Read test data

In [6]:
def _scan_test(pattern, *depth):
    """Scan the test parquet file(s) matching ``pattern`` inside TEST_DIR.

    ``depth`` is passed through to ``SchemaGen.scan_files`` only when supplied.
    """
    return SchemaGen.scan_files(os.path.join(TEST_DIR, pattern), *depth)


# Same table layout as the train cell, grouped by depth. Scan order is kept
# unchanged (note that credit_bureau_b_2 comes before credit_bureau_a_2 here).
# test_applprev_2 and test_person_2 were commented out in the original cell
# and remain unloaded.
data_store = {
    "df_base": _scan_test("test_base.parquet"),
    "depth_0": [
        _scan_test(pattern)
        for pattern in (
            "test_static_cb_0.parquet",
            "test_static_0_*.parquet",
        )
    ],
    "depth_1": [
        _scan_test(pattern, 1)
        for pattern in (
            "test_applprev_1_*.parquet",
            "test_tax_registry_a_1.parquet",
            "test_tax_registry_b_1.parquet",
            "test_tax_registry_c_1.parquet",
            "test_credit_bureau_a_1_*.parquet",
            "test_credit_bureau_b_1.parquet",
            "test_other_1.parquet",
            "test_person_1.parquet",
            "test_deposit_1.parquet",
            "test_debitcard_1.parquet",
        )
    ],
    "depth_2": [
        _scan_test(pattern, 2)
        for pattern in (
            "test_credit_bureau_b_2.parquet",
            "test_credit_bureau_a_2_*.parquet",
        )
    ],
}

# Instead of running the column filter again, keep exactly the columns that
# survived in train_df (minus the label), in the same order.
feature_cols = [name for name in train_df.columns if name != "target"]

test_df: pl.DataFrame = (
    SchemaGen.join_dataframes(**data_store)
    .pipe(Pipeline.transform_cols)
    .pipe(Pipeline.handle_dates)
    .select(feature_cols)
    .pipe(Utility.reduce_memory_usage, "test_df")
)

# Release the per-table frames now that they are joined.
del data_store
gc.collect()

print(f"Test data shape: {test_df.shape}")

File test_base loaded into memory.
File test_static_cb_0 loaded into memory.
File test_static_0_0 loaded into memory.
File test_static_0_2 loaded into memory.
File test_static_0_1 loaded into memory.
File test_applprev_1_2 loaded into memory.
File test_applprev_1_0 loaded into memory.
File test_applprev_1_1 loaded into memory.
File test_tax_registry_a_1 loaded into memory.
File test_tax_registry_b_1 loaded into memory.
File test_tax_registry_c_1 loaded into memory.
File test_credit_bureau_a_1_3 loaded into memory.
File test_credit_bureau_a_1_2 loaded into memory.
File test_credit_bureau_a_1_1 loaded into memory.
File test_credit_bureau_a_1_4 loaded into memory.
File test_credit_bureau_a_1_0 loaded into memory.
File test_credit_bureau_b_1 loaded into memory.
File test_other_1 loaded into memory.
File test_person_1 loaded into memory.
File test_deposit_1 loaded into memory.
File test_debitcard_1 loaded into memory.
File test_credit_bureau_b_2 loaded into memory.
File test_credit_bureau_a

In [7]:
# Convert both frames with Utility.to_pandas; each call returns a
# (frame, cat_cols) pair. The cat_cols returned for the train frame are passed
# into the test conversion.
# NOTE(review): presumably this makes both frames share the same categorical
# column list — confirm against Utility.to_pandas.
train_df, cat_cols = Utility.to_pandas(train_df)
test_df, cat_cols = Utility.to_pandas(test_df, cat_cols)

# train_df is not referenced by any later cell shown in this notebook, so it is
# dropped and garbage-collected to free memory before inference.
del train_df
gc.collect()

0

In [8]:
# Preview the test frame after the pandas conversion.
test_df

Unnamed: 0,case_id,month,week_num,assignmentdate_238D,assignmentdate_4527235D,birthdate_574D,contractssum_5085716L,dateofbirth_337D,days120_123L,days180_256L,...,mean_pmts_dpd_1073P,mean_pmts_dpd_303P,mean_pmts_overdue_1140A,mean_pmts_overdue_1152A,var_pmts_dpd_1073P,var_pmts_dpd_303P,var_pmts_overdue_1140A,var_pmts_overdue_1152A,year,day
0,57633,202201,100,,,,6373008.0,-10496,3.0,3.0,...,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,2022,25
1,57549,202201,100,,,,1563078.0,-22723,6.0,9.0,...,,,,,,,,,2022,17
2,57551,202201,100,,,,2926195.0,-14090,1.0,3.0,...,,,,,,,,,2020,27
3,57569,202201,100,,,,,-26408,4.0,4.0,...,,2328.571533,,33346.402344,,3341.619141,,0.0,2021,20
4,57634,202201,100,,,,15263.65,-16281,2.0,2.0,...,,,,,,,,,2021,27
5,57543,202201,100,,,,151364.0,-14804,2.0,4.0,...,0.0,,0.0,,,,,,2021,14
6,57630,202201,100,,,,499975.0,-19767,1.0,2.0,...,0.0,0.0,0.0,0.0,,,,,2021,16
7,57631,202201,100,,,,480334.5,-12999,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022,4
8,57552,202201,100,,,,747031.8,-23768,2.0,2.0,...,,,,,,,,,2020,27
9,57632,202201,100,,,,17677.0,-23107,1.0,2.0,...,0.0,,0.0,,0.0,,0.0,,2022,5


### Inference

In [9]:
# Feature matrix for inference: indexed by case_id, with week_num left out.
X_test = test_df.set_index("case_id").drop(columns=["week_num"])
# Cast every column listed in cat_cols to the pandas category dtype.
X_test = X_test.astype({col: "category" for col in cat_cols})
# Disabled alternative from the original cell: turn missing categories into an
# explicit 'NaN' level.
# X_test[cat_cols] = X_test[cat_cols].apply(lambda x: x.cat.add_categories(['NaN']).fillna('NaN'))

# Model scoring is currently disabled (`loaded_model` is not defined in the
# cells shown here).
# NOTE(review): when re-enabling, index preds by X_test.index (case_id) rather
# than test_df.index so that it aligns with submission_df.
# preds = pd.Series(loaded_model.predict_proba(X_test)[:, 1], index=test_df.index)

# Placeholder submission: every case_id receives the constant score 0.1.
submission_df = pd.DataFrame({"score": 0.1}, index=X_test.index)
# submission_df["score"] = preds

# Metric hack (disabled), see
# https://www.kaggle.com/code/a520hh/fork-of-this-is-the-way/notebook
# condition = preds < 0.98
# submission_df.loc[condition, "score"] = (submission_df.loc[condition, "score"] - 0.073).clip(0)

submission_df

Unnamed: 0_level_0,score
case_id,Unnamed: 1_level_1
57633,0.1
57549,0.1
57551,0.1
57569,0.1
57634,0.1
57543,0.1
57630,0.1
57631,0.1
57552,0.1
57632,0.1


In [10]:
# Write the submission file; the case_id index is written as the first column
# (index=True is the to_csv default, spelled out here for clarity).
submission_df.to_csv("submission.csv", index=True)