## Baseline - Infer

Note: this notebook is meant to be run only as a Kaggle notebook.

https://www.kaggle.com/code/pxcai666/catboost-lightgbm-ensemble/notebook

### Configuration

In [1]:
import sys
from pathlib import Path
import subprocess
import os
import gc
from glob import glob
import pickle

import numpy as np
import pandas as pd
import polars as pl
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import TimeSeriesSplit, GroupKFold, StratifiedGroupKFold
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier, Pool
import lightgbm as lgb

from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import KNNImputer

In [2]:
# Record the installed LightGBM version in the notebook output.
print(lgb.__version__)

4.2.0


In [3]:
# Kaggle: put the attached repository dataset on the import path so that the
# `src.utils.*` modules imported further down can be resolved.
sys.path.append('/kaggle/input/home-credit-crms-repo')

In [4]:
# Root of the competition dataset as mounted inside a Kaggle notebook.
ROOT = Path("/kaggle/input/home-credit-credit-risk-model-stability")

# Train / test parquet directories, kept as plain strings.
TRAIN_DIR = str(ROOT / "parquet_files" / "train")
TEST_DIR = str(ROOT / "parquet_files" / "test")

### Def utility classes

In [5]:
from src.utils.utility import Utility
from src.utils.schema_gen import SchemaGen
# from src.utils.pipeline import Pipeline
from src.utils.aggregator import Aggregator

In [6]:
def filter_cols(df: pl.DataFrame) -> pl.DataFrame:
    """
    Removes sparse columns and uninformative string columns from the DataFrame.

    The columns "case_id", "year", "month", "week_num" and "target" are never
    removed. Every other column is dropped when more than 95% of its values
    are null. Of the columns that remain, string columns are dropped when
    they have more than 200 distinct values or exactly one distinct value.

    Args:
    - df (pl.DataFrame): Input DataFrame.

    Returns:
    - pl.DataFrame: DataFrame without the filtered-out columns.
    """
    protected = ("case_id", "year", "month", "week_num", "target")

    # Pass 1: mostly-null columns.
    too_sparse = [
        name
        for name in df.columns
        if name not in protected and df[name].is_null().mean() > 0.95
    ]
    if too_sparse:
        df = df.drop(too_sparse)

    # Pass 2: string columns whose cardinality is too high or constant.
    # Evaluated on the frame that survived pass 1.
    uninformative = []
    for name in df.columns:
        if name in protected:
            continue
        if df[name].dtype == pl.String:
            n_distinct = df[name].n_unique()
            if n_distinct > 200 or n_distinct == 1:
                uninformative.append(name)
    if uninformative:
        df = df.drop(uninformative)

    return df


def transform_cols(df: pl.DataFrame) -> pl.DataFrame:
    """
    Replaces the "riskassesment_302T" range column with two numeric features.

    When the column is present, two columns are added and the raw column is
    removed:

    - "riskassesment_302T_rng": upper bound minus lower bound.
    - "riskassesment_302T_mean": midpoint of the two bounds (Float32).

    Values are parsed by splitting on " - " and stripping "%" from each part,
    i.e. they are expected to look like "<low>% - <high>%" (format inferred
    from the parsing code, not verified against the data). When the column
    holds only nulls (dtype ``pl.Null``) both new columns are created from
    the null column instead.

    Args:
    - df (pl.DataFrame): Input DataFrame.

    Returns:
    - pl.DataFrame: DataFrame with the derived columns and without the raw
      "riskassesment_302T" column; the input unchanged when the column is
      absent.
    """
    source_col = "riskassesment_302T"

    if source_col not in df.columns:
        return df

    if df[source_col].dtype == pl.Null:
        # Nothing to parse: materialise both features from the null column.
        # NOTE(review): the mean is UInt8 here but Float32 in the parsed
        # branch below; confirm whether the dtypes should match.
        df = df.with_columns(
            [
                pl.Series("riskassesment_302T_rng", df[source_col], pl.UInt8),
                pl.Series("riskassesment_302T_mean", df[source_col], pl.UInt8),
            ]
        )
    else:
        # Split once and reuse the result for both bounds.
        bounds: pl.Series = df[source_col].str.split(" - ")

        # `map_elements` replaces `Series.apply`, which polars deprecated
        # and later removed.
        pct_low: pl.Series = bounds.map_elements(
            lambda x: x[0].replace("%", ""), return_dtype=pl.String
        ).cast(pl.UInt8)
        pct_high: pl.Series = bounds.map_elements(
            lambda x: x[1].replace("%", ""), return_dtype=pl.String
        ).cast(pl.UInt8)

        diff: pl.Series = pct_high - pct_low
        avg: pl.Series = ((pct_low + pct_high) / 2).cast(pl.Float32)

        del bounds, pct_high, pct_low
        gc.collect()

        df = df.with_columns(
            [
                diff.alias("riskassesment_302T_rng"),
                avg.alias("riskassesment_302T_mean"),
            ]
        )

    # `drop` returns a new frame; the result must be kept, otherwise the raw
    # column stays in the output.
    return df.drop(source_col)


def handle_dates(df: pl.DataFrame) -> pl.DataFrame:
    """
    Converts date columns into numeric features relative to "date_decision".

    Every column whose name ends with "D" is replaced by its difference from
    "date_decision", expressed in whole days as Int32. "MONTH" and "WEEK_NUM"
    are renamed to "month" and "week_num", "year" (Int16) and "day" (UInt8)
    are taken from "date_decision", and "date_decision" itself is dropped.

    Args:
    - df (pl.DataFrame): Input DataFrame.

    Returns:
    - pl.DataFrame: DataFrame with transformed date columns.
    """
    decision = pl.col("date_decision")

    # One expression per date column; each keeps the name of its column.
    day_offsets = [
        (pl.col(name) - decision).dt.total_days().cast(pl.Int32)
        for name in df.columns
        if name.endswith("D")
    ]
    if day_offsets:
        df = df.with_columns(day_offsets)

    lowercase_names = {"MONTH": "month", "WEEK_NUM": "week_num"}
    year_of_decision = decision.dt.year().cast(pl.Int16).alias("year")
    day_of_decision = decision.dt.day().cast(pl.UInt8).alias("day")

    return (
        df.rename(lowercase_names)
        .with_columns([year_of_decision, day_of_decision])
        .drop("date_decision")
    )

### Read train data

In [7]:
# %%time

# data_store: dict = {
#     "df_base": SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_base.parquet")),
#     "depth_0": [
#         SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_static_cb_0.parquet")),
#         SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_static_0_*.parquet")),
#     ],
#     "depth_1": [
#         SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_applprev_1_*.parquet"), 1),
#         SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_tax_registry_a_1.parquet"), 1),
#         SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_tax_registry_b_1.parquet"), 1),
#         SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_tax_registry_c_1.parquet"), 1),
#         SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_credit_bureau_a_1_*.parquet"), 1),
#         SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_credit_bureau_b_1.parquet"), 1),
#         SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_other_1.parquet"), 1),
#         SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_person_1.parquet"), 1),
#         SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_deposit_1.parquet"), 1),
#         SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_debitcard_1.parquet"), 1),
#     ],
#     "depth_2": [
#         SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_credit_bureau_a_2_*.parquet"), 2),
#         SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_credit_bureau_b_2.parquet"), 2),
#         SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_applprev_2.parquet"), 2),
#         SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_person_2.parquet"), 2)
#     ],
# }

# train_df: pl.LazyFrame = (
#     SchemaGen.join_dataframes(**data_store)
#     .pipe(filter_cols)
#     .pipe(transform_cols)
#     .pipe(handle_dates)
#     .pipe(Utility.reduce_memory_usage, "train_df")
# )

# del data_store
# gc.collect()

# print(f"Train data shape: {train_df.shape}")
# display(train_df.head(10))

In [8]:
# train_df, cat_cols = Utility.to_pandas(train_df)
# cols = train_df.columns

# del train_df
# gc.collect()

In [9]:
cols = ['case_id', 'month', 'week_num', 'target', 'assignmentdate_238D', 'assignmentdate_4527235D', 'birthdate_574D', 'contractssum_5085716L', 'dateofbirth_337D', 'days120_123L', 'days180_256L', 'days30_165L', 'days360_512L', 'days90_310L', 'description_5085714M', 'education_1103M', 'education_88M', 'firstquarter_103L', 'fourthquarter_440L', 'maritalst_385M', 'maritalst_893M', 'numberofqueries_373L', 'pmtaverage_3A', 'pmtaverage_4527227A', 'pmtcount_4527229L', 'pmtcount_693L', 'pmtscount_423L', 'pmtssum_45A', 'requesttype_4525192L', 'responsedate_1012D', 'responsedate_4527233D', 'responsedate_4917613D', 'secondquarter_766L', 'thirdquarter_1082L', 'actualdpdtolerance_344P', 'amtinstpaidbefduel24m_4187115A', 'annuity_780A', 'annuitynextmonth_57A', 'applicationcnt_361L', 'applications30d_658L', 'applicationscnt_1086L', 'applicationscnt_464L', 'applicationscnt_629L', 'applicationscnt_867L', 'avgdbddpdlast24m_3658932P', 'avgdbddpdlast3m_4187120P', 'avgdbdtollast24m_4525197P', 'avgdpdtolclosure24_3658938P', 'avginstallast24m_3658937A', 'avglnamtstart24m_4525187A', 'avgmaxdpdlast9m_3716943P', 'avgoutstandbalancel6m_4187114A', 'avgpmtlast12m_4525200A', 'bankacctype_710L', 'cardtype_51L', 'clientscnt12m_3712952L', 'clientscnt3m_3712950L', 'clientscnt6m_3712949L', 'clientscnt_100L', 'clientscnt_1022L', 'clientscnt_1071L', 'clientscnt_1130L', 'clientscnt_157L', 'clientscnt_257L', 'clientscnt_304L', 'clientscnt_360L', 'clientscnt_493L', 'clientscnt_533L', 'clientscnt_887L', 'clientscnt_946L', 'cntincpaycont9m_3716944L', 'cntpmts24_3658933L', 'commnoinclast6m_3546845L', 'credamount_770A', 'credtype_322L', 'currdebt_22A', 'currdebtcredtyperange_828A', 'datefirstoffer_1144D', 'datelastinstal40dpd_247D', 'datelastunpaid_3546854D', 'daysoverduetolerancedd_3976961L', 'deferredmnthsnum_166L', 'disbursedcredamount_1113A', 'disbursementtype_67L', 'downpmt_116A', 'dtlastpmtallstes_4499206D', 'eir_270L', 'equalitydataagreement_891L', 'firstclxcampaign_1125D', 'firstdatedue_489D', 
'homephncnt_628L', 'inittransactionamount_650A', 'inittransactioncode_186L', 'interestrate_311L', 'isbidproduct_1095L', 'isdebitcard_729L', 'lastactivateddate_801D', 'lastapplicationdate_877D', 'lastapprcommoditycat_1041M', 'lastapprcredamount_781A', 'lastapprdate_640D', 'lastcancelreason_561M', 'lastdelinqdate_224D', 'lastrejectcommoditycat_161M', 'lastrejectcommodtypec_5251769M', 'lastrejectcredamount_222A', 'lastrejectdate_50D', 'lastrejectreason_759M', 'lastrejectreasonclient_4145040M', 'lastst_736L', 'maininc_215A', 'mastercontrelectronic_519L', 'mastercontrexist_109L', 'maxannuity_159A', 'maxdbddpdlast1m_3658939P', 'maxdbddpdtollast12m_3658940P', 'maxdbddpdtollast6m_4187119P', 'maxdebt4_972A', 'maxdpdfrom6mto36m_3546853P', 'maxdpdinstldate_3546855D', 'maxdpdinstlnum_3546846P', 'maxdpdlast12m_727P', 'maxdpdlast24m_143P', 'maxdpdlast3m_392P', 'maxdpdlast6m_474P', 'maxdpdlast9m_1059P', 'maxdpdtolerance_374P', 'maxinstallast24m_3658928A', 'maxlnamtstart6m_4525199A', 'maxoutstandbalancel12m_4187113A', 'maxpmtlast3m_4525190A', 'mindbddpdlast24m_3658935P', 'mindbdtollast24m_4525191P', 'mobilephncnt_593L', 'monthsannuity_845L', 'numactivecreds_622L', 'numactivecredschannel_414L', 'numactiverelcontr_750L', 'numcontrs3months_479L', 'numincomingpmts_3546848L', 'numinstlallpaidearly3d_817L', 'numinstls_657L', 'numinstlsallpaid_934L', 'numinstlswithdpd10_728L', 'numinstlswithdpd5_4187116L', 'numinstlswithoutdpd_562L', 'numinstmatpaidtearly2d_4499204L', 'numinstpaid_4499208L', 'numinstpaidearly3d_3546850L', 'numinstpaidearly3dest_4493216L', 'numinstpaidearly5d_1087L', 'numinstpaidearly5dest_4493211L', 'numinstpaidearly5dobd_4499205L', 'numinstpaidearly_338L', 'numinstpaidearlyest_4493214L', 'numinstpaidlastcontr_4325080L', 'numinstpaidlate1d_3546852L', 'numinstregularpaid_973L', 'numinstregularpaidest_4493210L', 'numinsttopaygr_769L', 'numinsttopaygrest_4493213L', 'numinstunpaidmax_3546851L', 'numinstunpaidmaxest_4493212L', 'numnotactivated_1143L', 'numpmtchanneldd_318L', 
'numrejects9m_859L', 'opencred_647L', 'paytype1st_925L', 'paytype_783L', 'pctinstlsallpaidearl3d_427L', 'pctinstlsallpaidlat10d_839L', 'pctinstlsallpaidlate1d_3546856L', 'pctinstlsallpaidlate4d_3546849L', 'pctinstlsallpaidlate6d_3546844L', 'pmtnum_254L', 'posfpd10lastmonth_333P', 'posfpd30lastmonth_3976960P', 'posfstqpd30lastmonth_3976962P', 'price_1097A', 'sellerplacecnt_915L', 'sellerplacescnt_216L', 'sumoutstandtotal_3546847A', 'sumoutstandtotalest_4493215A', 'totaldebt_9A', 'totalsettled_863A', 'totinstallast1m_4525188A', 'twobodfilling_608L', 'typesuite_864L', 'validfrom_1069D', 'max_actualdpd_943P', 'max_annuity_853A', 'max_approvaldate_319D', 'max_byoccupationinc_3656910L', 'max_cancelreason_3545846M', 'max_childnum_21L', 'max_creationdate_885D', 'max_credacc_actualbalance_314A', 'max_credacc_credlmt_575A', 'max_credacc_maxhisbal_375A', 'max_credacc_minhisbal_90A', 'max_credacc_status_367L', 'max_credacc_transactions_402L', 'max_credamount_590A', 'max_credtype_587L', 'max_currdebt_94A', 'max_dateactivated_425D', 'max_downpmt_134A', 'max_dtlastpmt_581D', 'max_dtlastpmtallstes_3545839D', 'max_education_1138M', 'max_employedfrom_700D', 'max_familystate_726L', 'max_firstnonzeroinstldate_307D', 'max_inittransactioncode_279L', 'max_isbidproduct_390L', 'max_isdebitcard_527L', 'max_mainoccupationinc_437A', 'max_maxdpdtolerance_577P', 'max_num_group1', 'max_outstandingdebt_522A', 'max_pmtnum_8L', 'max_postype_4733339M', 'max_rejectreason_755M', 'max_rejectreasonclient_4145042M', 'max_revolvingaccount_394A', 'max_status_219L', 'max_tenor_203L', 'last_actualdpd_943P', 'last_annuity_853A', 'last_approvaldate_319D', 'last_byoccupationinc_3656910L', 'last_cancelreason_3545846M', 'last_childnum_21L', 'last_creationdate_885D', 'last_credacc_actualbalance_314A', 'last_credacc_credlmt_575A', 'last_credacc_maxhisbal_375A', 'last_credacc_minhisbal_90A', 'last_credacc_status_367L', 'last_credacc_transactions_402L', 'last_credamount_590A', 'last_credtype_587L', 
'last_currdebt_94A', 'last_dateactivated_425D', 'last_downpmt_134A', 'last_dtlastpmt_581D', 'last_dtlastpmtallstes_3545839D', 'last_education_1138M', 'last_employedfrom_700D', 'last_familystate_726L', 'last_firstnonzeroinstldate_307D', 'last_inittransactioncode_279L', 'last_isbidproduct_390L', 'last_mainoccupationinc_437A', 'last_maxdpdtolerance_577P', 'last_num_group1', 'last_outstandingdebt_522A', 'last_pmtnum_8L', 'last_postype_4733339M', 'last_rejectreason_755M', 'last_rejectreasonclient_4145042M', 'last_status_219L', 'last_tenor_203L', 'mean_actualdpd_943P', 'mean_annuity_853A', 'mean_approvaldate_319D', 'mean_creationdate_885D', 'mean_credacc_actualbalance_314A', 'mean_credacc_credlmt_575A', 'mean_credacc_maxhisbal_375A', 'mean_credacc_minhisbal_90A', 'mean_credamount_590A', 'mean_currdebt_94A', 'mean_dateactivated_425D', 'mean_downpmt_134A', 'mean_dtlastpmt_581D', 'mean_dtlastpmtallstes_3545839D', 'mean_employedfrom_700D', 'mean_firstnonzeroinstldate_307D', 'mean_mainoccupationinc_437A', 'mean_maxdpdtolerance_577P', 'mean_outstandingdebt_522A', 'mean_revolvingaccount_394A', 'var_actualdpd_943P', 'var_annuity_853A', 'var_credacc_credlmt_575A', 'var_credamount_590A', 'var_currdebt_94A', 'var_downpmt_134A', 'var_mainoccupationinc_437A', 'var_maxdpdtolerance_577P', 'var_outstandingdebt_522A', 'max_amount_4527230A', 'max_num_group1_3', 'max_recorddate_4527225D', 'last_amount_4527230A', 'last_num_group1_3', 'last_recorddate_4527225D', 'mean_amount_4527230A', 'mean_recorddate_4527225D', 'var_amount_4527230A', 'max_amount_4917619A', 'max_deductiondate_4917603D', 'max_num_group1_4', 'last_amount_4917619A', 'last_deductiondate_4917603D', 'last_num_group1_4', 'mean_amount_4917619A', 'mean_deductiondate_4917603D', 'var_amount_4917619A', 'max_num_group1_5', 'max_pmtamount_36A', 'max_processingdate_168D', 'last_num_group1_5', 'last_pmtamount_36A', 'last_processingdate_168D', 'mean_pmtamount_36A', 'mean_processingdate_168D', 'var_pmtamount_36A', 
'max_annualeffectiverate_199L', 'max_annualeffectiverate_63L', 'max_classificationofcontr_13M', 'max_classificationofcontr_400M', 'max_contractst_545M', 'max_contractst_964M', 'max_contractsum_5085717L', 'max_credlmt_230A', 'max_credlmt_935A', 'max_dateofcredend_289D', 'max_dateofcredend_353D', 'max_dateofcredstart_181D', 'max_dateofcredstart_739D', 'max_dateofrealrepmt_138D', 'max_debtoutstand_525A', 'max_debtoverdue_47A', 'max_description_351M', 'max_dpdmax_139P', 'max_dpdmax_757P', 'max_dpdmaxdatemonth_442T', 'max_dpdmaxdatemonth_89T', 'max_dpdmaxdateyear_596T', 'max_dpdmaxdateyear_896T', 'max_financialinstitution_382M', 'max_financialinstitution_591M', 'max_instlamount_768A', 'max_instlamount_852A', 'max_lastupdate_1112D', 'max_lastupdate_388D', 'max_monthlyinstlamount_332A', 'max_monthlyinstlamount_674A', 'max_nominalrate_281L', 'max_nominalrate_498L', 'max_num_group1_6', 'max_numberofcontrsvalue_258L', 'max_numberofcontrsvalue_358L', 'max_numberofinstls_229L', 'max_numberofinstls_320L', 'max_numberofoutstandinstls_520L', 'max_numberofoutstandinstls_59L', 'max_numberofoverdueinstlmax_1039L', 'max_numberofoverdueinstlmax_1151L', 'max_numberofoverdueinstlmaxdat_148D', 'max_numberofoverdueinstlmaxdat_641D', 'max_numberofoverdueinstls_725L', 'max_numberofoverdueinstls_834L', 'max_outstandingamount_354A', 'max_outstandingamount_362A', 'max_overdueamount_31A', 'max_overdueamount_659A', 'max_overdueamountmax2_14A', 'max_overdueamountmax2_398A', 'max_overdueamountmax2date_1002D', 'max_overdueamountmax2date_1142D', 'max_overdueamountmax_155A', 'max_overdueamountmax_35A', 'max_overdueamountmaxdatemonth_284T', 'max_overdueamountmaxdatemonth_365T', 'max_overdueamountmaxdateyear_2T', 'max_overdueamountmaxdateyear_994T', 'max_periodicityofpmts_1102L', 'max_periodicityofpmts_837L', 'max_prolongationcount_1120L', 'max_purposeofcred_426M', 'max_purposeofcred_874M', 'max_refreshdate_3813885D', 'max_residualamount_488A', 'max_residualamount_856A', 'max_subjectrole_182M', 
'max_subjectrole_93M', 'max_totalamount_6A', 'max_totalamount_996A', 'max_totaldebtoverduevalue_178A', 'max_totaldebtoverduevalue_718A', 'max_totaloutstanddebtvalue_39A', 'max_totaloutstanddebtvalue_668A', 'last_classificationofcontr_13M', 'last_classificationofcontr_400M', 'last_contractst_545M', 'last_contractst_964M', 'last_description_351M', 'last_financialinstitution_382M', 'last_financialinstitution_591M', 'last_num_group1_6', 'last_purposeofcred_426M', 'last_purposeofcred_874M', 'last_refreshdate_3813885D', 'last_subjectrole_182M', 'last_subjectrole_93M', 'mean_credlmt_230A', 'mean_credlmt_935A', 'mean_dateofcredend_289D', 'mean_dateofcredend_353D', 'mean_dateofcredstart_181D', 'mean_dateofcredstart_739D', 'mean_dateofrealrepmt_138D', 'mean_debtoutstand_525A', 'mean_debtoverdue_47A', 'mean_dpdmax_139P', 'mean_dpdmax_757P', 'mean_instlamount_768A', 'mean_instlamount_852A', 'mean_lastupdate_1112D', 'mean_lastupdate_388D', 'mean_monthlyinstlamount_332A', 'mean_monthlyinstlamount_674A', 'mean_numberofoverdueinstlmaxdat_148D', 'mean_numberofoverdueinstlmaxdat_641D', 'mean_outstandingamount_354A', 'mean_outstandingamount_362A', 'mean_overdueamount_31A', 'mean_overdueamount_659A', 'mean_overdueamountmax2_14A', 'mean_overdueamountmax2_398A', 'mean_overdueamountmax2date_1002D', 'mean_overdueamountmax2date_1142D', 'mean_overdueamountmax_155A', 'mean_overdueamountmax_35A', 'mean_refreshdate_3813885D', 'mean_residualamount_488A', 'mean_residualamount_856A', 'mean_totalamount_6A', 'mean_totalamount_996A', 'mean_totaldebtoverduevalue_178A', 'mean_totaldebtoverduevalue_718A', 'mean_totaloutstanddebtvalue_39A', 'mean_totaloutstanddebtvalue_668A', 'var_credlmt_230A', 'var_credlmt_935A', 'var_dpdmax_139P', 'var_dpdmax_757P', 'var_instlamount_768A', 'var_instlamount_852A', 'var_monthlyinstlamount_332A', 'var_monthlyinstlamount_674A', 'var_outstandingamount_354A', 'var_outstandingamount_362A', 'var_overdueamount_31A', 'var_overdueamount_659A', 'var_overdueamountmax2_14A', 
'var_overdueamountmax2_398A', 'var_overdueamountmax_155A', 'var_overdueamountmax_35A', 'var_residualamount_488A', 'var_residualamount_856A', 'var_totalamount_6A', 'var_totalamount_996A', 'max_birth_259D', 'max_contaddr_matchlist_1032L', 'max_contaddr_smempladdr_334L', 'max_education_927M', 'max_empl_employedfrom_271D', 'max_empl_employedtotal_800L', 'max_empl_industry_691L', 'max_empladdr_district_926M', 'max_empladdr_zipcode_114M', 'max_familystate_447L', 'max_housetype_905L', 'max_incometype_1044T', 'max_language1_981M', 'max_mainoccupationinc_384A', 'max_num_group1_9', 'max_personindex_1023L', 'max_persontype_1072L', 'max_persontype_792L', 'max_relationshiptoclient_415T', 'max_relationshiptoclient_642T', 'max_remitter_829L', 'max_role_1084L', 'max_safeguarantyflag_411L', 'max_sex_738L', 'max_type_25L', 'last_birth_259D', 'last_contaddr_matchlist_1032L', 'last_contaddr_smempladdr_334L', 'last_education_927M', 'last_empladdr_district_926M', 'last_empladdr_zipcode_114M', 'last_incometype_1044T', 'last_language1_981M', 'last_mainoccupationinc_384A', 'last_num_group1_9', 'last_personindex_1023L', 'last_persontype_1072L', 'last_persontype_792L', 'last_relationshiptoclient_415T', 'last_relationshiptoclient_642T', 'last_remitter_829L', 'last_role_1084L', 'last_safeguarantyflag_411L', 'last_sex_738L', 'last_type_25L', 'mean_birth_259D', 'mean_empl_employedfrom_271D', 'mean_mainoccupationinc_384A', 'max_amount_416A', 'max_num_group1_10', 'max_openingdate_313D', 'last_amount_416A', 'last_num_group1_10', 'last_openingdate_313D', 'mean_amount_416A', 'mean_openingdate_313D', 'max_num_group1_11', 'max_openingdate_857D', 'last_num_group1_11', 'last_openingdate_857D', 'mean_openingdate_857D', 'max_collater_typofvalofguarant_298M', 'max_collater_typofvalofguarant_407M', 'max_collater_valueofguarantee_1124L', 'max_collater_valueofguarantee_876L', 'max_collaterals_typeofguarante_359M', 'max_collaterals_typeofguarante_669M', 'max_num_group1_12', 'max_num_group2', 
'max_pmts_dpd_1073P', 'max_pmts_dpd_303P', 'max_pmts_month_158T', 'max_pmts_month_706T', 'max_pmts_overdue_1140A', 'max_pmts_overdue_1152A', 'max_pmts_year_1139T', 'max_pmts_year_507T', 'max_subjectroles_name_541M', 'max_subjectroles_name_838M', 'last_collater_typofvalofguarant_298M', 'last_collater_typofvalofguarant_407M', 'last_collaterals_typeofguarante_359M', 'last_collaterals_typeofguarante_669M', 'last_num_group1_12', 'last_num_group2', 'last_pmts_month_158T', 'last_pmts_month_706T', 'last_pmts_year_1139T', 'last_pmts_year_507T', 'last_subjectroles_name_541M', 'last_subjectroles_name_838M', 'mean_pmts_dpd_1073P', 'mean_pmts_dpd_303P', 'mean_pmts_overdue_1140A', 'mean_pmts_overdue_1152A', 'var_pmts_dpd_1073P', 'var_pmts_dpd_303P', 'var_pmts_overdue_1140A', 'var_pmts_overdue_1152A', 'year', 'day']
cat_cols = ['description_5085714M', 'education_1103M', 'education_88M', 'maritalst_385M', 'maritalst_893M', 'requesttype_4525192L', 'bankacctype_710L', 'cardtype_51L', 'credtype_322L', 'disbursementtype_67L', 'equalitydataagreement_891L', 'inittransactioncode_186L', 'isdebitcard_729L', 'lastapprcommoditycat_1041M', 'lastcancelreason_561M', 'lastrejectcommoditycat_161M', 'lastrejectcommodtypec_5251769M', 'lastrejectreason_759M', 'lastrejectreasonclient_4145040M', 'lastst_736L', 'opencred_647L', 'paytype1st_925L', 'paytype_783L', 'twobodfilling_608L', 'typesuite_864L', 'max_cancelreason_3545846M', 'max_credacc_status_367L', 'max_credtype_587L', 'max_education_1138M', 'max_familystate_726L', 'max_inittransactioncode_279L', 'max_isbidproduct_390L', 'max_isdebitcard_527L', 'max_postype_4733339M', 'max_rejectreason_755M', 'max_rejectreasonclient_4145042M', 'max_status_219L', 'last_cancelreason_3545846M', 'last_credacc_status_367L', 'last_credtype_587L', 'last_education_1138M', 'last_familystate_726L', 'last_inittransactioncode_279L', 'last_isbidproduct_390L', 'last_postype_4733339M', 'last_rejectreason_755M', 'last_rejectreasonclient_4145042M', 'last_status_219L', 'max_classificationofcontr_13M', 'max_classificationofcontr_400M', 'max_contractst_545M', 'max_contractst_964M', 'max_description_351M', 'max_financialinstitution_382M', 'max_financialinstitution_591M', 'max_purposeofcred_426M', 'max_purposeofcred_874M', 'max_subjectrole_182M', 'max_subjectrole_93M', 'last_classificationofcontr_13M', 'last_classificationofcontr_400M', 'last_contractst_545M', 'last_contractst_964M', 'last_description_351M', 'last_financialinstitution_382M', 'last_financialinstitution_591M', 'last_purposeofcred_426M', 'last_purposeofcred_874M', 'last_subjectrole_182M', 'last_subjectrole_93M', 'max_contaddr_matchlist_1032L', 'max_contaddr_smempladdr_334L', 'max_education_927M', 'max_empl_employedtotal_800L', 'max_empl_industry_691L', 'max_empladdr_district_926M', 'max_empladdr_zipcode_114M', 
'max_familystate_447L', 'max_housetype_905L', 'max_incometype_1044T', 'max_language1_981M', 'max_relationshiptoclient_415T', 'max_relationshiptoclient_642T', 'max_remitter_829L', 'max_role_1084L', 'max_safeguarantyflag_411L', 'max_sex_738L', 'max_type_25L', 'last_contaddr_matchlist_1032L', 'last_contaddr_smempladdr_334L', 'last_education_927M', 'last_empladdr_district_926M', 'last_empladdr_zipcode_114M', 'last_incometype_1044T', 'last_language1_981M', 'last_relationshiptoclient_415T', 'last_relationshiptoclient_642T', 'last_remitter_829L', 'last_role_1084L', 'last_safeguarantyflag_411L', 'last_sex_738L', 'last_type_25L', 'max_collater_typofvalofguarant_298M', 'max_collater_typofvalofguarant_407M', 'max_collaterals_typeofguarante_359M', 'max_collaterals_typeofguarante_669M', 'max_subjectroles_name_541M', 'max_subjectroles_name_838M', 'last_collater_typofvalofguarant_298M', 'last_collater_typofvalofguarant_407M', 'last_collaterals_typeofguarante_359M', 'last_collaterals_typeofguarante_669M', 'last_subjectroles_name_541M', 'last_subjectroles_name_838M']

### Read test data

In [10]:
def _scan_test(pattern, *depth):
    """Scan the test parquet file(s) matching `pattern` inside TEST_DIR."""
    return SchemaGen.scan_files(os.path.join(TEST_DIR, pattern), *depth)


# Source tables grouped by depth; the scan order matches the listing order.
data_store = {
    "df_base": _scan_test("test_base.parquet"),
    "depth_0": [
        _scan_test(pattern)
        for pattern in (
            "test_static_cb_0.parquet",
            "test_static_0_*.parquet",
        )
    ],
    "depth_1": [
        _scan_test(pattern, 1)
        for pattern in (
            "test_applprev_1_*.parquet",
            "test_tax_registry_a_1.parquet",
            "test_tax_registry_b_1.parquet",
            "test_tax_registry_c_1.parquet",
            "test_credit_bureau_a_1_*.parquet",
            "test_credit_bureau_b_1.parquet",
            "test_other_1.parquet",
            "test_person_1.parquet",
            "test_deposit_1.parquet",
            "test_debitcard_1.parquet",
        )
    ],
    "depth_2": [
        _scan_test(pattern, 2)
        for pattern in (
            "test_credit_bureau_a_2_*.parquet",
            "test_credit_bureau_b_2.parquet",
            # Disabled in this notebook:
            # "test_applprev_2.parquet",
            # "test_person_2.parquet",
        )
    ],
}

# Keep exactly the training feature columns (in training order), minus the label.
feature_cols = [name for name in cols if name != "target"]

test_df: pl.DataFrame = handle_dates(
    transform_cols(SchemaGen.join_dataframes(**data_store))
)
test_df = Utility.reduce_memory_usage(test_df.select(feature_cols), "test_df")

# Release the scanned source tables before the pandas conversion.
del data_store
gc.collect()

print(f"Test data shape: {test_df.shape}")

File test_base loaded into memory.
File test_static_cb_0 loaded into memory.
File test_static_0_0 loaded into memory.
File test_static_0_2 loaded into memory.
File test_static_0_1 loaded into memory.
File test_applprev_1_2 loaded into memory.
File test_applprev_1_0 loaded into memory.
File test_applprev_1_1 loaded into memory.
File test_tax_registry_a_1 loaded into memory.
File test_tax_registry_b_1 loaded into memory.
File test_tax_registry_c_1 loaded into memory.
File test_credit_bureau_a_1_3 loaded into memory.
File test_credit_bureau_a_1_2 loaded into memory.
File test_credit_bureau_a_1_1 loaded into memory.
File test_credit_bureau_a_1_4 loaded into memory.
File test_credit_bureau_a_1_0 loaded into memory.
File test_credit_bureau_b_1 loaded into memory.
File test_other_1 loaded into memory.
File test_person_1 loaded into memory.
File test_deposit_1 loaded into memory.
File test_debitcard_1 loaded into memory.
File test_credit_bureau_a_2_3 loaded into memory.
File test_credit_bureau

In [11]:
# Hand the frame and the categorical column names to the project helper and
# rebind both results. Presumably this converts the polars frame to pandas
# (later cells use pandas-only methods on `test_df`) — confirm against
# src.utils.utility.Utility.to_pandas.
test_df, cat_cols = Utility.to_pandas(test_df, cat_cols)

In [12]:
# Display the prepared test frame as the cell output.
test_df

Unnamed: 0,case_id,month,week_num,assignmentdate_238D,assignmentdate_4527235D,birthdate_574D,contractssum_5085716L,dateofbirth_337D,days120_123L,days180_256L,...,mean_pmts_dpd_1073P,mean_pmts_dpd_303P,mean_pmts_overdue_1140A,mean_pmts_overdue_1152A,var_pmts_dpd_1073P,var_pmts_dpd_303P,var_pmts_overdue_1140A,var_pmts_overdue_1152A,year,day
0,57551,202201,100,,,,2926195.0,-14090,1.0,3.0,...,,,,,,,,,2020,27
1,57549,202201,100,,,,1563078.0,-22723,6.0,9.0,...,,,,,,,,,2022,17
2,57630,202201,100,,,,499975.0,-19767,1.0,2.0,...,0.0,0.0,0.0,0.0,,,,,2021,16
3,57569,202201,100,,,,,-26408,4.0,4.0,...,,2328.571533,,33346.402344,,3341.619141,,0.0,2021,20
4,57631,202201,100,,,,480334.5,-12999,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022,4
5,57633,202201,100,,,,6373008.0,-10496,3.0,3.0,...,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,2022,25
6,57552,202201,100,,,,747031.8,-23768,2.0,2.0,...,,,,,,,,,2020,27
7,57632,202201,100,,,,17677.0,-23107,1.0,2.0,...,0.0,,0.0,,0.0,,0.0,,2022,5
8,57543,202201,100,,,,151364.0,-14804,2.0,4.0,...,0.0,,0.0,,,,,,2021,14
9,57634,202201,100,,,,15263.65,-16281,2.0,2.0,...,,,,,,,,,2021,27


### Inference

In [13]:
# NOTE: unpickling executes arbitrary code; only load model files you trust.
model_path = Path('/kaggle/input/home-credit-crms-models/voting_model.pkl')
with model_path.open('rb') as model_file:
    loaded_model = pickle.load(model_file)

In [14]:
# Start from the sample submission so the output keeps its rows and order.
sample_path = os.path.join(ROOT, "sample_submission.csv")
submission_df: pd.DataFrame = pd.read_csv(sample_path).set_index("case_id")

# Model input: indexed by case_id, without week_num, categoricals as category dtype.
X_test = test_df.set_index("case_id").drop(columns=["week_num"])
X_test[cat_cols] = X_test[cat_cols].astype("category")

# Second column of predict_proba is used as the score.
positive_proba = loaded_model.predict_proba(X_test)[:, 1]
y_pred = pd.Series(positive_proba, index=X_test.index)

# Assignment aligns on the case_id index.
submission_df["score"] = y_pred

# Metric Hack
# https://www.kaggle.com/code/andreasbis/how-far-can-you-go-with-cheating
# condition = y_pred < 0.996
# SHIFT = 0.05
# submission_df.loc[condition, "score"] = (submission_df.loc[condition, "score"] - SHIFT).clip(0)

submission_df

Unnamed: 0_level_0,score
case_id,Unnamed: 1_level_1
57543,0.010411
57549,0.05226
57551,0.002314
57552,0.018727
57569,0.189517
57630,0.007126
57631,0.022349
57632,0.005105
57633,0.036057
57634,0.019069


In [15]:
# test_cp_df = test_df.drop(columns=["week_num"]).reset_index()

# # Restore WEEK_NUM
# credit_a = SchemaGen.scan_files(os.path.join(TEST_DIR, "test_credit_bureau_a_1_*.parquet"), 1)

# max_dates = (credit_a
# .select([pl.col("case_id"), pl.col("refreshdate_3813885D").str.to_datetime()])
# .group_by("case_id")
# .max()
# ).collect().to_pandas().set_index("case_id").refreshdate_3813885D.sort_index()

# predict_dates = max_dates - pd.to_timedelta(str(14) + " days")

# date_min = test_cp_df.refreshdate_3813885D.min()
# date_max = test_cp_df.refreshdate_3813885D.max()
# first_monday = test_cp_df.refreshdate_3813885D[test_df.refreshdate_3813885D.dt.day_of_week==1].min()

# day_between = (date_max - first_monday).days
# day_range = [first_monday + pd.to_timedelta(str(i) + " days") for i in range(0, day_between, 7)]
# test_cp_df["WEEK_NUM"] = 0

# for i in range(-1, len(day_range)):
#     if i < 0 and test_cp_df.refreshdate_3813885D.min() != first_monday:
#         test_cp_df["WEEK_NUM"] += ~test_cp_df.refreshdate_3813885D.isna()
#         continue


#     test_cp_df["WEEK_NUM"] += test_cp_df.refreshdate_3813885D >= day_range[i]

# # Join the restored WEEK_NUM onto the predictions
# submission_df = submission_df.join(test_cp_df["WEEK_NUM"], how="left")

# # METRIC HACK
# condition = submission_df['WEEK_NUM'] < (submission_df['WEEK_NUM'].max() - submission_df['WEEK_NUM'].min())/2 + submission_df['WEEK_NUM'].min()
# submission_df.loc[condition, 'score'] = (submission_df.loc[condition, 'score'] - 0.02).clip(0)
# submission_df = submission_df[["case_id","score"]]
# submission_df = submission_df.set_index("case_id")

In [16]:
# Write the submission file; the case_id index is written as the first column.
submission_df.to_csv("submission.csv")