## Baseline - Infer

※ Kaggle notebookのみで動かす

https://www.kaggle.com/code/pxcai666/catboost-lightgbm-ensemble/notebook

### Configuration

In [1]:
import sys
from pathlib import Path
import subprocess
import os
import gc
from glob import glob
import pickle

import numpy as np
import pandas as pd
import polars as pl
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import TimeSeriesSplit, GroupKFold, StratifiedGroupKFold
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier, Pool
import lightgbm as lgb

from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import KNNImputer

In [2]:
# Kaggle
sys.path.append('/kaggle/input/home-credit-crms-repo')

In [3]:
ROOT = "/kaggle/input/home-credit-credit-risk-model-stability"

TRAIN_DIR = os.path.join(ROOT, "parquet_files", "train")
TEST_DIR = os.path.join(ROOT, "parquet_files", "test")

### Def utility classes

In [4]:
from src.utils.utility import Utility
from src.utils.schema_gen import SchemaGen
from src.utils.pipeline import Pipeline
from src.utils.aggregator import Aggregator

### Read train data

In [5]:
%%time

data_store: dict = {
    "df_base": SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_base.parquet")),
    "depth_0": [
#         SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_static_cb_0.parquet")),
#         SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_static_0_*.parquet")),
    ],
    "depth_1": [
#         SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_applprev_1_*.parquet"), 1),
#         SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_tax_registry_a_1.parquet"), 1),
#         SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_tax_registry_b_1.parquet"), 1),
#         SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_tax_registry_c_1.parquet"), 1),
#         SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_credit_bureau_a_1_*.parquet"), 1),
#         SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_credit_bureau_b_1.parquet"), 1),
#         SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_other_1.parquet"), 1),
#         SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_person_1.parquet"), 1),
#         SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_deposit_1.parquet"), 1),
#         SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_debitcard_1.parquet"), 1),
    ],
    "depth_2": [
#         SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_credit_bureau_a_2_*.parquet"), 2),
#         SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_credit_bureau_b_2.parquet"), 2),
    ],
}

train_df: pl.LazyFrame = (
    SchemaGen.join_dataframes(**data_store)
    .pipe(Pipeline.filter_cols)
    .pipe(Pipeline.transform_cols)
    .pipe(Pipeline.handle_dates)
    .pipe(Utility.reduce_memory_usage, "train_df")
)

del data_store
gc.collect()

print(f"Train data shape: {train_df.shape}")
display(train_df.head(10))

File train_base loaded into memory.
Memory usage of dataframe "train_df" is 37.8543 MB.
Memory usage of dataframe "train_df" became 20.3831 MB.
Memory usage of dataframe "train_df" is 18.9272 MB.
Memory usage of dataframe "train_df" became 18.9272 MB.
Train data shape: (1526659, 6)


case_id,month,week_num,target,year,day
u32,u32,u8,u8,u16,u8
20374,201906,25,0,2019,26
1499960,201908,33,0,2019,22
1885745,202008,82,0,2020,1
1367254,201905,19,0,2019,15
1816937,202003,62,0,2020,15
1339940,201904,14,0,2019,14
1388431,201906,22,0,2019,5
1878915,202007,79,0,2020,13
1312921,201903,10,0,2019,18
1337701,201904,14,0,2019,12


CPU times: user 1.22 s, sys: 330 ms, total: 1.55 s
Wall time: 1.16 s


In [6]:
cols = train_df.columns
train_df, cat_cols = Utility.to_pandas(train_df)

del train_df
gc.collect()

0

### Read test data

In [7]:
data_store = {
    "df_base": SchemaGen.scan_files(os.path.join(TEST_DIR, "test_base.parquet")),
    "depth_0": [
#         SchemaGen.scan_files(os.path.join(TEST_DIR, "test_static_cb_0.parquet")),
#         SchemaGen.scan_files(os.path.join(TEST_DIR, "test_static_0_*.parquet")),
    ],
    "depth_1": [
#         SchemaGen.scan_files(os.path.join(TEST_DIR, "test_applprev_1_*.parquet"), 1),
#         SchemaGen.scan_files(os.path.join(TEST_DIR, "test_tax_registry_a_1.parquet"), 1),
#         SchemaGen.scan_files(os.path.join(TEST_DIR, "test_tax_registry_b_1.parquet"), 1),
#         SchemaGen.scan_files(os.path.join(TEST_DIR, "test_tax_registry_c_1.parquet"), 1),
#         SchemaGen.scan_files(os.path.join(TEST_DIR, "test_credit_bureau_a_1_*.parquet"), 1),
#         SchemaGen.scan_files(os.path.join(TEST_DIR, "test_credit_bureau_b_1.parquet"), 1),
#         SchemaGen.scan_files(os.path.join(TEST_DIR, "test_other_1.parquet"), 1),
#         SchemaGen.scan_files(os.path.join(TEST_DIR, "test_person_1.parquet"), 1),
#         SchemaGen.scan_files(os.path.join(TEST_DIR, "test_deposit_1.parquet"), 1),
#         SchemaGen.scan_files(os.path.join(TEST_DIR, "test_debitcard_1.parquet"), 1),
    ],
    "depth_2": [
#         SchemaGen.scan_files(os.path.join(TEST_DIR, "test_credit_bureau_b_2.parquet"), 2),
#         SchemaGen.scan_files(os.path.join(TEST_DIR, "test_credit_bureau_a_2_*.parquet"), 2),
#         # SchemaGen.scan_files(os.path.join(TEST_DIR, "test_applprev_2.parquet"), 2),
#         # SchemaGen.scan_files(os.path.join(TEST_DIR, "test_person_2.parquet"), 2)
    ],
}

test_df: pl.DataFrame = (
    SchemaGen.join_dataframes(**data_store)
    .pipe(Pipeline.transform_cols)
    .pipe(Pipeline.handle_dates)
    .select([col for col in cols if col != "target"])
    .pipe(Utility.reduce_memory_usage, "test_df")
)

del data_store
gc.collect()

print(f"Test data shape: {test_df.shape}")

File test_base loaded into memory.
Memory usage of dataframe "train_df" is 0.0002 MB.
Memory usage of dataframe "train_df" became 0.0001 MB.
Memory usage of dataframe "test_df" is 0.0001 MB.
Memory usage of dataframe "test_df" became 0.0001 MB.
Test data shape: (10, 5)


In [8]:
test_df, cat_cols = Utility.to_pandas(test_df, cat_cols)

### Inference

In [9]:
X_test = test_df.drop(columns=["week_num"]).set_index("case_id")
X_test[cat_cols] = X_test[cat_cols].astype("category")
# X_test[cat_cols] = X_test[cat_cols].apply(lambda x: x.cat.add_categories(['NaN']).fillna('NaN'))

# preds = pd.Series(loaded_model.predict_proba(test_df)[:, 1], index=test_df.index)

submission_df = pd.DataFrame(index=test_df.index)
# submission_df["score"] = preds
submission_df["score"] = 0.1

# Metric Hack
# https://www.kaggle.com/code/a520hh/fork-of-this-is-the-way/notebook
# condition = preds < 0.98
# submission_df.loc[condition, "score"] = (submission_df.loc[condition, "score"] - 0.073).clip(0)

submission_df

Unnamed: 0,score
0,0.1
1,0.1
2,0.1
3,0.1
4,0.1
5,0.1
6,0.1
7,0.1
8,0.1
9,0.1


In [10]:
submission_df.to_csv("submission.csv")