In [1]:
import os
import gc
from glob import glob
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd
import polars as pl

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedGroupKFold
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import roc_auc_score

import joblib

import lightgbm as lgb

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)



In [2]:
class VotingModel(BaseEstimator, RegressorMixin):
    """Ensemble that averages the outputs of its base estimators.

    The estimators are only queried at prediction time; ``fit`` does not
    train them.
    """

    def __init__(self, estimators):
        """Keep a reference to the base estimators to average over."""
        self.estimators = estimators

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the base estimators are not trained here."""
        return self

    def _averaged(self, method_name, X):
        """Call ``method_name(X)`` on every estimator and average along axis 0."""
        outputs = []
        for member in self.estimators:
            outputs.append(getattr(member, method_name)(X))
        return np.mean(outputs, axis=0)

    def predict(self, X):
        """Return the element-wise mean of each estimator's ``predict(X)``."""
        return self._averaged("predict", X)

    def predict_proba(self, X):
        """Return the element-wise mean of each estimator's ``predict_proba(X)``."""
        return self._averaged("predict_proba", X)

In [3]:
class Pipeline:
    """Stateless preprocessing helpers for Polars frames."""

    @staticmethod
    def set_table_dtypes(df):
        """Cast columns to a dtype selected from the column name.

        * ``case_id``, ``WEEK_NUM``, ``num_group1``, ``num_group2`` -> Int64
        * ``date_decision`` -> Date
        * names ending in ``P`` or ``A`` -> Float64
        * names ending in ``M`` -> String
        * names ending in ``D`` -> Date

        Columns matching none of these rules are left as they are.
        """
        id_like = ("case_id", "WEEK_NUM", "num_group1", "num_group2")
        dtype_by_suffix = {
            "P": pl.Float64,
            "A": pl.Float64,
            "M": pl.String,
            "D": pl.Date,
        }

        casts = []
        for name in df.columns:
            if name in id_like:
                target = pl.Int64
            elif name == "date_decision":
                target = pl.Date
            else:
                target = dtype_by_suffix.get(name[-1])

            if target is not None:
                casts.append(pl.col(name).cast(target))

        # The casts are independent of each other, so apply them in one pass.
        return df.with_columns(casts) if casts else df

    @staticmethod
    def handle_dates(df):
        """Turn ``*D`` columns into day offsets from ``date_decision``.

        Each column whose name ends in ``D`` is replaced by
        ``(column - date_decision).dt.total_days()``. Afterwards the
        ``date_decision`` and ``MONTH`` columns are dropped.
        """
        date_cols = [name for name in df.columns if name[-1] == "D"]

        if date_cols:
            decision = pl.col("date_decision")
            df = df.with_columns(
                [(pl.col(name) - decision).dt.total_days() for name in date_cols]
            )

        return df.drop("date_decision", "MONTH")

    @staticmethod
    def filter_cols(df):
        """Drop sparse columns and uninformative string columns.

        ``target``, ``case_id`` and ``WEEK_NUM`` are never dropped. Among the
        remaining columns, first every column with more than 95% nulls is
        removed; then every String column with exactly one distinct value or
        more than 200 distinct values is removed.
        """
        protected = ("target", "case_id", "WEEK_NUM")

        too_sparse = [
            name
            for name in df.columns
            if name not in protected and df[name].is_null().mean() > 0.95
        ]
        for name in too_sparse:
            df = df.drop(name)

        bad_cardinality = []
        for name in df.columns:
            if name in protected or df[name].dtype != pl.String:
                continue
            n_distinct = df[name].n_unique()
            if n_distinct == 1 or n_distinct > 200:
                bad_cardinality.append(name)
        for name in bad_cardinality:
            df = df.drop(name)

        return df

In [4]:
class Aggregator:
    """Builds aggregation expressions: one ``max_<col>`` per selected column."""

    @staticmethod
    def _max_exprs(names):
        """Return ``pl.max(name)`` aliased to ``max_<name>`` for each name."""
        return [pl.max(name).alias(f"max_{name}") for name in names]

    @staticmethod
    def num_expr(df):
        """Max expressions for columns whose name ends in ``P`` or ``A``."""
        return Aggregator._max_exprs(
            [name for name in df.columns if name[-1] in ("P", "A")]
        )

    @staticmethod
    def date_expr(df):
        """Max expressions for columns whose name ends in ``D``."""
        return Aggregator._max_exprs(
            [name for name in df.columns if name[-1] in ("D",)]
        )

    @staticmethod
    def str_expr(df):
        """Max expressions for columns whose name ends in ``M``."""
        return Aggregator._max_exprs(
            [name for name in df.columns if name[-1] in ("M",)]
        )

    @staticmethod
    def other_expr(df):
        """Max expressions for columns whose name ends in ``T`` or ``L``."""
        return Aggregator._max_exprs(
            [name for name in df.columns if name[-1] in ("T", "L")]
        )

    @staticmethod
    def count_expr(df):
        """Max expressions for columns whose name contains ``num_group``."""
        return Aggregator._max_exprs(
            [name for name in df.columns if "num_group" in name]
        )

    @staticmethod
    def get_exprs(df):
        """Concatenate all expression groups: num, date, str, other, count."""
        return [
            *Aggregator.num_expr(df),
            *Aggregator.date_expr(df),
            *Aggregator.str_expr(df),
            *Aggregator.other_expr(df),
            *Aggregator.count_expr(df),
        ]

In [5]:
def read_file(path, depth=None):
    """Load one parquet file, cast its columns, and optionally aggregate.

    When ``depth`` is 1 or 2 the table is grouped by ``case_id`` and reduced
    with the expressions from ``Aggregator.get_exprs``; for any other value
    the typed table is returned unaggregated.
    """
    table = Pipeline.set_table_dtypes(pl.read_parquet(path))

    if depth not in (1, 2):
        return table

    return table.group_by("case_id").agg(Aggregator.get_exprs(table))

def read_files(regex_path, depth=None):
    """Read every parquet file matching a glob pattern into one frame.

    Each matching file is read and passed through
    ``Pipeline.set_table_dtypes``; the chunks are then stacked with
    ``pl.concat(..., how="vertical_relaxed")``. When ``depth`` is 1 or 2 the
    stacked table is grouped by ``case_id`` and reduced with
    ``Aggregator.get_exprs``. Aggregation happens after concatenation so that
    each ``case_id`` yields a single row even if its rows span several files.

    Args:
        regex_path: Glob pattern (str or path-like) selecting the files.
        depth: Aggregate by ``case_id`` when 1 or 2; otherwise no aggregation.

    Returns:
        The concatenated (and possibly aggregated) Polars DataFrame.

    Raises:
        FileNotFoundError: If the pattern matches no files.
    """
    # glob() returns paths in filesystem order; sort for a reproducible row order.
    paths = sorted(glob(str(regex_path)))
    if not paths:
        # Fail with the offending pattern instead of an opaque concat error.
        raise FileNotFoundError(f"No parquet files match pattern: {regex_path}")

    chunks = [
        pl.read_parquet(path).pipe(Pipeline.set_table_dtypes) for path in paths
    ]

    df = pl.concat(chunks, how="vertical_relaxed")
    if depth in [1, 2]:
        df = df.group_by("case_id").agg(Aggregator.get_exprs(df))

    return df

In [6]:
def feature_eng(df_base, depth_0, depth_1, depth_2):
    """Assemble the modelling table from the base frame and the side tables.

    Adds ``month_decision`` and ``weekday_decision`` (derived from
    ``date_decision``) to the base frame, left-joins every table from
    ``depth_0``, ``depth_1`` and ``depth_2`` (in that order) on ``case_id``
    using the suffix ``_<position>`` for clashing column names, and finally
    applies ``Pipeline.handle_dates``.
    """
    decision_date = pl.col("date_decision")
    joined = df_base.with_columns(
        month_decision=decision_date.dt.month(),
        weekday_decision=decision_date.dt.weekday(),
    )

    side_tables = depth_0 + depth_1 + depth_2
    for position, table in enumerate(side_tables):
        joined = joined.join(
            table, how="left", on="case_id", suffix=f"_{position}"
        )

    return Pipeline.handle_dates(joined)

In [7]:
def to_pandas(df_data, cat_cols=None):
    """Convert to pandas and make the given columns categorical.

    ``df_data`` must expose ``to_pandas()``. If ``cat_cols`` is not supplied,
    every column of dtype ``object`` in the converted frame is used.

    Returns:
        A ``(frame, cat_cols)`` tuple: the pandas frame with ``cat_cols`` cast
        to ``category``, and the list of categorical column names used.
    """
    frame = df_data.to_pandas()

    if cat_cols is None:
        object_cols = frame.select_dtypes("object").columns
        cat_cols = list(object_cols)

    as_category = frame[cat_cols].astype("category")
    frame[cat_cols] = as_category

    return frame, cat_cols

In [8]:
def train_models(X_train, y_train, weeks, cv, params=None):
    """Train one LightGBM classifier per cross-validation fold.

    Args:
        X_train: Feature frame; rows are selected positionally with ``.iloc``.
        y_train: Target series aligned with ``X_train``.
        weeks: Group labels passed to ``cv.split`` as ``groups``.
        cv: Splitter exposing ``split(X, y, groups=...)``.
        params: Optional dict of ``LGBMClassifier`` keyword arguments that
            override the defaults below (for example ``{"device": "cpu"}`` on
            a machine without a GPU). ``None`` keeps the defaults unchanged.

    Returns:
        List of fitted ``LGBMClassifier`` models, one per fold, in fold order.
    """
    default_params = {
        "boosting_type": "gbdt",
        "objective": "binary",
        "metric": "auc",
        "max_depth": 8,
        "learning_rate": 0.05,
        "n_estimators": 1000,
        "colsample_bytree": 0.8,
        "colsample_bynode": 0.8,
        "verbose": -1,
        "random_state": 42,
        "device": "gpu",
    }
    # Caller-supplied values win; the defaults dict itself is never mutated.
    model_params = {**default_params, **(params or {})}

    fitted_models = []

    for idx_train, idx_valid in cv.split(X_train, y_train, groups=weeks):
        X_train_fold, y_train_fold = X_train.iloc[idx_train], y_train.iloc[idx_train]
        X_valid_fold, y_valid_fold = X_train.iloc[idx_valid], y_train.iloc[idx_valid]

        model = lgb.LGBMClassifier(**model_params)
        model.fit(
            X_train_fold, y_train_fold,
            eval_set=[(X_valid_fold, y_valid_fold)],
            # Log every 100 iterations; stop after 100 rounds without improvement.
            callbacks=[lgb.log_evaluation(100), lgb.early_stopping(100)]
        )

        fitted_models.append(model)

    return fitted_models

In [9]:
def evaluate_models(models, X_test, y_test):
    """Score an ensemble: ROC AUC of the averaged positive-class probability.

    Column 1 of each model's ``predict_proba(X_test)`` is averaged across the
    models, and a single ROC AUC of that average against ``y_test`` is returned.
    """
    per_model_scores = []
    for member in models:
        per_model_scores.append(member.predict_proba(X_test)[:, 1])

    blended = np.mean(per_model_scores, axis=0)
    return roc_auc_score(y_test, blended)

def save_model(model, filepath):
    """Persist ``model`` to ``filepath`` using ``joblib.dump``."""
    joblib.dump(value=model, filename=filepath)

In [10]:
# Dataset root and the per-split parquet directories derived from it.
ROOT = Path("/kaggle/input/home-credit-credit-risk-model-stability")
TRAIN_DIR = ROOT.joinpath("parquet_files", "train")
TEST_DIR = ROOT.joinpath("parquet_files", "test")

In [11]:
# Load the training tables (aggregating depth-1/2 tables per case_id) and
# join them onto the base table.

def load_data_store(data_dir, split):
    """Read all source tables for one data split.

    Args:
        data_dir: Directory containing the split's parquet files.
        split: File-name prefix of the split (e.g. ``"train"``); files are
            looked up as ``<data_dir>/<split>_<table>.parquet``.

    Returns:
        Dict with keys ``df_base``, ``depth_0``, ``depth_1`` and ``depth_2``,
        matching the keyword arguments of ``feature_eng``.
    """
    def table(name):
        # Path (or glob pattern) of one table of this split.
        return data_dir / f"{split}_{name}.parquet"

    return {
        "df_base": read_file(table("base")),
        "depth_0": [
            read_file(table("static_cb_0")),
            read_files(table("static_0_*")),
        ],
        "depth_1": [
            read_files(table("applprev_1_*"), 1),
            read_file(table("tax_registry_a_1"), 1),
            read_file(table("tax_registry_b_1"), 1),
            read_file(table("tax_registry_c_1"), 1),
            read_file(table("credit_bureau_b_1"), 1),
            read_file(table("other_1"), 1),
            read_file(table("person_1"), 1),
            read_file(table("deposit_1"), 1),
            read_file(table("debitcard_1"), 1),
        ],
        "depth_2": [
            read_file(table("credit_bureau_b_2"), 2),
        ],
    }


data_store = load_data_store(TRAIN_DIR, "train")
df_train = feature_eng(**data_store)

In [12]:
# Show the (rows, columns) of the joined training frame.
df_train.shape

(1526659, 376)

In [12]:
# Read the test-split tables in the same layout as the training split:
# depth-0 tables as they are, depth-1/2 tables aggregated per case_id.
test_depth_1_files = [
    "test_tax_registry_a_1.parquet",
    "test_tax_registry_b_1.parquet",
    "test_tax_registry_c_1.parquet",
    "test_credit_bureau_b_1.parquet",
    "test_other_1.parquet",
    "test_person_1.parquet",
    "test_deposit_1.parquet",
    "test_debitcard_1.parquet",
]

data_store = {
    "df_base": read_file(TEST_DIR / "test_base.parquet"),
    "depth_0": [
        read_file(TEST_DIR / "test_static_cb_0.parquet"),
        read_files(TEST_DIR / "test_static_0_*.parquet"),
    ],
    "depth_1": [
        read_files(TEST_DIR / "test_applprev_1_*.parquet", 1),
        *[read_file(TEST_DIR / file_name, 1) for file_name in test_depth_1_files],
    ],
    "depth_2": [
        read_file(TEST_DIR / "test_credit_bureau_b_2.parquet", 2),
    ],
}

df_test = feature_eng(**data_store)

In [13]:
# Drop sparse / uninformative columns from the training frame, then restrict
# the test frame to the surviving columns (everything except the target).
df_train = Pipeline.filter_cols(df_train)
kept_test_cols = [name for name in df_train.columns if name != "target"]
df_test = df_test.select(kept_test_cols)

# Convert both frames to pandas; the test frame reuses the categorical
# column list determined from the training frame.
df_train, cat_cols = to_pandas(df_train)
df_test, cat_cols = to_pandas(df_test, cat_cols)

# Release the raw tables that are no longer needed.
del data_store

gc.collect()

0

In [14]:
# Model training and evaluation
# Train one LightGBM model per fold, with folds grouped by WEEK_NUM.

X = df_train.drop(columns=["target", "case_id", "WEEK_NUM"])
y = df_train["target"]
weeks = df_train["WEEK_NUM"]

cv = StratifiedGroupKFold(n_splits=5, shuffle=False)

fitted_models = train_models(X, y, weeks, cv)

# evaluate_models averages the fold models' predictions and computes a single
# AUC. It is scored here on the same rows the models were trained on, so the
# figure is an in-sample number, not a held-out estimate.
auc = evaluate_models(fitted_models, X, y)
print("AUC of fold-averaged predictions on training data (in-sample):", auc)



Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.813977
[200]	valid_0's auc: 0.82321
[300]	valid_0's auc: 0.826503
[400]	valid_0's auc: 0.827693
[500]	valid_0's auc: 0.828272
[600]	valid_0's auc: 0.82855
[700]	valid_0's auc: 0.828644
[800]	valid_0's auc: 0.828842
[900]	valid_0's auc: 0.829015
[1000]	valid_0's auc: 0.829203
Did not meet early stopping. Best iteration is:
[947]	valid_0's auc: 0.829214
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.815076
[200]	valid_0's auc: 0.823548
[300]	valid_0's auc: 0.826763
[400]	valid_0's auc: 0.827733
[500]	valid_0's auc: 0.828577
[600]	valid_0's auc: 0.828952
[700]	valid_0's auc: 0.829305
[800]	valid_0's auc: 0.829507
[900]	valid_0's auc: 0.829861
[1000]	valid_0's auc: 0.829868
Did not meet early stopping. Best iteration is:
[990]	valid_0's auc: 0.829908
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.820429
[200]	valid_0's auc: 0.82979

In [18]:
# Final model selection and saving
# NOTE(review): the model is picked by a hard-coded fold index, not by comparing
# evaluation metrics — confirm whether metric-based selection was intended.

best_model = fitted_models[3]  # For demonstration, use the fold model at index 3 (the fourth one)
save_model(best_model, "best_model.pkl")

In [19]:
# Test features: drop WEEK_NUM and index by case_id so the predictions
# carry the case ids.
X_test = df_test.drop(columns=["WEEK_NUM"]).set_index("case_id")

test_scores = best_model.predict_proba(X_test)[:, 1]
y_pred = pd.Series(test_scores, index=X_test.index)

In [20]:
# Start from the sample submission and fill in the scores; assignment
# aligns on the case_id index.
df_subm = pd.read_csv(ROOT / "sample_submission.csv").set_index("case_id")

df_subm["score"] = y_pred

In [21]:
# Report whether any submission row ended up without a score.
has_null_score = df_subm["score"].isnull().any()
print("Check null: ", has_null_score)

df_subm.head()

Check null:  False


Unnamed: 0_level_0,score
case_id,Unnamed: 1_level_1
57543,0.193405
57549,0.133205
57551,0.068555
57552,0.063663
57569,0.045935


In [22]:
# Write the submission; the case_id index is written as the first column.
df_subm.to_csv("submission.csv")