In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import os
import gc
import numpy as np
import pandas as pd
import polars as pl
from glob import glob
from pathlib import Path
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import TimeSeriesSplit, GroupKFold, StratifiedGroupKFold
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

In [3]:
# Configure

IS_KAGGLE = False
INPUT_PATH = Path("/kaggle/input" if IS_KAGGLE else "D:/MS/PPNCKH/Data")
OUTPUT_PATH = Path("/kaggle/output" if IS_KAGGLE else "D:/MS/PPNCKH/Final/output")

RE_FEATURES_EXTRACTED = False

In [4]:
class Pipeline:
    
    # Set data type
    @staticmethod
    def set_table_dtypes(df):
        for col in df.columns:
            if col in ["case_id", "WEEK_NUM", "num_group1", "num_group2"]:
                df = df.with_columns(pl.col(col).cast(pl.Int64))
            elif col in ["date_decision"]:
                df = df.with_columns(pl.col(col).cast(pl.Date))
            elif col[-1] in ("P", "A"):
                df = df.with_columns(pl.col(col).cast(pl.Float64))
            elif col[-1] in ("M",):
                df = df.with_columns(pl.col(col).cast(pl.String))
            elif col[-1] in ("D",):
                df = df.with_columns(pl.col(col).cast(pl.Date))
        return df

    # Handle dates
    @staticmethod
    def handle_dates(df):
        for col in df.columns:
            # Extract month and weekday from date columns
            if col[-1] in ("D",):
                # Subtract the "date_decision" column from the column specified by the variable `col`
                df = df.with_columns(pl.col(col) - pl.col("date_decision"))
                # Convert the date column to total number of days since the first date
                df = df.with_columns(pl.col(col).dt.total_days())
        # Drop the "date_decision" and "MONTH" columns
        df = df.drop("date_decision", "MONTH")

        return df

    # Filter columns
    # Filter 1: Remove columns with more than 95% missing values
    # Filter 2: Remove columns with only one unique value or more than 200 unique values
    @staticmethod
    def filter_cols(df):
        for col in df.columns:
            if col not in ["target", "case_id", "WEEK_NUM"]:
                isnull = df[col].is_null().mean()

                if isnull > 0.95:
                    df = df.drop(col)

        for col in df.columns:
            if (col not in ["target", "case_id", "WEEK_NUM"]) & (df[col].dtype == pl.String):
                freq = df[col].n_unique()

                if (freq == 1) | (freq > 200):
                    df = df.drop(col)

        return df

In [5]:
# Aggregate expression generator
class Aggregator:
    # Generate max aggregation expression for numerical columns
    @staticmethod
    def num_expr(df):
        cols = [col for col in df.columns if col[-1] in ("P", "A")]
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        return expr_max

    # Generate max aggregation expression for date columns
    @staticmethod
    def date_expr(df):
        cols = [col for col in df.columns if col[-1] in ("D",)]
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        return expr_max

    # Generate max aggregation expression for string columns
    @staticmethod
    def str_expr(df):
        cols = [col for col in df.columns if col[-1] in ("M",)]
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        return expr_max

    # Generate max aggregation expression for other columns
    @staticmethod
    def other_expr(df):
        cols = [col for col in df.columns if col[-1] in ("T", "L")]
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        return expr_max

    # Generate max aggregation expression for count columns
    @staticmethod
    def count_expr(df):
        cols = [col for col in df.columns if "num_group" in col]
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        return expr_max

    # Generate all aggregation expressions
    @staticmethod
    def get_exprs(df):
        exprs = Aggregator.num_expr(df) + \
                Aggregator.date_expr(df) + \
                Aggregator.str_expr(df) + \
                Aggregator.other_expr(df) + \
                Aggregator.count_expr(df)
        return exprs

In [6]:
# Read a single file and perform preprocessing
def read_file(path, depth=None):
    # Use Polars to read the parquet file
    df = pl.read_parquet(path)
    # Set the data types of the columns
    df = df.pipe(Pipeline.set_table_dtypes)
    # If depth is 1 or 2, group by case_id and aggregate the columns
    if depth in [1, 2]:
        df = df.group_by("case_id").agg(Aggregator.get_exprs(df))
    return df

# Read multiple files and perform preprocessing
def read_files(regex_path, depth=None):
    chunks = []
    # Use glob to find all files that match the regex pattern
    for path in glob(str(regex_path)):
        # Read the parquet file using Polars and set the data types
        chunks.append(pl.read_parquet(path).pipe(Pipeline.set_table_dtypes))
    # Concatenate the data frames vertically
    df = pl.concat(chunks, how="vertical_relaxed")
    # If depth is 1 or 2, group by case_id and aggregate the columns
    if depth in [1, 2]:
        df = df.group_by("case_id").agg(Aggregator.get_exprs(df))
    return df

In [7]:
# Feature engineering function to join the data frames and create new features
def feature_eng(df_base, depth_0, depth_1, depth_2):
    # Extract the month and weekday from the date_decision column
    df_base = (
        df_base
        .with_columns(
            month_decision = pl.col("date_decision").dt.month(),
            weekday_decision = pl.col("date_decision").dt.weekday(),
        )
    )
    # Join the data frames on the case_id column
    for i, df in enumerate(depth_0 + depth_1 + depth_2):
        df_base = df_base.join(df, how="left", on="case_id", suffix=f"_{i}")
    # Handle missing values
    df_base = df_base.pipe(Pipeline.handle_dates)
    return df_base

In [8]:
# Convert Polars DataFrame to Pandas DataFrame and handle categorical columns
def to_pandas(df_data, cat_cols=None):
    df_data = df_data.to_pandas(use_pyarrow_extension_array=False)
    if cat_cols is None:
        cat_cols = list(df_data.select_dtypes("object").columns)
    df_data[cat_cols] = df_data[cat_cols].astype("category")
    return df_data, cat_cols

In [9]:
ROOT            = INPUT_PATH / "home-credit-credit-risk-model-stability"
TRAIN_DIR       = ROOT / "parquet_files" / "train"
TEST_DIR        = ROOT / "parquet_files" / "test"
SAVED_DF_TRAIN  = OUTPUT_PATH / "df_train.parquet"
SAVED_DF_TEST   = OUTPUT_PATH / "df_test.parquet"

In [10]:
if RE_FEATURES_EXTRACTED or not SAVED_DF_TRAIN.exists():
    print("Create features training data ...")
    # Read the training data
    data_store = {
        "df_base": read_file(TRAIN_DIR / "train_base.parquet"),
        "depth_0": [
            read_file(TRAIN_DIR / "train_static_cb_0.parquet"),
            read_files(TRAIN_DIR / "train_static_0_*.parquet"),
        ],
        "depth_1": [
            read_files(TRAIN_DIR / "train_applprev_1_*.parquet", 1),
            read_file(TRAIN_DIR / "train_tax_registry_a_1.parquet", 1),
            read_file(TRAIN_DIR / "train_tax_registry_b_1.parquet", 1),
            read_file(TRAIN_DIR / "train_tax_registry_c_1.parquet", 1),
            read_file(TRAIN_DIR / "train_credit_bureau_b_1.parquet", 1),
            read_file(TRAIN_DIR / "train_other_1.parquet", 1),
            read_file(TRAIN_DIR / "train_person_1.parquet", 1),
            read_file(TRAIN_DIR / "train_deposit_1.parquet", 1),
            read_file(TRAIN_DIR / "train_debitcard_1.parquet", 1),
        ],
        "depth_2": [
            read_file(TRAIN_DIR / "train_credit_bureau_b_2.parquet", 2),
        ]
    }
    # Feature engineering on the training data
    df_train = feature_eng(**data_store)
    # Save the training data
    df_train.write_parquet(SAVED_DF_TRAIN)
else:
    print("Reading the saved features training data ...")
    df_train = pl.read_parquet(SAVED_DF_TRAIN)

print("train data shape:\t", df_train.shape)

Reading the saved features training data ...
train data shape:	 (1526659, 376)


In [11]:
if RE_FEATURES_EXTRACTED or not SAVED_DF_TEST.exists():
    print("Create features test data ...")
    # Read the test data
    data_store = {
        "df_base": read_file(TEST_DIR / "test_base.parquet"),
        "depth_0": [
            read_file(TEST_DIR / "test_static_cb_0.parquet"),
            read_files(TEST_DIR / "test_static_0_*.parquet"),
        ],
        "depth_1": [
            read_files(TEST_DIR / "test_applprev_1_*.parquet", 1),
            read_file(TEST_DIR / "test_tax_registry_a_1.parquet", 1),
            read_file(TEST_DIR / "test_tax_registry_b_1.parquet", 1),
            read_file(TEST_DIR / "test_tax_registry_c_1.parquet", 1),
            read_file(TEST_DIR / "test_credit_bureau_b_1.parquet", 1),
            read_file(TEST_DIR / "test_other_1.parquet", 1),
            read_file(TEST_DIR / "test_person_1.parquet", 1),
            read_file(TEST_DIR / "test_deposit_1.parquet", 1),
            read_file(TEST_DIR / "test_debitcard_1.parquet", 1),
        ],
        "depth_2": [
            read_file(TEST_DIR / "test_credit_bureau_b_2.parquet", 2),
        ]
    }
    
    # Feature engineering on the test data
    df_test = feature_eng(**data_store)
    # Save the test data
    df_test.write_parquet(SAVED_DF_TEST)
else:
    print("Reading the saved features test data ...")
    df_test = pl.read_parquet(SAVED_DF_TEST)
    
print("test data shape:\t", df_test.shape)

Reading the saved features test data ...
test data shape:	 (10, 375)


In [12]:
# Filter columns
df_train = df_train.pipe(Pipeline.filter_cols)
df_test = df_test.select([col for col in df_train.columns if col != "target"])

print("train data shape:\t", df_train.shape)
print("test data shape:\t", df_test.shape)

train data shape:	 (1526659, 267)
test data shape:	 (10, 266)


In [13]:
# convert to pandas
df_train, cat_cols = to_pandas(df_train)
df_test, cat_cols = to_pandas(df_test, cat_cols)

In [17]:
# Garbage collection to free up memory
if 'data_store' in globals():
    del data_store
gc.collect()

1712

In [18]:
# Create a custom VotingModel class, API similar to sklearn
class VotingModel(BaseEstimator, RegressorMixin):
    def __init__(self, estimators):
        super().__init__()
        self.estimators = estimators

    def fit(self, X, y=None):
        return self

    def predict(self, X):
        # Average the predictions of all the estimators
        y_preds = [estimator.predict(X) for estimator in self.estimators]
        return np.mean(y_preds, axis=0)

    def predict_proba(self, X):
        # Average the predicted probabilities of all the estimators
        y_preds = [estimator.predict_proba(X) for estimator in self.estimators]
        return np.mean(y_preds, axis=0)

In [23]:
# Define the features and target variable
X = df_train.drop(columns=["target", "case_id","WEEK_NUM"])
y = df_train["target"]
weeks = df_train["WEEK_NUM"]

# Define the cross-validation strategy
cv = StratifiedGroupKFold(n_splits=5, shuffle=False)

# Define the LightGBM parameters
params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 10,
    "learning_rate": 0.05,
    "max_bin": 255,
    "n_estimators": 1200,
    "colsample_bytree": 0.8,
    "colsample_bynode": 0.8,
    "verbose": -1,
    "random_state": 42,
    "reg_alpha": 0.1,
    "reg_lambda": 10,
    "extra_trees":True,
    'num_leaves':64,
    "device": "gpu",
}

fitted_models = []
cv_scores = []

# Cross-validation loop to train the model
for idx_train, idx_valid in cv.split(X, y, groups=weeks):
    X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
    X_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

    print("Train week range: ", weeks.iloc[idx_train].unique())
    print("Valid week range: ", weeks.iloc[idx_valid].unique())
    
    print("Train shape: ", X_train.shape)
    print("Valid shape: ", X_valid.shape)
    # 
    # model = lgb.LGBMClassifier(**params)
    # model.fit(
    #     X_train, y_train,
    #     eval_set=[(X_valid, y_valid)],
    #     callbacks=[lgb.log_evaluation(50), lgb.early_stopping(50)]
    # )
    # 
    # fitted_models.append(model)
    # 
    # y_pred_valid = model.predict_proba(X_valid)[:, 1]
    # auc_score = roc_auc_score(y_valid, y_pred_valid)
    # cv_scores.append(auc_score)

# model = VotingModel(fitted_models)
# print("CV AUC scores: ", cv_scores)
# print("Average CV AUC score: ", sum(cv_scores) / len(cv_scores))

Train week range:  [ 0  1  2  4  5  6  7  8 10 11 12 13 15 17 18 19 20 21 22 23 24 26 27 29
 28 30 31 32 33 34 35 36 37 38 39 41 42 44 45 46 47 48 49 50 55 56 57 58
 60 61 62 65 66 69 70 72 73 74 75 76 77 78 80 81 83 84 85 86 87 88 89 91]
Valid week range:  [ 3  9 14 16 25 40 43 51 52 53 54 59 63 64 67 68 71 79 82 90]
Train shape:  (1221056, 264)
Valid shape:  (305603, 264)
Train week range:  [ 1  2  3  4  5  8  9 10 11 12 13 14 16 17 19 21 22 23 24 25 26 27 29 28
 30 31 32 34 37 40 41 42 43 44 45 46 48 49 50 51 52 53 54 55 56 57 58 59
 61 62 63 64 65 66 67 68 70 71 72 73 74 75 76 77 79 80 82 83 84 85 86 88
 90 91]
Valid week range:  [ 0  6  7 15 18 20 33 35 36 38 39 47 60 69 78 81 87 89]
Train shape:  (1221238, 264)
Valid shape:  (305421, 264)
Train week range:  [ 0  1  3  4  5  6  7  8  9 10 11 13 14 15 16 17 18 19 20 22 23 24 25 26
 27 29 31 33 35 36 37 38 39 40 41 43 44 45 46 47 50 51 52 53 54 55 57 58
 59 60 62 63 64 65 66 67 68 69 71 72 74 75 77 78 79 81 82 83 84 85 87 88
 89 90]

In [20]:
weeks

0           0
1           0
2           0
3           0
4           0
           ..
1526654    91
1526655    91
1526656    91
1526657    91
1526658    91
Name: WEEK_NUM, Length: 1526659, dtype: int64

In [17]:
X_test = df_test.drop(columns=["WEEK_NUM"])
X_test = X_test.set_index("case_id")

lgb_pred = pd.Series(model.predict_proba(X_test)[:, 1], index=X_test.index)

#### Submission

In [18]:
df_subm = pd.read_csv(ROOT / "sample_submission.csv")
df_subm = df_subm.set_index("case_id")

df_subm["score"] = lgb_pred

In [19]:
print("Check null: ", df_subm["score"].isnull().any())

Check null:  False


In [20]:
df_subm.head()

Unnamed: 0_level_0,score
case_id,Unnamed: 1_level_1
57543,0.006857
57549,0.028016
57551,0.006491
57552,0.006775
57569,0.079827


In [21]:
df_subm.to_csv("submission.csv")