# Example Notebook

Welcome to the example notebook for the Home Credit Kaggle competition. The goal of this competition is to determine how likely a customer is to default on an issued loan. The main difference between the [first](https://www.kaggle.com/c/home-credit-default-risk) competition and this one is that your submission will now be scored with a custom metric that takes into account how well the model performs in the future. A decline in performance will be penalized. The goal is to create a model that is stable and performs well in the future.

In this notebook you will see how to:
* Load the data
* Join tables with Polars - a DataFrame library implemented in the Rust language, designed to be blazingly fast and memory efficient.
* Create simple aggregation features
* Train a model (this version of the notebook trains a Keras neural network; LightGBM is imported but not used)
* Create a submission table

## Load the data

In [17]:
import polars as pl
import numpy as np
import pandas as pd
import lightgbm as lgb
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder,LabelEncoder
from sklearn.metrics import roc_auc_score 
from sklearn.utils import shuffle
import warnings
from sklearn.model_selection import GroupKFold, StratifiedGroupKFold
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_extraction import FeatureHasher
from tensorflow import keras
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.datasets import cifar10
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras import optimizers
from tensorflow.keras import models
from tensorflow.keras import layers
from keras.src.layers import RNN
from tensorflow.keras import optimizers
import matplotlib.pyplot as plt
from keras.callbacks import EarlyStopping


dataPath = "/kaggle/input/home-credit-credit-risk-model-stability/"

In [2]:
def set_table_dtypes(df: pl.DataFrame) -> pl.DataFrame:
    """Cast every column whose name ends in "P" or "A" to ``pl.Float64``.

    This is only an example rule; add any other dtype conversions the tables
    need here. The last letter of a column name is what hints at its type.
    """
    float_casts = [
        pl.col(name).cast(pl.Float64).alias(name)
        for name in df.columns
        if name[-1] in ("P", "A")
    ]
    # Nothing to cast: hand the frame back untouched.
    return df.with_columns(float_casts) if float_casts else df

def convert_strings(df: pd.DataFrame) -> pd.DataFrame:
    """Turn every object/string column into an ordered categorical, in place.

    Each converted column keeps its observed categories and gets one extra
    category, "Unknown", appended at the end. ``df`` itself is modified and
    also returned.
    """
    text_columns = [
        name for name in df.columns if df[name].dtype.name in ("object", "string")
    ]
    for name in text_columns:
        as_category = df[name].astype("string").astype("category")
        # Extend the observed categories with a trailing "Unknown" bucket.
        levels = [*as_category.cat.categories, "Unknown"]
        df[name] = as_category.astype(
            pd.CategoricalDtype(categories=levels, ordered=True)
        )
    return df

In [3]:
# Training tables. The static table is split over two CSV files, which are
# stacked vertically after the example dtype casts have been applied.
train_basetable = pl.read_csv(dataPath + "csv_files/train/train_base.csv")
train_static = pl.concat(
    [
        set_table_dtypes(pl.read_csv(dataPath + path))
        for path in (
            "csv_files/train/train_static_0_0.csv",
            "csv_files/train/train_static_0_1.csv",
        )
    ],
    how="vertical_relaxed",
)
train_static_cb = set_table_dtypes(pl.read_csv(dataPath + "csv_files/train/train_static_cb_0.csv"))
train_person_1 = set_table_dtypes(pl.read_csv(dataPath + "csv_files/train/train_person_1.csv"))
train_credit_bureau_b_2 = set_table_dtypes(pl.read_csv(dataPath + "csv_files/train/train_credit_bureau_b_2.csv"))

In [4]:
# Test tables, loaded the same way as the training tables. Here the static
# table is split over three CSV files.
test_basetable = pl.read_csv(dataPath + "csv_files/test/test_base.csv")
test_static = pl.concat(
    [
        set_table_dtypes(pl.read_csv(dataPath + path))
        for path in (
            "csv_files/test/test_static_0_0.csv",
            "csv_files/test/test_static_0_1.csv",
            "csv_files/test/test_static_0_2.csv",
        )
    ],
    how="vertical_relaxed",
)
test_static_cb = set_table_dtypes(pl.read_csv(dataPath + "csv_files/test/test_static_cb_0.csv"))
test_person_1 = set_table_dtypes(pl.read_csv(dataPath + "csv_files/test/test_person_1.csv"))
test_credit_bureau_b_2 = set_table_dtypes(pl.read_csv(dataPath + "csv_files/test/test_credit_bureau_b_2.csv"))

## Feature engineering

In this part, we can see a simple example of joining tables via `case_id`. Here the loading and joining are done with the Polars library, which is blazingly fast and has a much smaller memory footprint than pandas.

In [5]:
# Tables with depth > 1 (those that contain a num_group1 column, and possibly
# a num_group2 column as well) hold several rows per case_id, so they have to
# be aggregated down to one row per case before joining.
train_person_1_feats_1 = train_person_1.group_by("case_id").agg(
    pl.col("mainoccupationinc_384A").max().alias("mainoccupationinc_384A_max"),
    (pl.col("incometype_1044T") == "SELFEMPLOYED").max().alias("mainoccupationinc_384A_any_selfemployed"),
)

# num_group1 == 0 has a special meaning: it is the person who applied for the
# loan. Keep only that person's house type.
train_person_1_feats_2 = train_person_1.filter(pl.col("num_group1") == 0).select(
    pl.col("case_id"),
    pl.col("housetype_905L").alias("person_housetype"),
)

# This table has both num_group1 and num_group2, so it is aggregated as well.
train_credit_bureau_b_2_feats = train_credit_bureau_b_2.group_by("case_id").agg(
    pl.col("pmts_pmtsoverdue_635A").max().alias("pmts_pmtsoverdue_635A_max"),
    (pl.col("pmts_dpdvalue_108P") > 31).max().alias("pmts_dpdvalue_108P_over31"),
)

# Only A-type and M-type columns are processed in this example, so select them.
selected_static_cols = [col for col in train_static.columns if col[-1] in ("A", "M")]
print(selected_static_cols)

selected_static_cb_cols = [col for col in train_static_cb.columns if col[-1] in ("A", "M")]
print(selected_static_cb_cols)

# Left-join everything onto the base table so that every case is kept.
data = (
    train_basetable
    .join(train_static.select(["case_id", *selected_static_cols]), on="case_id", how="left")
    .join(train_static_cb.select(["case_id", *selected_static_cb_cols]), on="case_id", how="left")
    .join(train_person_1_feats_1, on="case_id", how="left")
    .join(train_person_1_feats_2, on="case_id", how="left")
    .join(train_credit_bureau_b_2_feats, on="case_id", how="left")
)

['amtinstpaidbefduel24m_4187115A', 'annuity_780A', 'annuitynextmonth_57A', 'avginstallast24m_3658937A', 'avglnamtstart24m_4525187A', 'avgoutstandbalancel6m_4187114A', 'avgpmtlast12m_4525200A', 'credamount_770A', 'currdebt_22A', 'currdebtcredtyperange_828A', 'disbursedcredamount_1113A', 'downpmt_116A', 'inittransactionamount_650A', 'lastapprcommoditycat_1041M', 'lastapprcommoditytypec_5251766M', 'lastapprcredamount_781A', 'lastcancelreason_561M', 'lastotherinc_902A', 'lastotherlnsexpense_631A', 'lastrejectcommoditycat_161M', 'lastrejectcommodtypec_5251769M', 'lastrejectcredamount_222A', 'lastrejectreason_759M', 'lastrejectreasonclient_4145040M', 'maininc_215A', 'maxannuity_159A', 'maxannuity_4075009A', 'maxdebt4_972A', 'maxinstallast24m_3658928A', 'maxlnamtstart6m_4525199A', 'maxoutstandbalancel12m_4187113A', 'maxpmtlast3m_4525190A', 'previouscontdistrict_112M', 'price_1097A', 'sumoutstandtotal_3546847A', 'sumoutstandtotalest_4493215A', 'totaldebt_9A', 'totalsettled_863A', 'totinstallas

In [6]:
# Same feature construction as for the training tables, applied to the test
# tables. The static column selections computed from the training tables are
# reused so both joined frames end up with the same feature columns.
test_person_1_feats_1 = test_person_1.group_by("case_id").agg(
    pl.col("mainoccupationinc_384A").max().alias("mainoccupationinc_384A_max"),
    (pl.col("incometype_1044T") == "SELFEMPLOYED").max().alias("mainoccupationinc_384A_any_selfemployed"),
)

# num_group1 == 0 is the loan applicant; keep only that person's house type.
test_person_1_feats_2 = test_person_1.filter(pl.col("num_group1") == 0).select(
    pl.col("case_id"),
    pl.col("housetype_905L").alias("person_housetype"),
)

test_credit_bureau_b_2_feats = test_credit_bureau_b_2.group_by("case_id").agg(
    pl.col("pmts_pmtsoverdue_635A").max().alias("pmts_pmtsoverdue_635A_max"),
    (pl.col("pmts_dpdvalue_108P") > 31).max().alias("pmts_dpdvalue_108P_over31"),
)

data_submission = (
    test_basetable
    .join(test_static.select(["case_id", *selected_static_cols]), on="case_id", how="left")
    .join(test_static_cb.select(["case_id", *selected_static_cb_cols]), on="case_id", how="left")
    .join(test_person_1_feats_1, on="case_id", how="left")
    .join(test_person_1_feats_2, on="case_id", how="left")
    .join(test_credit_bureau_b_2_feats, on="case_id", how="left")
)

In [7]:
# Display the joined submission table (rendered as the cell output below).
data_submission

case_id,date_decision,MONTH,WEEK_NUM,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,avginstallast24m_3658937A,avglnamtstart24m_4525187A,avgoutstandbalancel6m_4187114A,avgpmtlast12m_4525200A,credamount_770A,currdebt_22A,currdebtcredtyperange_828A,disbursedcredamount_1113A,downpmt_116A,inittransactionamount_650A,lastapprcommoditycat_1041M,lastapprcommoditytypec_5251766M,lastapprcredamount_781A,lastcancelreason_561M,lastotherinc_902A,lastotherlnsexpense_631A,lastrejectcommoditycat_161M,lastrejectcommodtypec_5251769M,lastrejectcredamount_222A,lastrejectreason_759M,lastrejectreasonclient_4145040M,maininc_215A,maxannuity_159A,maxannuity_4075009A,maxdebt4_972A,maxinstallast24m_3658928A,maxlnamtstart6m_4525199A,maxoutstandbalancel12m_4187113A,maxpmtlast3m_4525190A,previouscontdistrict_112M,price_1097A,sumoutstandtotal_3546847A,sumoutstandtotalest_4493215A,totaldebt_9A,totalsettled_863A,totinstallast1m_4525188A,description_5085714M,education_1103M,education_88M,maritalst_385M,maritalst_893M,pmtaverage_3A,pmtaverage_4527227A,pmtaverage_4955615A,pmtssum_45A,mainoccupationinc_384A_max,mainoccupationinc_384A_any_selfemployed,person_housetype,pmts_pmtsoverdue_635A_max,pmts_dpdvalue_108P_over31
i64,str,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,f64,str,f64,f64,str,str,f64,str,str,f64,f64,f64,f64,f64,f64,f64,f64,str,f64,f64,f64,f64,f64,f64,str,str,str,str,str,f64,f64,f64,f64,f64,bool,str,f64,bool
57543,"""2021-05-14""",202201,100,191767.36,3674.6,1218.2001,16049.4,17054.4,14554.4,24482.0,20000.0,12154.4,0.0,20000.0,0.0,,"""a55475b1""","""a55475b1""",14000.0,"""a55475b1""",,,"""P109_133_183""","""P49_111_165""",24000.0,"""a55475b1""","""a55475b1""",34000.0,280983.56,,231440.03,131700.8,16672.6,157731.78,16641.4,"""a55475b1""",0.0,12154.4,12154.4,12154.4,456031.1,17859.6,"""2fc785b2""","""6b2ae0fa""","""a55475b1""","""38c061ee""","""a55475b1""",,,,,34000.0,False,,,
57549,"""2022-01-17""",202201,100,129704.4,5742.6,3546.6,32426.201,118964.805,13681.714,32426.201,75000.0,10638.2,10638.2,75000.0,0.0,,"""a55475b1""","""a55475b1""",94000.0,"""a55475b1""",,,"""a55475b1""","""a55475b1""",160000.0,"""a55475b1""","""P30_86_84""",44000.0,337659.8,,34066.0,122511.4,31820.6,21278.0,122511.4,"""a55475b1""",,10638.2,10638.2,10638.2,373720.84,126058.0,"""2fc785b2""","""39a0853f""","""a55475b1""","""a7fcb6e5""","""a55475b1""",,,26815.6,,49800.0,False,,,
57551,"""2020-11-27""",202201,100,71036.4,2844.6,0.0,8357.2,,0.0,9551.0,27095.201,0.0,0.0,27095.201,0.0,,"""a55475b1""","""a55475b1""",200000.0,"""P85_114_140""",,,"""a55475b1""","""a55475b1""",,"""a55475b1""","""a55475b1""",70000.0,83400.0,,54000.0,41783.402,54000.0,62619.0,,"""P11_36_178""",27095.201,0.0,0.0,0.0,75219.0,,"""2fc785b2""","""6b2ae0fa""","""a55475b1""","""3439d993""","""a55475b1""",,,,,59600.0,False,,,
57552,"""2020-11-27""",202201,100,183992.0,6298.8003,12155.4,7440.4,,199322.4,9148.4,100000.0,191269.61,191269.61,100000.0,0.0,,"""a55475b1""","""a55475b1""",0.0,"""P94_109_143""",,,"""a55475b1""","""a55475b1""",150000.0,"""a55475b1""","""P94_109_143""",,110500.0,,188126.14,12155.4,104473.6,288642.6,12155.4,"""P21_87_50""",,191269.61,191269.61,191269.61,284213.0,18889.0,"""2fc785b2""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,,23402.8,,112000.0,False,"""OWNED""",,
57569,"""2021-12-20""",202201,100,0.0,4682.6,0.0,,,,10796.4,60000.0,0.0,0.0,60000.0,0.0,,"""a55475b1""","""a55475b1""",20000.0,"""P94_109_143""",,,"""a55475b1""","""a55475b1""",40000.0,"""a55475b1""","""P94_109_143""",6000.0,37704.0,,64555.668,,,0.0,,"""a55475b1""",,0.0,0.0,0.0,95348.42,,"""2fc785b2""","""717ddd49""","""a55475b1""","""3439d993""","""a55475b1""",,,17333.6,,58000.0,False,"""OWNED""",,
57630,"""2021-03-16""",202201,100,0.0,8905.0,0.0,,,,,96174.0,0.0,0.0,96174.0,0.0,,"""P148_110_5""","""P161_88_182""",8876.0,"""P198_89_166""",,,"""a55475b1""","""a55475b1""",,"""a55475b1""","""a55475b1""",12000.0,1382.8,,0.0,,,,,"""a55475b1""",96174.0,0.0,0.0,0.0,9677.601,,"""2fc785b2""","""6b2ae0fa""","""a55475b1""","""3439d993""","""a55475b1""",,,,,60000.0,False,"""OWNED""",,
57631,"""2022-06-04""",202201,100,,2540.6,0.0,,,,,24920.0,0.0,0.0,24920.0,0.0,,"""a55475b1""","""a55475b1""",,"""P94_109_143""",,,"""P100_96_175""","""P165_57_169""",46279.8,"""P45_84_106""","""P94_109_143""",,0.0,,0.0,,,,,"""a55475b1""",24920.0,,,0.0,0.0,,"""2fc785b2""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,,,,,,,,
57632,"""2022-02-05""",202201,100,63647.402,4732.0,0.0,3536.0,,10581.714,3536.0,25998.0,0.0,0.0,25998.0,0.0,,"""P53_45_92""","""P200_75_140""",50116.0,"""a55475b1""",,,"""a55475b1""","""a55475b1""",,"""a55475b1""","""a55475b1""",56000.0,7000.0,,63647.402,3536.0,63647.402,42412.0,3536.0,"""P159_143_123""",25998.0,0.0,0.0,0.0,63652.0,7071.4,"""2fc785b2""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,,15841.2,,,,,,
57633,"""2022-01-25""",202201,100,,8273.0,0.0,,,,,200000.0,0.0,0.0,200000.0,0.0,,"""a55475b1""","""a55475b1""",,"""P85_114_140""",,,"""P159_130_59""","""P75_90_70""",64996.0,"""P45_84_106""","""P94_109_143""",,0.0,,0.0,,,,,"""a55475b1""",0.0,,,0.0,0.0,,"""2fc785b2""","""a55475b1""","""a55475b1""","""3439d993""","""a55475b1""",,,,,,,,,
57634,"""2021-01-27""",202201,100,39948.8,1165.8,0.0,3994.8,,1675.4,3358.4001,12108.2,0.0,0.0,12108.2,0.0,,"""P159_130_59""","""P174_113_42""",16494.201,"""a55475b1""",,,"""a55475b1""","""a55475b1""",,"""a55475b1""","""a55475b1""",50000.0,5000.0,,19798.0,4949.6,20887.201,20150.8,,"""a55475b1""",13998.0,0.0,0.0,0.0,39950.8,,"""2fc785b2""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,,,,,,,,


In [8]:
# Split on unique case ids: 60 % go to train, and the remaining 40 % are
# halved into validation and test.
case_ids = data["case_id"].unique().shuffle(seed=1)
case_ids_train, case_ids_test = train_test_split(case_ids, train_size=0.6, random_state=1)
case_ids_valid, case_ids_test = train_test_split(case_ids_test, train_size=0.5, random_state=1)

# Predictor columns: the last character is upper-case and everything before
# it passes str.islower().
cols_pred = [col for col in data.columns if col[-1].isupper() and col[:-1].islower()]

print(cols_pred)

def from_polars_to_pandas(case_ids: "pl.Series") -> "tuple[pd.DataFrame, pd.DataFrame, pd.Series]":
    """Select the rows of the global ``data`` table for ``case_ids`` as pandas objects.

    case_ids: the case ids to keep; anything ``pl.Expr.is_in`` accepts.

    Returns a 3-tuple:
      * base frame with the columns "case_id", "WEEK_NUM" and "target",
      * predictor frame with the columns listed in the global ``cols_pred``,
      * the "target" column on its own.
    """
    # Filter once and slice the result three times instead of re-running the
    # same filter for every returned piece.
    subset = data.filter(pl.col("case_id").is_in(case_ids))
    return (
        subset[["case_id", "WEEK_NUM", "target"]].to_pandas(),
        subset[cols_pred].to_pandas(),
        subset["target"].to_pandas(),
    )

# Materialise the three splits as pandas objects.
base_train, X_train, y_train = from_polars_to_pandas(case_ids_train)
base_valid, X_valid, y_valid = from_polars_to_pandas(case_ids_valid)
base_test, X_test, y_test = from_polars_to_pandas(case_ids_test)

# convert_strings assigns the converted columns back onto the frame it is
# given, so X_train / X_valid / X_test are updated through this loop.
for df in (X_train, X_valid, X_test):
    df = convert_strings(df)

['amtinstpaidbefduel24m_4187115A', 'annuity_780A', 'annuitynextmonth_57A', 'avginstallast24m_3658937A', 'avglnamtstart24m_4525187A', 'avgoutstandbalancel6m_4187114A', 'avgpmtlast12m_4525200A', 'credamount_770A', 'currdebt_22A', 'currdebtcredtyperange_828A', 'disbursedcredamount_1113A', 'downpmt_116A', 'inittransactionamount_650A', 'lastapprcommoditycat_1041M', 'lastapprcommoditytypec_5251766M', 'lastapprcredamount_781A', 'lastcancelreason_561M', 'lastotherinc_902A', 'lastotherlnsexpense_631A', 'lastrejectcommoditycat_161M', 'lastrejectcommodtypec_5251769M', 'lastrejectcredamount_222A', 'lastrejectreason_759M', 'lastrejectreasonclient_4145040M', 'maininc_215A', 'maxannuity_159A', 'maxannuity_4075009A', 'maxdebt4_972A', 'maxinstallast24m_3658928A', 'maxlnamtstart6m_4525199A', 'maxoutstandbalancel12m_4187113A', 'maxpmtlast3m_4525190A', 'previouscontdistrict_112M', 'price_1097A', 'sumoutstandtotal_3546847A', 'sumoutstandtotalest_4493215A', 'totaldebt_9A', 'totalsettled_863A', 'totinstallas

In [9]:
# Report (rows, columns) of each predictor frame, one per line.
print(
    f"Train: {X_train.shape}",
    f"Valid: {X_valid.shape}",
    f"Test: {X_test.shape}",
    sep="\n",
)

Train: (915995, 48)
Valid: (305332, 48)
Test: (305332, 48)


In [10]:
def gen_synthetic(x_data: pd.DataFrame, n: int, y_data: pd.Series, syn_type: int):
    """
    Generate ``n`` synthetic rows for one class by sampling every column
    independently from its observed values.

    x_data: feature frame (e.g. X_train)
    n: number of entries to generate
    y_data: labels for the rows of x_data, in the same order (e.g. y_train)
    syn_type: class to generate, either 0 or 1

    Only rows whose label equals ``syn_type`` are used. For each column,
    values are drawn with replacement with probability equal to their relative
    frequency within that class (missing values are ignored). This is the same
    distribution ``marginals`` computes. Because columns are drawn
    independently, relationships between columns are not preserved.

    Returns ``(syn_x, syn_y)``: a DataFrame with ``n`` rows and the columns of
    ``x_data``, and a Series of ``n`` labels, all equal to ``syn_type``.

    Raises ValueError if ``x_data`` and ``y_data`` differ in length. numpy
    also raises ValueError when a column has no observed value for the
    requested class while ``n`` > 0.
    """
    labels = np.asarray(y_data)
    if len(labels) != len(x_data):
        raise ValueError(
            f"x_data has {len(x_data)} rows but y_data has {len(labels)} labels"
        )

    # Select the class with a boolean mask rather than a temporary "target"
    # column, which would clash with a feature of that name.
    class_rows = x_data[labels == syn_type]

    syn_data = {}
    for col in class_rows.columns:
        # Relative frequency of each observed value (NaN excluded).
        frequencies = class_rows[col].value_counts(normalize=True)
        syn_data[col] = np.random.choice(
            frequencies.index.to_list(), size=n, p=frequencies.to_numpy()
        )

    syn_x = pd.DataFrame(syn_data)
    # Label the synthetic rows with the class they were sampled from.
    syn_y = pd.Series([syn_type] * n)
    return syn_x, syn_y

def marginals(df: pd.DataFrame, col: str) -> dict:
    """
    Map each observed value of ``df[col]`` to its relative frequency.

    Missing values are not counted. Keys follow the order of
    ``value_counts`` (most frequent first) and the probabilities sum to 1.
    An empty column gives an empty dict.
    """
    # Count once; to_dict() yields plain Python ints keyed by value.
    counts = df[col].value_counts().to_dict()
    total = sum(counts.values())
    return {label: count / total for label, count in counts.items()}

def _encode_and_scale(frame: pd.DataFrame, categorical_cols: list) -> pd.DataFrame:
    """Return a copy of ``frame`` with float32 floats, ordinal-encoded categoricals and min-max scaled columns."""
    float64_cols = list(frame.select_dtypes(include='float64'))
    # astype returns a new frame, so the caller's data is never modified.
    scaled = frame.astype({col: 'float32' for col in float64_cols})

    # Nothing to encode for all-numeric input; skipping also avoids relying on
    # how OrdinalEncoder treats an empty column selection.
    if categorical_cols:
        encoder = OrdinalEncoder()
        scaled[categorical_cols] = encoder.fit_transform(scaled[categorical_cols])

    for column in scaled.columns:
        low = scaled[column].min()
        span = scaled[column].max() - low
        # A constant column has zero range; divide by 1 so it maps to 0
        # instead of NaN (0 / 0).
        scaled[column] = (scaled[column] - low) / (span if span != 0 else 1)

    return scaled


def preprocess_data(x: pd.DataFrame, synthetic: bool, y: "pd.Series | None" = None, synth_class: int = 1) -> "pd.DataFrame | tuple[pd.DataFrame, pd.Series]":
    '''
    executes preprocessing as one function for ease of use

    Steps: fill missing categorical values with the column's most frequent
    value, fill missing numeric values with the column mean, optionally
    append synthetic rows and shuffle, then convert float64 to float32,
    ordinal-encode the categorical columns and min-max scale every column to
    [0, 1]. The encoder and the min/max are fitted on the data passed to this
    call only. The input frame is not modified.

    @param
    x: data to preprocess
    y: target data for synthetic; required when synthetic is True
    synthetic: whether or not to enhance data
    synth_class: which class to enhance

    @return
    synthetic False: the preprocessed frame.
    synthetic True: (preprocessed frame, targets) where len(x) synthetic rows
    from gen_synthetic have been added and the rows shuffled.

    @raise
    ValueError: synthetic is True but no y was given.
    '''
    # Every dtype except these three is treated as categorical.
    # NOTE(review): int64 and bool columns therefore get ordinal-encoded as
    # well; confirm that is intended before feeding such columns in.
    numeric_dtype_names = ("float32", "float64", "int32")
    categorical_cols = [
        col
        for col, dtype in x.dtypes.items()
        if not any(dtype == name for name in numeric_dtype_names)
    ]

    # FILL CATEGORICAL FEATURE NAN VALUES with the most frequent value.
    for col in categorical_cols:
        observed = x[col].value_counts()
        # A column without any observed value has nothing to fill with.
        if len(observed) > 0:
            x = x.assign(**{col: x[col].fillna(observed.index[0])})

    # Fill numeric NaN values with the column mean.
    categorical_set = set(categorical_cols)
    for col in x.columns:
        if col not in categorical_set and x[col].isna().any():
            x = x.assign(**{col: x[col].fillna(x[col].mean())})

    if not synthetic:
        return _encode_and_scale(x, categorical_cols)

    if y is None:
        raise ValueError("y is required when synthetic=True")

    syn_x, syn_y = gen_synthetic(x_data=x, n=len(x), y_data=y, syn_type=synth_class)

    # Merge and shuffle data. The targets are attached by position
    # (to_numpy) so the result does not depend on how the indexes of x and y
    # happen to line up.
    syn_X_train = pd.concat([syn_x, x])
    syn_X_train['target'] = pd.concat([syn_y, y]).to_numpy()
    syn_X_train = syn_X_train.sample(frac=1)

    syn_y_train = syn_X_train['target']
    syn_X_train = syn_X_train.drop('target', axis=1)

    return _encode_and_scale(syn_X_train, categorical_cols), syn_y_train



In [12]:
# Display the training predictor frame (rendered as the cell output below).
X_train

Unnamed: 0,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,avginstallast24m_3658937A,avglnamtstart24m_4525187A,avgoutstandbalancel6m_4187114A,avgpmtlast12m_4525200A,credamount_770A,currdebt_22A,currdebtcredtyperange_828A,...,totinstallast1m_4525188A,description_5085714M,education_1103M,education_88M,maritalst_385M,maritalst_893M,pmtaverage_3A,pmtaverage_4527227A,pmtaverage_4955615A,pmtssum_45A
0,,1917.6000,0.0,,,,,30000.0,0.0,0.0,...,,,,,,,,,,
1,,4937.0000,0.0,,,,,78000.0,0.0,0.0,...,,,,,,,,,,
2,,3600.0000,0.0,,,,,60000.0,0.0,0.0,...,,,,,,,,,,
3,,3110.8000,0.0,,,,,20000.0,0.0,0.0,...,,,,,,,,,,
4,,1218.0000,0.0,,,,,20300.0,0.0,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915990,104557.25,8218.0000,5293.2,8027.2000,,23399.828,5293.2,132000.0,10548.4,10548.4,...,10586.400,2fc785b2,a55475b1,a55475b1,a55475b1,a55475b1,,,,
915991,176561.36,3675.4001,0.0,7356.8003,,16392.496,6750.2,30000.0,0.0,0.0,...,14346.319,2fc785b2,a55475b1,a55475b1,a55475b1,a55475b1,,,12155.0,
915992,14232.40,7788.8003,0.0,2662.4001,,,1500.6,60000.0,0.0,0.0,...,,2fc785b2,a55475b1,a55475b1,a55475b1,a55475b1,,,,
915993,197371.58,1195.4000,2827.2,8212.6010,,47943.062,9921.2,6000.0,46806.6,46806.6,...,5654.400,2fc785b2,6b2ae0fa,a55475b1,3439d993,a55475b1,,,15792.4,


In [18]:
# Preprocess the first half of the training and validation rows, adding
# synthetic rows to each. Note that y_train / y_valid are rebound here to the
# shuffled, augmented targets returned by preprocess_data.
x_train, y_train = preprocess_data(
    X_train.head(len(X_train) // 2),
    synthetic=True,
    y=y_train.head(len(y_train) // 2),
)
x_valid, y_valid = preprocess_data(
    X_valid.head(len(X_valid) // 2),
    synthetic=True,
    y=y_valid.head(len(y_valid) // 2),
)

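## Training the model

A minimal example of training a Keras neural network (Conv1D layers followed by a SimpleRNN and dense layers) is shown below. Note that, despite the imports at the top, LightGBM is not used in this notebook.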

In [29]:
from tensorflow.keras.models import Sequential
from tensorflow.keras import models
from tensorflow.keras import layers
from keras.src.layers import RNN
from tensorflow.keras import optimizers
import matplotlib.pyplot as plt
from keras.callbacks import EarlyStopping

# Stop training once the validation loss has not improved for 2 epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=2)

# NOTE(review): this schedule is built but never handed to an optimizer; the
# compile() call below passes the string 'rmsprop', so it has no effect on
# training as written.
lr_schedule = keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate = 0.00001,
    decay_steps=2000,
    decay_rate=0.97,
    staircase=True)

# Binary classifier: each sample is fed as a length-48 sequence with one
# channel (48 is the number of predictor columns printed earlier), passed
# through two stacks of Conv1D layers, a SimpleRNN, dropout and two dense
# layers ending in a single sigmoid output.
cnn = models.Sequential([
    layers.Conv1D(filters = 64, kernel_size = 17, activation='relu', padding = 'same', input_shape=(48, 1)),
    layers.Conv1D(filters = 64, kernel_size = 12, activation='relu', padding = 'same'),
    layers.Conv1D(filters = 128, kernel_size = 9, activation='relu', padding = 'same'),
    # NOTE(review): with a pool size of 1 the model summary shows the output
    # shape unchanged by this layer, (None, 48, 128).
    layers.MaxPooling1D(1, padding = 'same'),
    layers.Conv1D(filters = 64, kernel_size = 12, activation='relu', padding = 'same'),
    layers.Conv1D(filters = 128, kernel_size = 9, activation='relu', padding = 'same'),
    layers.Conv1D(filters = 164, kernel_size = 7, activation='relu', padding = 'same'),
    layers.MaxPooling1D(1, padding = 'same'),
    layers.SimpleRNN(128),
    layers.Dropout(.5),
    layers.Flatten(),
    layers.Dense(1024, activation='sigmoid'),
    layers.Dense(1, activation='sigmoid')
])

# Print the layer-by-layer overview (output shapes and parameter counts).
cnn.summary()


# See the https://www.tensorflow.org/tutorials/keras/classification tutorial for how to use compile().
# Binary cross-entropy matches the single sigmoid output; accuracy is reported as a metric.
cnn.compile(optimizer='rmsprop', loss = 'binary_crossentropy', metrics=['accuracy'])

# Start the training (at most 100 epochs, batches of 64, early stopping on the
# validation data) and keep the object returned by fit() in history_cnn.
history_cnn = cnn.fit(x_train, y_train, batch_size=64, epochs=100, validation_data=(x_valid, y_valid), callbacks=[early_stopping])

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_49 (Conv1D)          (None, 48, 64)            1152      
                                                                 
 conv1d_50 (Conv1D)          (None, 48, 64)            49216     
                                                                 
 conv1d_51 (Conv1D)          (None, 48, 128)           73856     
                                                                 
 max_pooling1d_18 (MaxPooli  (None, 48, 128)           0         
 ng1D)                                                           
                                                                 
 conv1d_52 (Conv1D)          (None, 48, 64)            98368     
                                                                 
 conv1d_53 (Conv1D)          (None, 48, 128)           73856     
                                                     

Evaluation with AUC and then comparison with the stability metric is shown below.

## Submission

Scoring the submission dataset is shown below; we need to take care of categories that do not appear in the training data. Saving the scores is the last step.

In [38]:
# Preprocess the held-out test predictors without adding synthetic rows.
x_test = preprocess_data(X_test, synthetic=False)

In [47]:
# Display the preprocessed test frame (rendered as the cell output below).
x_test

Unnamed: 0,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,avginstallast24m_3658937A,avglnamtstart24m_4525187A,avgoutstandbalancel6m_4187114A,avgpmtlast12m_4525200A,credamount_770A,currdebt_22A,currdebtcredtyperange_828A,...,totinstallast1m_4525188A,description_5085714M,education_1103M,education_88M,maritalst_385M,maritalst_893M,pmtaverage_3A,pmtaverage_4527227A,pmtaverage_4955615A,pmtssum_45A
0,0.053186,0.044113,0.000000,0.013558,0.091387,0.653990,0.016146,0.070234,0.000000,0.000000,...,0.020382,1.0,0.75,0.75,0.4,0.6,0.116202,0.086563,0.178848,0.033336
1,0.053186,0.126469,0.000000,0.013558,0.091387,0.653990,0.016146,0.163880,0.000000,0.000000,...,0.020382,1.0,0.75,0.75,0.4,0.6,0.116202,0.086563,0.178848,0.033336
2,0.053186,0.067032,0.000000,0.013558,0.091387,0.653990,0.016146,0.130435,0.000000,0.000000,...,0.020382,1.0,0.75,0.75,0.4,0.6,0.116202,0.086563,0.178848,0.033336
3,0.053186,0.033327,0.000000,0.013558,0.091387,0.653990,0.016146,0.043478,0.000000,0.000000,...,0.020382,1.0,0.75,0.75,0.4,0.6,0.116202,0.086563,0.178848,0.033336
4,0.053186,0.030914,0.000000,0.013558,0.091387,0.653990,0.016146,0.063545,0.000000,0.000000,...,0.020382,1.0,0.75,0.75,0.4,0.6,0.116202,0.086563,0.178848,0.033336
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305327,0.113168,0.054086,0.000000,0.014178,0.091387,0.645922,0.017197,0.063545,0.000000,0.000000,...,0.024344,0.0,0.50,0.75,0.0,0.6,0.116202,0.086563,0.161788,0.033336
305328,0.000000,0.062202,0.000000,0.013558,0.091387,0.653990,0.016146,0.096990,0.000000,0.000000,...,0.020382,0.0,0.25,0.00,0.0,0.2,0.116202,0.086563,0.178848,0.033336
305329,0.318786,0.093412,0.084279,0.033442,0.091387,0.674357,0.036373,0.163880,0.073793,0.101128,...,0.014115,0.0,0.50,0.75,0.0,0.6,0.116202,0.086563,0.206380,0.033336
305330,0.161059,0.065049,0.031735,0.018422,0.091387,0.643232,0.025083,0.096990,0.006415,0.000000,...,0.005315,0.0,0.50,0.75,0.0,0.6,0.116202,0.086563,0.178848,0.033336


In [39]:
# Predict on the preprocessed test frame; the result is only displayed as the
# cell output and is not stored in a variable.
cnn.predict(x_test)



array([[0.2834317 ],
       [0.9309298 ],
       [0.43703723],
       ...,
       [0.03853424],
       [0.7353629 ],
       [0.60622114]], dtype=float32)

In [34]:
X_submission = data_submission[cols_pred].to_pandas()
X_submission = convert_strings(X_submission)
categorical_cols = X_train.select_dtypes(include=['category']).columns

for col in categorical_cols:
    # Keep the training categories as the column's own Index so the category
    # order stays fixed. Building the dtype from a set would give the ordered
    # categorical an arbitrary order.
    train_categories = X_train[col].cat.categories
    submission_categories = set(X_submission[col].cat.categories)
    # Values never seen during training are mapped to the "Unknown" category.
    new_categories = submission_categories - set(train_categories)
    X_submission.loc[X_submission[col].isin(new_categories), col] = "Unknown"
    new_dtype = pd.CategoricalDtype(categories=train_categories, ordered=True)
    X_train[col] = X_train[col].astype(new_dtype)
    X_submission[col] = X_submission[col].astype(new_dtype)





In [51]:
# Preprocess the submission predictors without adding synthetic rows, then
# replace any NaN left in the result with 0.
x_submission = preprocess_data(X_submission, synthetic=False).fillna(0)


In [65]:
# Score the submission rows and keep the first (only) value of every
# prediction row as a flat list.
y_submission_pred = cnn.predict(x_submission)
submission_pred = [row[0] for row in y_submission_pred]
    



In [68]:
# One score per case, indexed by case_id, written to the submission file.
submission = pd.DataFrame(
    {"score": submission_pred},
    index=pd.Index(data_submission["case_id"].to_numpy(), name="case_id"),
)
submission.to_csv("./submission.csv")

Best of luck, and most importantly, enjoy the process of learning and discovery! 

<img src="https://i.imgur.com/obVWIBh.png" alt="Image" width="700"/>