# Example Notebook

Welcome to the example notebook for the Home Credit Kaggle competition. The goal of this competition is to determine how likely a customer is to default on an issued loan. The main difference between the [first](https://www.kaggle.com/c/home-credit-default-risk) competition and this one is that your submission will now be scored with a custom metric that takes into account how well the model performs in the future: a decline in performance over time is penalized. The goal is therefore to create a model that is stable and keeps performing well in the future.

In this notebook you will see how to:
* Load the data
* Join tables with Polars - a DataFrame library implemented in Rust, designed to be blazingly fast and memory efficient.  
* Create simple aggregation features
* Train a LightGBM model
* Create a submission table

## Load the data

In [24]:
import polars as pl
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import lightgbm as lgb
import gc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score 

# Root directory of the competition data; every CSV path below is built by
# appending a relative path to this prefix.
dataPath = "/kaggle/input/home-credit-credit-risk-model-stability/"

In [2]:
def set_table_dtypes(df: pl.DataFrame) -> pl.DataFrame:
    """Cast every column whose name ends in "P" or "A" to Float64.

    The last letter of a column name indicates its kind. Only the "P" and "A"
    suffixes are handled here as an example; all other columns are returned
    unchanged. Extend this function with any other desired dtypes.
    """
    float_casts = [
        pl.col(name).cast(pl.Float64)
        for name in df.columns
        if name[-1] in ("P", "A")
    ]
    if not float_casts:
        return df
    return df.with_columns(float_casts)

def convert_strings(df: pd.DataFrame) -> pd.DataFrame:
    """Turn every text column of ``df`` into an ordered categorical, in place.

    Each object/string column is cast to string, then to a categorical whose
    categories are the observed values followed by a spare "Unknown" category.
    The spare category is only appended when it is not already one of the
    observed values, because pandas rejects duplicate categories.

    Args:
        df: Frame to convert. It is modified in place.

    Returns:
        The same ``df`` object, to allow call chaining.
    """
    for col in df.columns:
        dtype = df[col].dtype
        # Match on the dtype class as well as the name: the pandas string
        # extension dtype is not always named "string".
        is_text = dtype.name in ("object", "string") or isinstance(dtype, pd.StringDtype)
        if not is_text:
            continue

        as_category = df[col].astype("string").astype("category")
        categories = as_category.cat.categories.to_list()
        if "Unknown" not in categories:
            categories.append("Unknown")
        df[col] = as_category.astype(
            pd.CategoricalDtype(categories=categories, ordered=True)
        )
    return df

In [3]:
# Load the training tables. Every table except the base table is passed
# through set_table_dtypes right after it is read.
train_basetable = pl.read_csv(dataPath + "csv_files/train/train_base.csv")

# The static table is stored in several part files that are stacked vertically.
train_static = pl.concat(
    [
        set_table_dtypes(pl.read_csv(dataPath + part_file))
        for part_file in (
            "csv_files/train/train_static_0_0.csv",
            "csv_files/train/train_static_0_1.csv",
        )
    ],
    how="vertical_relaxed",
)
train_static_cb = set_table_dtypes(
    pl.read_csv(dataPath + "csv_files/train/train_static_cb_0.csv")
)
train_person_1 = set_table_dtypes(
    pl.read_csv(dataPath + "csv_files/train/train_person_1.csv")
)
train_credit_bureau_b_2 = set_table_dtypes(
    pl.read_csv(dataPath + "csv_files/train/train_credit_bureau_b_2.csv")
)

In [4]:
# Load the test tables in the same way as the training tables.
test_basetable = pl.read_csv(dataPath + "csv_files/test/test_base.csv")

# The static table is stored in several part files that are stacked vertically.
test_static = pl.concat(
    [
        set_table_dtypes(pl.read_csv(dataPath + part_file))
        for part_file in (
            "csv_files/test/test_static_0_0.csv",
            "csv_files/test/test_static_0_1.csv",
            "csv_files/test/test_static_0_2.csv",
        )
    ],
    how="vertical_relaxed",
)
test_static_cb = set_table_dtypes(
    pl.read_csv(dataPath + "csv_files/test/test_static_cb_0.csv")
)
test_person_1 = set_table_dtypes(
    pl.read_csv(dataPath + "csv_files/test/test_person_1.csv")
)
test_credit_bureau_b_2 = set_table_dtypes(
    pl.read_csv(dataPath + "csv_files/test/test_credit_bureau_b_2.csv")
)

## Feature engineering

In this part, we show a simple example of joining tables via `case_id`. Both the loading and the joining are done with the Polars library, which is blazingly fast and has a much smaller memory footprint than pandas. 

In [5]:
# Tables with depth > 1 (those that contain a num_group1 column, and possibly
# a num_group2 column as well) must be aggregated to one row per case_id
# before they can be joined.
train_person_1_feats_1 = train_person_1.group_by("case_id").agg(
    [
        pl.col("mainoccupationinc_384A").max().alias("mainoccupationinc_384A_max"),
        (pl.col("incometype_1044T") == "SELFEMPLOYED").max().alias("mainoccupationinc_384A_any_selfemployed"),
    ]
)

# num_group1 == 0 has a special meaning: it is the person who applied for the loan.
train_person_1_feats_2 = (
    train_person_1.filter(pl.col("num_group1") == 0)
    .select(["case_id", "housetype_905L"])
    .rename({"housetype_905L": "person_housetype"})
)

# This table has both num_group1 and num_group2, so we aggregate again.
train_credit_bureau_b_2_feats = train_credit_bureau_b_2.group_by("case_id").agg(
    [
        pl.col("pmts_pmtsoverdue_635A").max().alias("pmts_pmtsoverdue_635A_max"),
        (pl.col("pmts_dpdvalue_108P") > 31).max().alias("pmts_dpdvalue_108P_over31"),
    ]
)

# Only A-type and M-type columns (identified by the last letter of the column
# name) are processed in this example, so pick those from both static tables.
selected_static_cols = [col for col in train_static.columns if col[-1] in ("A", "M")]
print(selected_static_cols)

selected_static_cb_cols = [col for col in train_static_cb.columns if col[-1] in ("A", "M")]
print(selected_static_cb_cols)

# Left-join every feature table onto the base table via case_id, so each case
# in the base table is kept even when a feature table has no row for it.
data = (
    train_basetable
    .join(train_static.select(["case_id", *selected_static_cols]), how="left", on="case_id")
    .join(train_static_cb.select(["case_id", *selected_static_cb_cols]), how="left", on="case_id")
    .join(train_person_1_feats_1, how="left", on="case_id")
    .join(train_person_1_feats_2, how="left", on="case_id")
    .join(train_credit_bureau_b_2_feats, how="left", on="case_id")
)

# Drop the intermediate tables that are no longer needed and reclaim memory.
del train_basetable, train_static_cb, train_person_1_feats_2, train_credit_bureau_b_2_feats
gc.collect()

['amtinstpaidbefduel24m_4187115A', 'annuity_780A', 'annuitynextmonth_57A', 'avginstallast24m_3658937A', 'avglnamtstart24m_4525187A', 'avgoutstandbalancel6m_4187114A', 'avgpmtlast12m_4525200A', 'credamount_770A', 'currdebt_22A', 'currdebtcredtyperange_828A', 'disbursedcredamount_1113A', 'downpmt_116A', 'inittransactionamount_650A', 'lastapprcommoditycat_1041M', 'lastapprcommoditytypec_5251766M', 'lastapprcredamount_781A', 'lastcancelreason_561M', 'lastotherinc_902A', 'lastotherlnsexpense_631A', 'lastrejectcommoditycat_161M', 'lastrejectcommodtypec_5251769M', 'lastrejectcredamount_222A', 'lastrejectreason_759M', 'lastrejectreasonclient_4145040M', 'maininc_215A', 'maxannuity_159A', 'maxannuity_4075009A', 'maxdebt4_972A', 'maxinstallast24m_3658928A', 'maxlnamtstart6m_4525199A', 'maxoutstandbalancel12m_4187113A', 'maxpmtlast3m_4525190A', 'previouscontdistrict_112M', 'price_1097A', 'sumoutstandtotal_3546847A', 'sumoutstandtotalest_4493215A', 'totaldebt_9A', 'totalsettled_863A', 'totinstallas

0

In [6]:
# Build the same aggregated features for the test tables as for the training tables.
test_person_1_feats_1 = test_person_1.group_by("case_id").agg(
    [
        pl.col("mainoccupationinc_384A").max().alias("mainoccupationinc_384A_max"),
        (pl.col("incometype_1044T") == "SELFEMPLOYED").max().alias("mainoccupationinc_384A_any_selfemployed"),
    ]
)

# Keep only the num_group1 == 0 row of each case and expose its house type.
test_person_1_feats_2 = (
    test_person_1.filter(pl.col("num_group1") == 0)
    .select(["case_id", "housetype_905L"])
    .rename({"housetype_905L": "person_housetype"})
)

test_credit_bureau_b_2_feats = test_credit_bureau_b_2.group_by("case_id").agg(
    [
        pl.col("pmts_pmtsoverdue_635A").max().alias("pmts_pmtsoverdue_635A_max"),
        (pl.col("pmts_dpdvalue_108P") > 31).max().alias("pmts_dpdvalue_108P_over31"),
    ]
)

# Left-join everything onto the test base table, reusing the column selection
# that was derived from the training tables.
data_submission = (
    test_basetable
    .join(test_static.select(["case_id", *selected_static_cols]), how="left", on="case_id")
    .join(test_static_cb.select(["case_id", *selected_static_cb_cols]), how="left", on="case_id")
    .join(test_person_1_feats_1, how="left", on="case_id")
    .join(test_person_1_feats_2, how="left", on="case_id")
    .join(test_credit_bureau_b_2_feats, how="left", on="case_id")
)

In [7]:
# Split the unique case ids 60% / 20% / 20% into train / valid / test:
# first 60% vs. the rest, then the rest is halved.
case_ids = data["case_id"].unique().shuffle(seed=1)
case_ids_train, case_ids_test = train_test_split(case_ids, train_size=0.6, random_state=1)
case_ids_valid, case_ids_test = train_test_split(case_ids_test, train_size=0.5, random_state=1)

# Predictors are the columns whose name is all lower case except for a final
# upper-case type letter.
# NOTE: the aggregated columns created earlier ("mainoccupationinc_384A_max",
# "person_housetype", "pmts_dpdvalue_108P_over31", ...) end in a lower-case
# letter, so they do not match this pattern and are not used as predictors.
cols_pred = [col for col in data.columns if col[-1].isupper() and col[:-1].islower()]

print(cols_pred)

def from_polars_to_pandas(case_ids) -> tuple[pd.DataFrame, pd.DataFrame, pd.Series]:
    """Extract the rows of ``data`` for the given cases as pandas objects.

    Args:
        case_ids: Collection of case_id values to keep; it is passed to
            ``pl.Expr.is_in``.

    Returns:
        A 3-tuple ``(base, X, y)``:
        base - DataFrame with the columns case_id, WEEK_NUM and target,
        X    - DataFrame with the predictor columns listed in ``cols_pred``,
        y    - Series with the target column.
    """
    # Filter once and reuse the result for all three views instead of
    # re-scanning the full table for each of them.
    subset = data.filter(pl.col("case_id").is_in(case_ids))
    return (
        subset[["case_id", "WEEK_NUM", "target"]].to_pandas(),
        subset[cols_pred].to_pandas(),
        subset["target"].to_pandas(),
    )

# Materialise the three splits as pandas objects.
base_train, X_train, y_train = from_polars_to_pandas(case_ids_train)
base_valid, X_valid, y_valid = from_polars_to_pandas(case_ids_valid)
base_test, X_test, y_test = from_polars_to_pandas(case_ids_test)

# convert_strings modifies the frame it receives, so its return value (the
# same frame) does not need to be stored.
for df in (X_train, X_valid, X_test):
    convert_strings(df)

['amtinstpaidbefduel24m_4187115A', 'annuity_780A', 'annuitynextmonth_57A', 'avginstallast24m_3658937A', 'avglnamtstart24m_4525187A', 'avgoutstandbalancel6m_4187114A', 'avgpmtlast12m_4525200A', 'credamount_770A', 'currdebt_22A', 'currdebtcredtyperange_828A', 'disbursedcredamount_1113A', 'downpmt_116A', 'inittransactionamount_650A', 'lastapprcommoditycat_1041M', 'lastapprcommoditytypec_5251766M', 'lastapprcredamount_781A', 'lastcancelreason_561M', 'lastotherinc_902A', 'lastotherlnsexpense_631A', 'lastrejectcommoditycat_161M', 'lastrejectcommodtypec_5251769M', 'lastrejectcredamount_222A', 'lastrejectreason_759M', 'lastrejectreasonclient_4145040M', 'maininc_215A', 'maxannuity_159A', 'maxannuity_4075009A', 'maxdebt4_972A', 'maxinstallast24m_3658928A', 'maxlnamtstart6m_4525199A', 'maxoutstandbalancel12m_4187113A', 'maxpmtlast3m_4525190A', 'previouscontdistrict_112M', 'price_1097A', 'sumoutstandtotal_3546847A', 'sumoutstandtotalest_4493215A', 'totaldebt_9A', 'totalsettled_863A', 'totinstallas

In [9]:
# Display the three base tables (case_id, WEEK_NUM, target) of the splits.
base_train, base_valid, base_test

(        case_id  WEEK_NUM  target
 0             0         0       0
 1             2         0       0
 2             5         0       0
 3             6         0       0
 4             7         0       0
 ...         ...       ...     ...
 915990  2703449        91       0
 915991  2703450        91       0
 915992  2703452        91       0
 915993  2703453        91       0
 915994  2703454        91       0
 
 [915995 rows x 3 columns],
         case_id  WEEK_NUM  target
 0             1         0       0
 1             3         0       0
 2             9         0       0
 3            11         0       0
 4            13         0       0
 ...         ...       ...     ...
 305327  2703419        91       0
 305328  2703424        91       0
 305329  2703432        91       0
 305330  2703434        91       0
 305331  2703451        91       0
 
 [305332 rows x 3 columns],
         case_id  WEEK_NUM  target
 0             4         0       1
 1            10         0    

In [12]:
# Display the training predictors before imputation and encoding.
X_train

Unnamed: 0,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,avginstallast24m_3658937A,avglnamtstart24m_4525187A,avgoutstandbalancel6m_4187114A,avgpmtlast12m_4525200A,credamount_770A,currdebt_22A,currdebtcredtyperange_828A,...,totinstallast1m_4525188A,description_5085714M,education_1103M,education_88M,maritalst_385M,maritalst_893M,pmtaverage_3A,pmtaverage_4527227A,pmtaverage_4955615A,pmtssum_45A
0,,1917.6000,0.0,,,,,30000.0,0.0,0.0,...,,,,,,,,,,
1,,4937.0000,0.0,,,,,78000.0,0.0,0.0,...,,,,,,,,,,
2,,3600.0000,0.0,,,,,60000.0,0.0,0.0,...,,,,,,,,,,
3,,3110.8000,0.0,,,,,20000.0,0.0,0.0,...,,,,,,,,,,
4,,1218.0000,0.0,,,,,20300.0,0.0,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915990,104557.25,8218.0000,5293.2,8027.2000,,23399.828,5293.2,132000.0,10548.4,10548.4,...,10586.400,2fc785b2,a55475b1,a55475b1,a55475b1,a55475b1,,,,
915991,176561.36,3675.4001,0.0,7356.8003,,16392.496,6750.2,30000.0,0.0,0.0,...,14346.319,2fc785b2,a55475b1,a55475b1,a55475b1,a55475b1,,,12155.0,
915992,14232.40,7788.8003,0.0,2662.4001,,,1500.6,60000.0,0.0,0.0,...,,2fc785b2,a55475b1,a55475b1,a55475b1,a55475b1,,,,
915993,197371.58,1195.4000,2827.2,8212.6010,,47943.062,9921.2,6000.0,46806.6,46806.6,...,5654.400,2fc785b2,6b2ae0fa,a55475b1,3439d993,a55475b1,,,15792.4,


In [13]:
# Report the (rows, columns) shape of each split before encoding.
print(f"Train: {X_train.shape}")
print(f"Valid: {X_valid.shape}")
print(f"Test: {X_test.shape}")

Train: (915995, 48)
Valid: (305332, 48)
Test: (305332, 48)


In [14]:
# Columns whose value in the FIRST training row is a Python str.
# NOTE(review): the selection looks at row 0 only, so a text column whose
# first value is missing is not picked up here (the displayed result lacks
# e.g. the education_* and maritalst_* columns). Later cells rely on this
# exact list, so it is left unchanged.
cols = X_train.loc[:, X_train.iloc[0].map(type) == str].columns.unique()
cols

Index(['lastapprcommoditycat_1041M', 'lastapprcommoditytypec_5251766M',
       'lastcancelreason_561M', 'lastrejectcommoditycat_161M',
       'lastrejectcommodtypec_5251769M', 'lastrejectreason_759M',
       'lastrejectreasonclient_4145040M', 'previouscontdistrict_112M'],
      dtype='object')

In [15]:
# Impute missing values in every split with the per-column mode of the
# training split. The mode is computed once: filling a column with its own
# mode does not change that mode, so recomputing it for every split only
# repeats an expensive pass over the training frame.
train_modes = X_train.mode().iloc[0]
X_train = X_train.fillna(train_modes)
X_valid = X_valid.fillna(train_modes)
X_test = X_test.fillna(train_modes)

# Keep copies of the imputed, not yet encoded frames for later use.
C_X_train = X_train.copy()
C_X_valid = X_valid.copy()
C_X_test = X_test.copy()

In [16]:
# Give each column in `cols` the same categorical dtype in all three splits so
# that one-hot encoding produces identical columns for each of them.
for col in cols:
    observed = set().union(
        *(frame[col].dropna().unique() for frame in (X_train, X_valid, X_test))
    )
    # Sort the categories: a bare set has no stable order between runs, which
    # would make both the dummy column order and the category removed by
    # drop_first=True differ from run to run.
    shared_dtype = pd.CategoricalDtype(sorted(observed))
    X_train[col] = X_train[col].astype(shared_dtype)
    X_valid[col] = X_valid[col].astype(shared_dtype)
    X_test[col] = X_test[col].astype(shared_dtype)

# One-hot encode; the first category of every column is dropped as baseline.
X_train = pd.get_dummies(X_train, drop_first=True)
X_valid = pd.get_dummies(X_valid, drop_first=True)
X_test = pd.get_dummies(X_test, drop_first=True)


In [17]:
# Report the (rows, columns) shape of each split after one-hot encoding.
print(f"Train: {X_train.shape}")
print(f"Valid: {X_valid.shape}")
print(f"Test: {X_test.shape}")

Train: (915995, 871)
Valid: (305332, 871)
Test: (305332, 871)


In [18]:
# Display the training predictors after imputation and one-hot encoding.
X_train

Unnamed: 0,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,avginstallast24m_3658937A,avglnamtstart24m_4525187A,avgoutstandbalancel6m_4187114A,avgpmtlast12m_4525200A,credamount_770A,currdebt_22A,currdebtcredtyperange_828A,...,maritalst_385M_a7fcb6e5,maritalst_385M_b6cabe76,maritalst_385M_ecd83604,maritalst_385M_Unknown,maritalst_893M_46b968c3,maritalst_893M_977b2a70,maritalst_893M_a55475b1,maritalst_893M_e18430ff,maritalst_893M_ecd83604,maritalst_893M_Unknown
0,0.00,1917.6000,0.0,800.0000,100000.0,0.000,0.0,30000.0,0.0,0.0,...,False,False,False,False,False,False,True,False,False,False
1,0.00,4937.0000,0.0,800.0000,100000.0,0.000,0.0,78000.0,0.0,0.0,...,False,False,False,False,False,False,True,False,False,False
2,0.00,3600.0000,0.0,800.0000,100000.0,0.000,0.0,60000.0,0.0,0.0,...,False,False,False,False,False,False,True,False,False,False
3,0.00,3110.8000,0.0,800.0000,100000.0,0.000,0.0,20000.0,0.0,0.0,...,False,False,False,False,False,False,True,False,False,False
4,0.00,1218.0000,0.0,800.0000,100000.0,0.000,0.0,20300.0,0.0,0.0,...,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915990,104557.25,8218.0000,5293.2,8027.2000,100000.0,23399.828,5293.2,132000.0,10548.4,10548.4,...,False,False,False,False,False,False,True,False,False,False
915991,176561.36,3675.4001,0.0,7356.8003,100000.0,16392.496,6750.2,30000.0,0.0,0.0,...,False,False,False,False,False,False,True,False,False,False
915992,14232.40,7788.8003,0.0,2662.4001,100000.0,0.000,1500.6,60000.0,0.0,0.0,...,False,False,False,False,False,False,True,False,False,False
915993,197371.58,1195.4000,2827.2,8212.6010,100000.0,47943.062,9921.2,6000.0,46806.6,46806.6,...,False,False,False,False,False,False,True,False,False,False


In [19]:
# Count the training rows that are exact duplicates of an earlier row.
X_train.duplicated().sum()

10338

In [None]:
# X_train=X_train.drop_duplicates()
# idx = (set(X_train.index) ^ set(y_train.index))
# y_train = y_train.drop(index = idx)
# X_valid=X_valid.drop_duplicates()
# idx = (set(X_valid.index) ^ set(y_valid.index))
# y_valid = y_valid.drop(index = idx)

In [20]:
# Remember the encoded column layout the PCA is fitted on.
features = X_train.columns

# Project the encoded predictors onto the first n principal components.
# The PCA is fitted on the training split only and then applied to the others.
n = 300
# A fixed random_state makes the projection reproducible when scikit-learn
# selects a randomized (or arpack) SVD solver; it has no effect otherwise.
pca = PCA(n_components=n, random_state=42)
columns = [f'feature_{i+1}' for i in range(n)]
X_train = pd.DataFrame(pca.fit_transform(X_train[features]), columns=columns)
X_valid = pd.DataFrame(pca.transform(X_valid[features]), columns=columns)
X_test = pd.DataFrame(pca.transform(X_test[features]), columns=columns)

In [26]:
# Fit the min-max scaler on the training components only, then apply the same
# scaling to every split.
scaler = MinMaxScaler().fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), columns=columns)
X_valid = pd.DataFrame(scaler.transform(X_valid), columns=columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=columns)
X_train

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_291,feature_292,feature_293,feature_294,feature_295,feature_296,feature_297,feature_298,feature_299,feature_300
0,0.000277,0.091155,0.400102,0.193267,0.068044,0.304871,0.328453,0.498275,0.318867,0.467329,...,0.472439,0.411469,0.295692,0.391674,0.230329,0.250658,0.314526,0.299566,0.251718,0.490986
1,0.000417,0.091152,0.399982,0.183654,0.068696,0.247462,0.324989,0.502533,0.316148,0.470821,...,0.472626,0.411904,0.295655,0.391635,0.230340,0.250573,0.314579,0.299639,0.251737,0.490873
2,0.000283,0.091154,0.400099,0.193141,0.068031,0.304808,0.328262,0.499150,0.319010,0.467734,...,0.472553,0.411318,0.295571,0.391723,0.230545,0.251258,0.314505,0.299035,0.251887,0.490797
3,0.000288,0.091194,0.400020,0.198427,0.067280,0.258452,0.322301,0.495585,0.318355,0.466259,...,0.473676,0.412782,0.294992,0.392739,0.230322,0.250178,0.314847,0.300011,0.251854,0.491233
4,0.000236,0.091170,0.400112,0.198348,0.067542,0.307451,0.327331,0.496156,0.319649,0.465897,...,0.472427,0.411477,0.295783,0.391750,0.230364,0.250637,0.314498,0.299523,0.251767,0.491015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915990,0.006116,0.091976,0.402165,0.163749,0.063309,0.351011,0.351690,0.508166,0.321496,0.475945,...,0.471813,0.411022,0.294929,0.392631,0.230085,0.252430,0.313096,0.295219,0.249969,0.489734
915991,0.011571,0.096547,0.397402,0.191932,0.058584,0.342150,0.348578,0.485830,0.317424,0.470429,...,0.435897,0.419397,0.285658,0.416641,0.247446,0.258560,0.329524,0.320530,0.237567,0.455936
915992,0.001037,0.091609,0.399932,0.185914,0.066510,0.296644,0.378744,0.503699,0.323500,0.471398,...,0.471478,0.413034,0.294669,0.393851,0.230646,0.250883,0.313896,0.300313,0.247651,0.488617
915993,0.013571,0.094408,0.400336,0.204939,0.060243,0.261707,0.322110,0.483010,0.310435,0.478524,...,0.476119,0.402451,0.307697,0.417687,0.224211,0.262504,0.296968,0.255866,0.233369,0.487045


# NOTE(review): this snippet does not belong to the pipeline of this notebook.
# OneHotEncoder, ColumnTransformer and X are neither imported nor defined in
# the visible cells, and the listed columns are not among the predictors used
# here, so running it as-is raises NameError. It looks like a pasted reference
# example of one-hot encoding selected columns - confirm and remove if unused.
Hottie = OneHotEncoder()
# let's define the column we want to convert
features_cat = ['Comapnay','model']
transformer = ColumnTransformer([('One_hottie',Hottie,features_cat)],
                            remainder = 'passthrough')
transformed_X = transformer.fit_transform(X)
Transformed_X = pd.DataFrame(transformed_X)

## Training LightGBM

Minimal example of LightGBM training is shown below.

In [33]:
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_valid = lgb.Dataset(X_valid, label=y_valid, reference=lgb_train)

params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 10,
    "num_leaves": 64,
    "n_estimators":1000,
    "learning_rate": 0.04,#0.05
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "random_state": 42,
    "reg_alpha": 0.1,
    "reg_lambda": 10,
    "verbose": -1,
    "device": "gpu",
}

# Per-iteration metric history, filled in by lgb.record_evaluation and keyed
# by the names given in valid_names ("X_train" / "X_valid").
results = {}
gbm = lgb.train(
    params,
    lgb_train,
    # The training set is evaluated as well so its learning curve can be
    # plotted; early stopping ignores the training data and is still driven
    # by the validation set only.
    valid_sets=[lgb_train, lgb_valid],
    valid_names=["X_train", "X_valid"],
    callbacks=[
        lgb.log_evaluation(25),
        lgb.early_stopping(50),
        lgb.record_evaluation(results),
    ],
)



Training until validation scores don't improve for 10 rounds
[25]	valid_0's auc: 0.716173
[50]	valid_0's auc: 0.726937
[75]	valid_0's auc: 0.733536
[100]	valid_0's auc: 0.737521
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.737521


Evaluation with AUC and then comparison with the stability metric is shown below.

import matplotlib.pyplot as plt

# Plot the recorded AUC per boosting round. Only the data sets that are
# actually present in `results` are drawn, so the cell does not fail when an
# evaluation history was not recorded for one of them.
for dataset_name, label in (("X_train", "train"), ("X_valid", "valid")):
    if dataset_name in results:
        plt.plot(results[dataset_name]["auc"], label=label)
# The recorded metric is AUC (see "metric" in the training parameters).
plt.ylabel('AUC')
plt.xlabel('Boosting round')
plt.title('Training performance')
plt.legend()
plt.show()

In [None]:
# Store the model output of the best iteration as the "score" column of each
# split's base table, then report the AUC per split.
for base, X in zip((base_train, base_valid, base_test), (X_train, X_valid, X_test)):
    y_pred = gbm.predict(X, num_iteration=gbm.best_iteration)
    base["score"] = y_pred

print(f'The AUC score on the train set is: {roc_auc_score(base_train["target"], base_train["score"])}')
print(f'The AUC score on the valid set is: {roc_auc_score(base_valid["target"], base_valid["score"])}')
print(f'The AUC score on the test set is: {roc_auc_score(base_test["target"], base_test["score"])}')

In [None]:
def gini_stability(base, w_fallingrate=88.0, w_resstd=-0.5):
    """Compute the stability metric from weekly gini scores.

    For every WEEK_NUM the gini (2 * AUC - 1) of ``score`` against ``target``
    is computed. A straight line is then fitted through the weekly ginis, in
    week order, and the result combines three terms: the mean weekly gini,
    the slope of the line when it is negative (weighted by ``w_fallingrate``),
    and the standard deviation of the residuals around the line (weighted by
    ``w_resstd``).

    Args:
        base: DataFrame with the columns WEEK_NUM, target and score.
        w_fallingrate: Weight applied to a negative slope.
        w_resstd: Weight applied to the residual standard deviation.

    Returns:
        The stability score as a float.
    """
    # groupby sorts by WEEK_NUM, so the weekly ginis come out in week order.
    by_week = base[["WEEK_NUM", "target", "score"]].groupby("WEEK_NUM")
    weekly_gini = [
        2 * roc_auc_score(week["target"], week["score"]) - 1
        for _, week in by_week
    ]

    positions = np.arange(len(weekly_gini))
    slope, intercept = np.polyfit(positions, weekly_gini, 1)
    trend = slope * positions + intercept
    residual_std = np.std(weekly_gini - trend)

    # Only a falling trend (negative slope) contributes to the slope term.
    return np.mean(weekly_gini) + w_fallingrate * min(0, slope) + w_resstd * residual_std

# Compute the stability metric for each split; the base tables must already
# contain the "score" column.
stability_score_train = gini_stability(base_train)
stability_score_valid = gini_stability(base_valid)
stability_score_test = gini_stability(base_test)

print(f'The stability score on the train set is: {stability_score_train}') 
print(f'The stability score on the valid set is: {stability_score_valid}') 
print(f'The stability score on the test set is: {stability_score_test}')  

## Submission

Scoring the submission dataset is below, we need to take care of new categories. Then we save the score as a last step. 

In [None]:
# Score the submission set with the same preprocessing chain that was fitted
# on the training split:
#   mode imputation -> one-hot encoding -> PCA -> min-max scaling.
# The model was trained on the scaled principal components, so feeding it
# anything else (raw columns, or a frame merely reindexed to the component
# names) does not produce meaningful scores.
X_submission = data_submission[cols_pred].to_pandas()
X_submission = convert_strings(X_submission)

# Impute with the modes of the imputed, pre-encoding training frame. A
# categorical column only accepts values that are among its categories, so the
# fill value is registered as a category first where necessary.
fill_values = C_X_train.mode().iloc[0]
for col in X_submission.select_dtypes(include="category").columns:
    fill = fill_values.get(col)
    if pd.notna(fill) and fill not in X_submission[col].cat.categories:
        X_submission[col] = X_submission[col].cat.add_categories([fill])
X_submission = X_submission.fillna(fill_values)

# One-hot encode and align to the exact column layout the PCA was fitted on
# (`features`). Dummy columns that are absent here are added as all-False;
# categories never seen during training have no matching column and are
# dropped, which encodes those rows like the baseline category.
X_submission = pd.get_dummies(X_submission).reindex(columns=features, fill_value=False)

# Apply the fitted PCA and scaler (transform only - nothing is re-fitted).
pca_columns = [f"feature_{i + 1}" for i in range(pca.n_components_)]
X_submission = pd.DataFrame(pca.transform(X_submission), columns=pca_columns)
X_submission = pd.DataFrame(scaler.transform(X_submission), columns=pca_columns)

# The training-side frames are no longer needed; release their memory.
del C_X_train, C_X_valid, C_X_test
del X_train, X_valid, X_test
gc.collect()

y_submission_pred = gbm.predict(X_submission, num_iteration=gbm.best_iteration)

In [None]:
# Write one score per case_id; case_id becomes the index and therefore the
# first column of the CSV file.
submission_index = pd.Index(data_submission["case_id"].to_numpy(), name="case_id")
submission = pd.DataFrame({"score": y_submission_pred}, index=submission_index)
submission.to_csv("./submission.csv")

Best of luck, and most importantly, enjoy the process of learning and discovery! 

<img src="https://i.imgur.com/obVWIBh.png" alt="Image" width="700"/>