In [1]:
import mlflow
import datetime
from tqdm import tqdm
from glob import glob
from pathlib import Path
import numpy as np
import pandas as pd
import polars as pl

from lightgbm import LGBMClassifier, LGBMRegressor, early_stopping, log_evaluation
from catboost import  CatBoostClassifier, CatBoostRegressor, Pool
from xgboost import XGBRegressor, XGBClassifier, callback

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedGroupKFold

In [2]:
dt_now = datetime.datetime.now()
run_postfix = dt_now.isoformat()

In [3]:
ROOT            = Path("../data")

TRAIN_DIR       = ROOT / "parquet_files" / "train"
TEST_DIR        = ROOT / "parquet_files" / "test"

TARGET = 'target'
EVAL = True
EXPERIMENT_NAME = "Home_Credit_2024"

In [4]:
class Common_LGB_Modelling:
    """
        Train and test data should contain the same selected features for ML models.
        Train, test data and target should be the same data type. (Pandas or Numpy)
    """
    def __init__(self, config, model_class):
        self.config = config
        self.model_class = model_class
    
    def train(self, x_tr, y_tr):
        
        model = self.model_class(**self.config["params"])
        model.fit(x_tr, y_tr)
    
        return model
    
    def train_and_valid(self, x_tr, y_tr, x_val, y_val):
        
        callbacks = [
                early_stopping(stopping_rounds=self.config["es_round"],
                                   first_metric_only = True), 
                log_evaluation(self.config["verbose_eval"])
            ]
            
        model = self.model_class(**self.config["params"])
        model = model.fit(x_tr, y_tr, 
                          eval_set=[(x_val, y_val)], 
                          callbacks = callbacks)
        valid_pred = model.predict(x_val)
    
        return model, valid_pred
        
    def test(self, models, test):
        test_pred = [model.predict(test) for model in models]
        test_pred = np.mean(test_pred, axis=0)
        return test_pred
    
    def test_by_batch(self, models, test, batch_size):
        test_pred_all = []
        for idx in range(0, len(test), batch_size):
            test_pred_batch = [model.predict(test.iloc[idx:idx+batch_size]) for model in models]
            test_pred_batch = np.mean(test_pred_batch, axis=0)
            test_pred_all.append(test_pred_batch)
        return np.concatenate(test_pred_all)
    
    def numpy_test_by_batch(self, models, test, batch_size):
        test_pred_all = []
        for idx in range(0, len(test), batch_size):
            test_pred_batch = [model.predict(test[idx:idx+batch_size]) for model in models]
            test_pred_batch = np.mean(test_pred_batch, axis=0)
            test_pred_all.append(test_pred_batch)
        return np.concatenate(test_pred_all)

In [23]:
class Common_CB_Modelling:
    """
        Train and test data should contain the same selected features for ML models.
        Train, test data and target should be the same data type. (Pandas or Numpy)
    """
    def __init__(self, config):
        self.config = config

    def train(self, x_tr, y_tr):
        
        train_pool = Pool(data=x_tr,
                          label=y_tr,
                          cat_features=self.config["cat_features"])
                
        if self.config["task_type"] == "classification":
            model = CatBoostClassifier(**self.config["params"])

        elif self.config["task_type"]:
            model = CatBoostRegressor(**self.config["params"])
        model.fit(train_pool)
            
        return model

    
    def train_and_valid(self,  x_tr, y_tr, x_val, y_val):
        
        train_pool = Pool(data=x_tr,
                          label=y_tr,
                          cat_features=self.config["cat_features"])
        
        valid_pool = Pool(data=x_val,
                          label=y_val,
                        cat_features=self.config["cat_features"])
                
        if self.config["task_type"] == "classification":
            model = CatBoostClassifier(**self.config["params"])
            model.fit(train_pool,
                      eval_set=[valid_pool], 
                      early_stopping_rounds=self.config["es_round"],
                      verbose_eval = self.config["verbose_eval"])
            valid_pred = model.predict_proba(x_val)[:,1]
        elif self.config["task_type"]:
            model = CatBoostRegressor(**self.config["params"])
            model.fit(train_pool,
                      eval_set=[valid_pool], 
                      early_stopping_rounds=self.config["es_round"], 
                      verbose_eval = self.config["verbose_eval"])
            valid_pred = model.predict(x_val)
            
        return model, valid_pred
    
    def test(self, models, test):
        if self.config["task_type"] == "classification":
            test_pred = [model.predict_proba(test)[:,1] for model in models]
        else:
            test_pred = [model.predict(test) for model in models]
        test_pred = np.mean(test_pred, axis=0)        
        return test_pred
    
    def test_by_batch(self, models, test, batch_size):
        test_pred_all = []
        for idx in range(0, len(test), batch_size):
            if self.config["task_type"] == "classification":
                test_pred_batch = [model.predict_proba(test.iloc[idx:idx+batch_size])[:,1] for model in models]
            else:
                test_pred_batch = [model.predict(test.iloc[idx:idx+batch_size]) for model in models]            
            test_pred_batch = np.mean(test_pred_batch, axis=0)
            test_pred_all.append(test_pred_batch)
        return np.concatenate(test_pred_all)
    
    def numpy_test_by_batch(self, models, test, batch_size):
        test_pred_all = []
        for idx in range(0, len(test), batch_size):
            if self.config["task_type"] == "classification":
                test_pred_batch = [model.predict_proba(test[idx:idx+batch_size])[:,1] for model in models]
            else:
                test_pred_batch = [model.predict(test[idx:idx+batch_size]) for model in models]            
            test_pred_batch = np.mean(test_pred_batch, axis=0)
            test_pred_all.append(test_pred_batch)
        return np.concatenate(test_pred_all)

In [6]:
class Common_XGB_Modelling:
    """
        Train and test data should contain the same selected features for ML models.
        Train, test data and target should be the same data type. (Pandas or Numpy)
    """
    def __init__(self, config, model_class):
        self.config = config
        self.model_class = model_class
    
    def train_and_valid(self, x_tr, y_tr, x_val, y_val):
                
        model = self.model_class(**self.config["params"])
        model.fit(x_tr, y_tr, eval_set=[(x_val, y_val)])
        valid_pred = model.predict(x_val, iteration_range=(0, model.best_iteration))
    
        return model, valid_pred
    
    def test(self, models, test):
        test_pred = [model.predict(test,  iteration_range=(0, model.best_iteration)) for model in models]
        test_pred = np.mean(test_pred, axis=0)
        return test_pred

In [7]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        if str(col_type)=="category":
            continue
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

------- edit from here -------------

# common fe functions

In [8]:
class Pipeline:
    @staticmethod
    def set_table_dtypes(df): #Standardize the dtype.
        for col in df.columns:
            if col in ["case_id", "WEEK_NUM", "num_group1", "num_group2"]:
                df = df.with_columns(pl.col(col).cast(pl.Int64))
            elif col in ["date_decision"]:
                df = df.with_columns(pl.col(col).cast(pl.Date))
            elif col[-1] in ("P", "A"):
                df = df.with_columns(pl.col(col).cast(pl.Float64))
            elif col[-1] in ("M",):
                df = df.with_columns(pl.col(col).cast(pl.String))
            elif col[-1] in ("D",):
                df = df.with_columns(pl.col(col).cast(pl.Date))            

        return df
    
    @staticmethod
    def handle_dates(df): #Change the feature for D to the difference in days from date_decision.
        for col in df.columns:
            if (col[-1] in ("D",)) and ('count' not in col):
                df = df.with_columns(pl.col(col) - pl.col("date_decision"))
                df = df.with_columns(pl.col(col).dt.total_days())
                
        df = df.drop("date_decision", "MONTH")

        return df
    
    @staticmethod
    def filter_cols(df): #Remove those with an average is_null exceeding 0.95 and those that do not fall within the range 1 < nunique < 200.
        drop_cols = []
        for col in df.columns:
            #if col not in ["target", "case_id", "WEEK_NUM"]:
            #    isnull = df[col].is_null().mean()
            #    if isnull > 0.95:
            #        drop_cols.append(col)

            if (col not in ["target", "case_id", "WEEK_NUM", ]) & (df[col].dtype == pl.String):
                freq = df[col].n_unique()

                if (freq == 1) | (freq > 10):#50 #len(df) * 0.20): # 95 # fe4 down at fq20
                    drop_cols.append(col)
            
            # eliminate yaer, month feature
            # 644
            if (col[-1] not in ["P", "A", "L", "M"]) and (('month_' in col) or ('year_' in col)):# or ('num_group' in col):
            # if (('month_' in col) or ('year_' in col)):# or ('num_group' in col):
                drop_cols.append(col)

        return drop_cols

In [9]:
class Aggregator:
    @staticmethod
    def num_expr(df):
        cols = [col for col in df.columns if (col[-1] in ("T","L","M","D","P","A")) or ("num_group" in col)]

        expr_1 = [pl.max(col).alias(f"max_{col}") for col in cols]
        expr_2 = [pl.min(col).alias(f"min_{col}") for col in cols]
        # expr_3 = [pl.median(col).alias(f"median_{col}") for col in cols]
        # expr_3 = [pl.var(col).alias(f"var_{col}") for col in cols]+ [pl.sum(col).alias(f"sum_{col}") for col in cols]
        # expr_3 = [pl.last(col).alias(f"last_{col}") for col in cols] #+ \
        #     [pl.first(col).alias(f"first_{col}") for col in cols] + \
        #     [pl.mean(col).alias(f"mean_{col}") for col in cols] + \
        #     [pl.std(col).alias(f"std_{col}") for col in cols]
        # expr_3 = [pl.count(col).alias(f"count_{col}") for col in cols]

        cols2 = [col for col in df.columns if col[-1] in ("L", "A")]
        expr_3 = [pl.mean(col).alias(f"mean_{col}") for col in cols2] + \
                 [pl.std(col).alias(f"std_{col}") for col in cols2] + \
                 [pl.sum(col).alias(f"sum_{col}") for col in cols2] + \
                 [pl.median(col).alias(f"median_{col}") for col in cols2] # + \
            # [pl.first(col).alias(f"first_{col}") for col in cols2] + [pl.last(col).alias(f"last_{col}") for col in cols2]
        
        # BAD
        # cols3 = [col for col in df.columns if col[-1] in ("A")]
        # expr_4 = [pl.col(col).fill_null(strategy="zero").apply(lambda x: x.max() - x.min()).alias(f"max-min_gap_{col}") 
        #           for col in cols3]
        return expr_1 + expr_2 + expr_3 # + [pl.col(col).diff().last().alias(f"diff-last_{col}") for col in cols3] # + expr_4
    

    @staticmethod
    def bureau_a1(df):
        cols = [col for col in df.columns if (col[-1] in ("T","L","M","D","P","A")) or ("num_group" in col)]
        expr_1 = [pl.max(col).alias(f"max_{col}") for col in cols]
        expr_2 = [pl.min(col).alias(f"min_{col}") for col in cols]

        cols2 = [
            # bad
        'annualeffectiverate_199L', 'annualeffectiverate_63L',
        'contractsum_5085717L', 
        'credlmt_230A', 'credlmt_935A',
        # 'debtoutstand_525A', 'debtoverdue_47A', 'dpdmax_139P', 'dpdmax_757P',
    #    'instlamount_768A', 'instlamount_852A',
    #    'interestrate_508L', 'monthlyinstlamount_332A',
    #    'monthlyinstlamount_674A', 
            # good?
       'nominalrate_281L', 'nominalrate_498L',
       'numberofcontrsvalue_258L', 'numberofcontrsvalue_358L',
       'numberofinstls_229L', 'numberofinstls_320L',
       'numberofoutstandinstls_520L', 'numberofoutstandinstls_59L',
       'numberofoverdueinstlmax_1039L', 'numberofoverdueinstlmax_1151L',
       'numberofoverdueinstls_725L', 'numberofoverdueinstls_834L',
            # bad?
    #    'outstandingamount_354A', 'outstandingamount_362A', 'overdueamount_31A',
    #    'overdueamount_659A', 'overdueamountmax2_14A', 'overdueamountmax2_398A',
    #    'overdueamountmax_155A', 'overdueamountmax_35A',
        # bad ?
    #    'periodicityofpmts_1102L', 'periodicityofpmts_837L',
    #    'prolongationcount_1120L', 'prolongationcount_599L',
        # 520?
    #    'residualamount_488A', 'residualamount_856A', 'totalamount_6A',
    #    'totalamount_996A', 'totaldebtoverduevalue_178A',
    #    'totaldebtoverduevalue_718A', 'totaloutstanddebtvalue_39A',
    #    'totaloutstanddebtvalue_668A',
       ]

        # .697
        # expr_3 = [pl.mean(col).alias(f"mean_{col}") for col in cols2] + [pl.std(col).alias(f"std_{col}") for col in cols2]
        
        # .696
        # expr_3 = [pl.mean(col).alias(f"mean_{col}") for col in cols2]

        # .697
        # expr_3 = [pl.std(col).alias(f"std_{col}") for col in cols2]
        
        # .6985
        # expr_3 = [pl.sum(col).alias(f"sum_{col}") for col in cols2] + [pl.median(col).alias(f"median_{col}") for col in cols2]

        # .696
        # expr_3 = [pl.sum(col).alias(f"sum_{col}") for col in cols2] 

        # .6981
        # expr_3 = [pl.median(col).alias(f"median_{col}") for col in cols2]

        # .696
        # expr_3 = [pl.first(col).alias(f"first_{col}") for col in cols2] + [pl.last(col).alias(f"last_{col}") for col in cols2] # + \
        
        # .696
        # expr_3 = [pl.std(col).alias(f"std_{col}") for col in cols2] + [pl.median(col).alias(f"median_{col}") for col in cols2]

        # .699
        # expr_3 = [pl.mean(col).alias(f"mean_{col}") for col in cols2] + [pl.std(col).alias(f"std_{col}") for col in cols2] + \
        #     [pl.sum(col).alias(f"sum_{col}") for col in cols2] + [pl.median(col).alias(f"median_{col}") for col in cols2]

        expr_3 = [pl.mean(col).alias(f"mean_{col}") for col in cols2] + [pl.std(col).alias(f"std_{col}") for col in cols2] + \
            [pl.sum(col).alias(f"sum_{col}") for col in cols2] + [pl.median(col).alias(f"median_{col}") for col in cols2] + \
            [pl.first(col).alias(f"first_{col}") for col in cols2] # + [pl.last(col).alias(f"last_{col}") for col in cols2] # not applied
        
        

        # expr_3 = [pl.col(col).fill_null(strategy="zero").apply(lambda x: x.max() - x.min()).alias(f"max-min_gap_depth2_{col}") for col in cols2]
        return expr_1 + expr_2 + expr_3    

    @staticmethod
    def deposit_exprs(df):
        cols = [col for col in df.columns if (col[-1] in ("T","L","M","D","P","A")) or ("num_group" in col)]
        expr_1 = [pl.max(col).alias(f"max_{col}") for col in cols] + [pl.min(col).alias(f"min_{col}") for col in cols] # + \
            # [pl.last(col).alias(f"last_{col}") for col in cols]
            # [pl.mean(col).alias(f"mean_{col}") for col in cols] # + \
            # [pl.std(col).alias(f"std_{col}") for col in cols]  + \
             
            # [pl.last(col).alias(f"last_{col}") for col in cols]
        # expr_2 = [pl.first('openingdate_857D').alias(f'first_openingdate_857D')] + [pl.last('openingdate_857D').alias(f'last_openingdate_857D')]
        
        return expr_1 # + expr_2 #+ expr_ngmax

    @staticmethod
    def debitcard_exprs(df):
        # cols = [col for col in df.columns if (col[-1] in ["A"])]
        cols = [col for col in df.columns if (col[-1] in ("T","L","M","D","P","A")) or ("num_group" in col)]
        expr_1 = [pl.max(col).alias(f"max_{col}") for col in cols] + [pl.min(col).alias(f"min_{col}") for col in cols] 
            # [pl.mean(col).alias(f"mean_{col}") for col in cols] + \
            # [pl.std(col).alias(f"std_{col}") for col in cols]
        # expr_2 = [pl.first('openingdate_857D').alias(f'first_openingdate_857D')] + [pl.last('openingdate_857D').alias(f'last_openingdate_857D')]
        
        return expr_1 # + expr_2 #+ expr_ngmax
        # return expr_1


    @staticmethod
    def person_expr(df):
        cols1 = ['empl_employedtotal_800L', 'empl_employedfrom_271D', 'empl_industry_691L', 
                 'familystate_447L', 'incometype_1044T', 'sex_738L', 'housetype_905L', 'housingtype_772L',
                 'isreference_387L', 'birth_259D', ]
        # cols1 = [col for col in df.columns]
        expr_1 = [pl.first(col).alias(f"first_{col}") for col in cols1]
        
        expr_2 = [pl.col("mainoccupationinc_384A").max().alias("mainoccupationinc_384A_max"), 
                  pl.col("mainoccupationinc_384A").filter(pl.col("incometype_1044T") == "SELFEMPLOYED").max().alias("mainoccupationinc_384A_any_selfemployed")]
        
        # No Effect ...
        # cols = ['personindex_1023L', 'persontype_1072L', 'persontype_792L']
        # expr_3 = [pl.col(col).last().alias(f"last_{col}") for col in cols] + [pl.col(col).drop_nulls().mean().alias(f"mean_{col}") for col in cols]

        # cols2 = [col for col in df.columns if col not in cols1]
        # expr_4 = [pl.max(col).alias(f"max_{col}") for col in cols2] + [pl.min(col).alias(f"min_{col}") for col in cols2] #  good at cv, bad at lb ?
            # [pl.col(col).drop_nulls().last().alias(f"last_{col}") for col in cols2] + [pl.col(col).drop_nulls().first().alias(f"first_{col}") for col in cols2] # no effect

        return expr_1 + expr_2 # + expr_4 # + expr_3
    
    @staticmethod
    def person_2_expr(df):
        # cols = [col for col in df.columns]
        cols = ['empls_economicalst_849M', 'empls_employedfrom_796D', 'empls_employer_name_740M'] # + \
            # ['relatedpersons_role_762T', 'conts_role_79M']
            # ['addres_district_368M', 'addres_role_871L', 'addres_zip_823M']

        expr_1 = [pl.first(col).alias(f"first_{col}") for col in cols]
        expr_2 = [pl.last(col).alias(f"last_{col}") for col in cols]

        # BAD
        # expr_ngc = [pl.count("num_group2").alias(f"count_num_group2")]
        # cols2 = [col for col in df.columns if (col in ("num_group1", "num_group2"))]
        # expr_ngmax = [pl.min(col).alias(f"min_{col}") for col in cols2] + [pl.max(col).alias(f"max_{col}") for col in cols2]

        # cols2 = [col for col in df.columns if col not in cols]
        # # expr_3 = [pl.max(col).alias(f"max_{col}") for col in cols2] + [pl.min(col).alias(f"min_{col}") for col in cols2] # no effect
        # expr_3 = [pl.col(col).drop_nulls().last().alias(f"last_{col}") for col in cols2] # no effect

        return expr_1 + expr_2 # + expr_3# + expr_ngc 

    @staticmethod
    def other_expr(df):
        expr_1 = [pl.first(col).alias(f"__other_{col}") for col in df.columns if ('num_group' not in col) and (col != 'case_id')]
        # cols1 = ['amtdepositbalance_4809441A', 'amtdepositincoming_4809444A', 'amtdepositoutgoing_4809442A']
        # expr_1 = [pl.last(col).alias(f"last_{col}") for col in cols1]
        # cols2 = ['amtdebitincoming_4809443A', 'amtdebitoutgoing_4809440A']
        # expr_3 = [(pl.col('amtdebitincoming_4809443A') - pl.col('amtdebitoutgoing_4809440A')).alias('amtdebit_incoming-outgoing')]
        return expr_1 # + expr_2 + expr_3
    
    
    @staticmethod
    def tax_a_exprs(df):
        cols = [col for col in df.columns if (col[-1] in ("T","L","M","D","P","A")) or ("num_group" in col)]
        expr_1 = [pl.max(col).alias(f"max_{col}") for col in cols] + [pl.min(col).alias(f"min_{col}") for col in cols] + \
            [pl.last(col).alias(f"last_{col}") for col in cols] + \
            [pl.first(col).alias(f"first_{col}") for col in cols] + \
            [pl.mean(col).alias(f"mean_{col}") for col in cols] + \
            [pl.std(col).alias(f"std_{col}") for col in cols]
        # expr_1 = [pl.max(col).alias(f"max_{col}") for col in ['amount_4527230A', 'recorddate_4527225D', 'num_group1']] + \
        #     [pl.min(col).alias(f"min_{col}") for col in ['amount_4527230A', 'recorddate_4527225D', ]] + \
        #     [pl.mean(col).alias(f"mean_{col}") for col in ['amount_4527230A']] + \
        #     [pl.std(col).alias(f"std_{col}") for col in ['amount_4527230A']] + \
        #     [pl.last(col).alias(f"last_{col}") for col in ['amount_4527230A', 'recorddate_4527225D', 'name_4527232M']] + \
        #     [pl.first(col).alias(f"first_{col}") for col in ['amount_4527230A', 'recorddate_4527225D', 'name_4527232M']] # BAD?

        expr_4 = [pl.col(col).fill_null(strategy="zero").map_elements(lambda x: x.max() - x.min(), return_dtype=pl.Float32).alias(f"max-min_gap_depth2_{col}") for col in ['amount_4527230A']]

        return expr_1 + expr_4


    @staticmethod
    def bureau_a2(df): # 122만
        # cols = ['collater_valueofguarantee_1124L', 'pmts_dpd_1073P', 'pmts_overdue_1140A',]
        cols = [col for col in df.columns if (col[-1] in ("T","L","M","D","P","A")) or ("num_group" in col)]

        expr_1 = [pl.max(col).alias(f"max_depth2_{col}") for col in cols]
        expr_2 = [pl.min(col).alias(f"min_depth2_{col}") for col in cols]
        expr_3 = [pl.mean(col).alias(f"mean_depth2_{col}") for col in cols] + \
            [pl.std(col).alias(f"std_{col}") for col in cols]
        # expr_ngs = [pl.max(col).alias(f"max_{col}") for col in ['num_group1', 'num_group2', ]]

        expr_4 = [pl.col(col).fill_null(strategy="zero").map_elements(lambda x: x.max() - x.min(), return_dtype=pl.Float32).alias(f"max-min_gap_depth2_{col}") for col in ['collater_valueofguarantee_1124L', 'pmts_dpd_1073P', 'pmts_overdue_1140A',]]

        expr_ngc = [pl.count("num_group2").alias(f"count_depth2_a2_num_group2")]

        # expr_5 = [pl.last(col).alias(f"last_{col}") for col in cols] + \
        #     [pl.first(col).alias(f"first_{col}") for col in cols] + \
        #     [pl.std(col).alias(f"std_{col}") for col in cols]

        return expr_1 + expr_2 + expr_3 + expr_4 + expr_ngc # + expr_5
    
    @staticmethod
    def get_exprs(df):
        exprs = Aggregator.num_expr(df)

        return exprs
    
    # no use from here
    @staticmethod
    def applprev2_exprs(df):
        cols = [col for col in df.columns if "num_group" not in col]
        # expr_1 = [pl.max(col).alias(f"max_{col}") for col in cols] + [pl.min(col).alias(f"min_{col}") for col in cols] 
        expr_2 = [pl.first(col).alias(f"first_{col}") for col in cols]#  + [pl.last(col).alias(f"last_{col}") for col in cols]
        return []#expr_2
    
    @staticmethod
    def bureau_b1(df):  # 0.95 filterにかかるため未使用
        # cols = [col for col in df.columns if (col[-1] in ("T","L","M","D","P","A")) or ("num_group" in col)]

        # expr_1 = [pl.max(col).alias(f"bureau_b1_max_{col}") for col in cols]
        # expr_2 = [pl.min(col).alias(f"bureau_b1_min_{col}") for col in cols]

        # return expr_1 + expr_2 #  + expr_3
        return []
    
    
    @staticmethod
    def bureau_b2(df):  # 0.95filterにかかるため未使用
        # cols = [col for col in df.columns if (col[-1] in ("T","L","M","D","P","A")) or ("num_group" in col)]

        # expr_1 = [pl.max(col).alias(f"bureau_b2_max_{col}") for col in cols]
        # expr_2 = [pl.min(col).alias(f"bureau_b2_min_{col}") for col in cols]

        # return expr_1 + expr_2 #  + expr_3
        return []

    
def agg_by_case(path, df):
    path = str(path)
    if '_applprev_1' in path:
        df = df.sort("num_group1").group_by("case_id").agg(Aggregator.get_exprs(df))

#     elif '_applprev_2' in path:
#         df = df.group_by("case_id").agg(Aggregator.applprev2_exprs(df))

    elif '_credit_bureau_a_1' in path:
        df = df.sort("num_group1").group_by("case_id").agg(Aggregator.bureau_a1(df))

    elif '_credit_bureau_b_1' in path:
        df = df.sort("num_group1").group_by("case_id").agg(Aggregator.bureau_b1(df))

    elif '_deposit_1' in path:
        df = df.sort("num_group1").group_by("case_id").agg(Aggregator.deposit_exprs(df))
    elif '_debitcard_1' in path:
        df = df.sort("num_group1").group_by("case_id").agg(Aggregator.debitcard_exprs(df))
        
    elif '_tax_registry_a' in path:
        df = df.sort("num_group1").group_by("case_id").agg(Aggregator.tax_a_exprs(df))
    elif '_tax_registry_b' in path:
        df = df.sort("num_group1").group_by("case_id").agg(Aggregator.get_exprs(df))
    elif '_tax_registry_c' in path:
        df = df.sort("num_group1").group_by("case_id").agg(Aggregator.get_exprs(df))
        
    elif '_other_1' in path:
        df = df.sort("num_group1").group_by("case_id").agg(Aggregator.other_expr(df))
    elif '_person_1' in path:
        df = df.sort("num_group1").group_by("case_id").agg(Aggregator.person_expr(df))
    elif '_person_2' in path:
        df = df.group_by("case_id").agg(Aggregator.person_2_expr(df))

    elif '_credit_bureau_a_2' in path:
        df = df.group_by("case_id").agg(Aggregator.bureau_a2(df))
    elif '_credit_bureau_b_2' in path:
        df = df.group_by("case_id").agg(Aggregator.get_exprs(df))
    
    return df

def read_file(path, depth=None): 
    df = pl.read_parquet(path)
    df = df.pipe(Pipeline.set_table_dtypes)
    
    if depth in [1, 2]:
        df = agg_by_case(path, df)
    
    return df

def read_files(regex_path, depth=None):
    print(regex_path)
    chunks = []
    for path in glob(str(regex_path)):
        df = pl.read_parquet(path)
        df = df.pipe(Pipeline.set_table_dtypes)
        if depth in [1, 2]:
            df = agg_by_case(path, df)
        chunks.append(df)
        
    df = pl.concat(chunks, how="vertical_relaxed")
    df = df.unique(subset=["case_id"])
    
    return df

def feature_eng(df_base, depth_0, depth_1, depth_2):
    df_base = (
        df_base.with_columns(
            decision_month = pl.col("date_decision").dt.month(),
            decision_weekday = pl.col("date_decision").dt.weekday(),
        )
    )
        
    for i, df in enumerate(depth_0 + depth_1 + depth_2):
        df_base = df_base.join(df, how="left", on="case_id", suffix=f"_{i}")
        
    df_base = df_base.pipe(Pipeline.handle_dates)
    return df_base

def to_pandas(df_data, cat_cols=None):
    df_data = df_data.to_pandas()
    print(df_data.info())
    if cat_cols is None:
        cat_cols = list(df_data.select_dtypes("object").columns)
        #cat_cols = [c for c in cat_cols if 'diff_' not in c]
    
    df_data[cat_cols] = df_data[cat_cols].fillna("Missing").astype("category")
    
    return df_data, cat_cols

# train fe

In [10]:
train_data_store = {
    "df_base": read_file(TRAIN_DIR / "train_base.parquet"),
    "depth_0": [
        read_file(TRAIN_DIR / "train_static_cb_0.parquet"),
        read_files(TRAIN_DIR / "train_static_0_*.parquet"),
    ],
    "depth_1": [
        read_files(TRAIN_DIR / "train_applprev_1_*.parquet", 1),
        read_files(TRAIN_DIR / "train_credit_bureau_a_1_*.parquet", 1),
        read_file(TRAIN_DIR / "train_credit_bureau_b_1.parquet", 1),
        read_file(TRAIN_DIR / "train_deposit_1.parquet", 1),
        read_file(TRAIN_DIR / "train_debitcard_1.parquet", 1),
        read_file(TRAIN_DIR / "train_tax_registry_a_1.parquet", 1),
        read_file(TRAIN_DIR / "train_tax_registry_b_1.parquet", 1),
        read_file(TRAIN_DIR / "train_tax_registry_c_1.parquet", 1),
        read_file(TRAIN_DIR / "train_person_1.parquet", 1),
        read_file(TRAIN_DIR / "train_other_1.parquet", 1),
    ],
    "depth_2": [
        read_files(TRAIN_DIR / "train_credit_bureau_a_2_*.parquet", 2),
        read_file(TRAIN_DIR / "train_credit_bureau_b_2.parquet", 2),
        read_file(TRAIN_DIR / "train_person_2.parquet", 2),
    ]
}

../data/parquet_files/train/train_static_0_*.parquet
../data/parquet_files/train/train_applprev_1_*.parquet
../data/parquet_files/train/train_credit_bureau_a_1_*.parquet
../data/parquet_files/train/train_credit_bureau_a_2_*.parquet


In [11]:
train_overall = feature_eng(**train_data_store)

In [12]:
del train_data_store

# post fe

In [13]:
train_overall, str_features = to_pandas(train_overall)
train_overall.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1526659 entries, 0 to 1526658
Columns: 817 entries, case_id to last_empls_employer_name_740M
dtypes: bool(1), float32(4), float64(660), int64(4), int8(2), object(146)
memory usage: 9.2+ GB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1526659 entries, 0 to 1526658
Columns: 817 entries, case_id to last_empls_employer_name_740M
dtypes: bool(1), category(146), float32(4), float64(660), int64(4), int8(2)
memory usage: 7.9 GB


In [14]:
train_overall = reduce_mem_usage(train_overall)
train_overall.info()

Memory usage of dataframe is 8060.49 MB
Memory usage after optimization is: 2986.56 MB
Decreased by 62.9%
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1526659 entries, 0 to 1526658
Columns: 817 entries, case_id to last_empls_employer_name_740M
dtypes: category(146), float16(416), float32(247), float64(2), int16(1), int32(1), int8(4)
memory usage: 2.9 GB


In [15]:
drop_cols = str_features[2:] #train_overall.pipe(Pipeline.filter_cols)
train_overall = train_overall.drop(drop_cols, axis=1)

In [16]:
selected_features = sorted([i for i in train_overall.columns if i not in ["case_id", "MONTH", "WEEK_NUM", "target"]])
selected_cat_features = [col for i, col in enumerate(selected_features) if col in str_features]

# model config

In [17]:
lgb_config = {    
        "es_round" : 20,
        "verbose_eval": 100,
        "params" : {
            'objective': 'binary', 
            "metric": "auc",
            "n_estimators": 100,#00,
            'learning_rate': 0.02,
            'scale_pos_weight': 10,
            'boosting_type': 'gbdt',
            'verbose': -1,
            'seed': 42,
            'num_leaves': 64, 
            "reg_alpha": 0.1,
            "reg_lambda": 10,
            "cat_smooth": 20,
            "device": "gpu",
        },   
    }
xgb_config = {
        "params" : {
            "n_estimators" : 10,
            'objective': "binary:logistic",
            "eval_metric": "auc",
            "importance_type": "gain",
            "enable_categorical": True,
            'learning_rate': 0.02,
            'scale_pos_weight': 10,
            'booster': 'gbtree',
            'verbosity': 0,
            'seed': 42,
            "reg_alpha": 0.1,
            "reg_lambda": 10,
            "device": "gpu",
            "early_stopping_rounds": 10,
            "verbose_eval": 10,
        },   
}
    
cb_config = {
        "task_type": "classification",
        "es_round" : 20,
        "verbose_eval": 500,
        "cat_features": selected_cat_features,
        "params" : { 
            'random_seed': 42,
            "learning_rate": 0.04,
            'use_best_model': True,
            'iterations': 100,#00,
            'reg_lambda': 10,
            "scale_pos_weight": 10,
            "task_type": "GPU",
            'loss_function': 'Logloss',
        },
    }

In [24]:
lgb_modelling = Common_LGB_Modelling(lgb_config, LGBMClassifier)
cb_modelling = Common_CB_Modelling(cb_config)
xgb_modelling = Common_XGB_Modelling(xgb_config, XGBClassifier)

In [19]:
mlflow.set_tracking_uri("http://mlflow:5000")
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='file:///tmp/working/mlruns/836281514613721981', creation_time=1730029393721, experiment_id='836281514613721981', last_update_time=1730029393721, lifecycle_stage='active', name='Home_Credit_2024', tags={}>

In [20]:
y_train = train_overall[TARGET].astype(np.float32).values
train = read_file(TRAIN_DIR / "train_base.parquet")
all_train_week_nums = train.unique("WEEK_NUM").sort("WEEK_NUM").select("WEEK_NUM").to_numpy().reshape(-1)
train_week_df = train.select("WEEK_NUM").to_pandas()
cv = StratifiedGroupKFold(n_splits=5, shuffle=False)
tmp = train_overall[["WEEK_NUM", "target"]]

# modelling

In [21]:
mlflow.lightgbm.autolog(log_input_examples = True, log_datasets=False, silent = True)
lgb_output =np.zeros(len(train_overall))

with mlflow.start_run(run_name = "lgb_"+run_postfix) as run:
    for fold, (tr_idx, val_idx) in enumerate(cv.split(train_overall, y_train, groups=train_week_df)):
        print("Fold :", fold + 1)
        with mlflow.start_run(run_name='fold_'+str(fold+1), nested=True) as child_run:    
            lgb_model, lgb_val_output = lgb_modelling.train_and_valid(train_overall.loc[tr_idx][selected_features], y_train[tr_idx],
                                                              train_overall.loc[val_idx][selected_features], y_train[val_idx])
        lgb_output[val_idx] = lgb_val_output
        
    mlflow.log_metric("overall score", roc_auc_score(tmp["target"].values, lgb_output))

Fold : 1
Training until validation scores don't improve for 20 rounds
[100]	valid_0's auc: 0.831437
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.831437
Evaluated only: auc


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2024/11/02 02:23:55 INFO mlflow.tracking._tracking_service.client: 🏃 View run fold_1 at: http://mlflow:5000/#/experiments/836281514613721981/runs/de19e3162d7e45dca256b255d0a80f1a.
2024/11/02 02:23:55 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow:5000/#/experiments/836281514613721981.


Fold : 2
Training until validation scores don't improve for 20 rounds
[100]	valid_0's auc: 0.83378
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.83378
Evaluated only: auc


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2024/11/02 02:24:29 INFO mlflow.tracking._tracking_service.client: 🏃 View run fold_2 at: http://mlflow:5000/#/experiments/836281514613721981/runs/e513540a9e2443a3a16ba7054b87cf0a.
2024/11/02 02:24:29 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow:5000/#/experiments/836281514613721981.


Fold : 3
Training until validation scores don't improve for 20 rounds
[100]	valid_0's auc: 0.837711
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.837711
Evaluated only: auc


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2024/11/02 02:25:04 INFO mlflow.tracking._tracking_service.client: 🏃 View run fold_3 at: http://mlflow:5000/#/experiments/836281514613721981/runs/38b26a16ad1c499fb089122261e85626.
2024/11/02 02:25:04 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow:5000/#/experiments/836281514613721981.


Fold : 4
Training until validation scores don't improve for 20 rounds
[100]	valid_0's auc: 0.835339
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.835339
Evaluated only: auc


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2024/11/02 02:25:40 INFO mlflow.tracking._tracking_service.client: 🏃 View run fold_4 at: http://mlflow:5000/#/experiments/836281514613721981/runs/0c940fb7280b42e89bf37a76bf8a39f6.
2024/11/02 02:25:40 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow:5000/#/experiments/836281514613721981.


Fold : 5
Training until validation scores don't improve for 20 rounds
[100]	valid_0's auc: 0.832092
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.832092
Evaluated only: auc


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2024/11/02 02:26:15 INFO mlflow.tracking._tracking_service.client: 🏃 View run fold_5 at: http://mlflow:5000/#/experiments/836281514613721981/runs/57203aff73f64f3992421148292a7fce.
2024/11/02 02:26:15 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow:5000/#/experiments/836281514613721981.
2024/11/02 02:26:15 INFO mlflow.tracking._tracking_service.client: 🏃 View run lgb_2024-11-02T02:21:33.200150 at: http://mlflow:5000/#/experiments/836281514613721981/runs/5736c0b30650478f9f6e1948e5bb7be1.
2024/11/02 02:26:15 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow:5000/#/experiments/836281514613721981.


In [27]:
cb_output =np.zeros(len(train_overall))
with mlflow.start_run(run_name = "cb_"+run_postfix) as run:
    for fold, (tr_idx, val_idx) in enumerate(cv.split(train_overall, y_train, groups=train_week_df)):
        print("Fold :", fold + 1)
        with mlflow.start_run(run_name='fold_'+str(fold+1), nested=True) as child_run:    
            cb_model, cb_val_output = cb_modelling.train_and_valid(
                                                            train_overall.loc[tr_idx][selected_features], y_train[tr_idx],
                                                            train_overall.loc[val_idx][selected_features], y_train[val_idx])
            mlflow.catboost.log_model(cb_model, "artifacts")
            mlflow.log_params(cb_config)
            
            dataset = mlflow.data.from_pandas(train_overall.head(1)[selected_features])
            mlflow.log_input(dataset)
        cb_output[val_idx] = cb_val_output
    mlflow.log_metric("overall score", roc_auc_score(tmp["target"].values, cb_output)) 

Fold : 1
0:	learn: 0.6748175	test: 0.6762859	best: 0.6762859 (0)	total: 54ms	remaining: 5.35s
99:	learn: 0.4325537	test: 0.4502522	best: 0.4502522 (99)	total: 5.35s	remaining: 0us
bestTest = 0.4502522288
bestIteration = 99


2024/11/02 02:36:55 INFO mlflow.tracking._tracking_service.client: 🏃 View run fold_1 at: http://mlflow:5000/#/experiments/836281514613721981/runs/be07e62c1d6d4b31b636d5fafb49a957.
2024/11/02 02:36:55 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow:5000/#/experiments/836281514613721981.


Fold : 2
0:	learn: 0.6752945	test: 0.6749283	best: 0.6749283 (0)	total: 53.2ms	remaining: 5.27s
99:	learn: 0.4367613	test: 0.4338011	best: 0.4338011 (99)	total: 5.45s	remaining: 0us
bestTest = 0.4338010622
bestIteration = 99


2024/11/02 02:38:44 INFO mlflow.tracking._tracking_service.client: 🏃 View run fold_2 at: http://mlflow:5000/#/experiments/836281514613721981/runs/c4a18a52d1684c298ecde3882cf26ac0.
2024/11/02 02:38:44 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow:5000/#/experiments/836281514613721981.


Fold : 3
0:	learn: 0.6755158	test: 0.6750716	best: 0.6750716 (0)	total: 51.8ms	remaining: 5.13s
99:	learn: 0.4370742	test: 0.4315417	best: 0.4315417 (99)	total: 5.41s	remaining: 0us
bestTest = 0.4315417353
bestIteration = 99


2024/11/02 02:40:32 INFO mlflow.tracking._tracking_service.client: 🏃 View run fold_3 at: http://mlflow:5000/#/experiments/836281514613721981/runs/879bd0237f684ed0a7a739d82b7f52d7.
2024/11/02 02:40:32 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow:5000/#/experiments/836281514613721981.


Fold : 4
0:	learn: 0.6751605	test: 0.6752585	best: 0.6752585 (0)	total: 55.9ms	remaining: 5.54s
99:	learn: 0.4355594	test: 0.4380299	best: 0.4380299 (99)	total: 5.56s	remaining: 0us
bestTest = 0.4380298725
bestIteration = 99


2024/11/02 02:42:22 INFO mlflow.tracking._tracking_service.client: 🏃 View run fold_4 at: http://mlflow:5000/#/experiments/836281514613721981/runs/9c6a31204cd347a4897af54da4f3cd96.
2024/11/02 02:42:22 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow:5000/#/experiments/836281514613721981.


Fold : 5
0:	learn: 0.6752273	test: 0.6747053	best: 0.6747053 (0)	total: 52.9ms	remaining: 5.24s
99:	learn: 0.4360877	test: 0.4364281	best: 0.4364281 (99)	total: 5.41s	remaining: 0us
bestTest = 0.436428061
bestIteration = 99


2024/11/02 02:44:10 INFO mlflow.tracking._tracking_service.client: 🏃 View run fold_5 at: http://mlflow:5000/#/experiments/836281514613721981/runs/9418484785b84a27a9f9f5b0e5e317ca.
2024/11/02 02:44:10 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow:5000/#/experiments/836281514613721981.
2024/11/02 02:44:10 INFO mlflow.tracking._tracking_service.client: 🏃 View run cb_2024-11-02T02:21:33.200150 at: http://mlflow:5000/#/experiments/836281514613721981/runs/901b38088617414d940f4b6cd3b44c7f.
2024/11/02 02:44:10 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow:5000/#/experiments/836281514613721981.


mlflow.xgboost.autolog(log_input_examples = True, log_datasets=False, silent = True)
xgb_output =np.zeros(len(train_overall))

with mlflow.start_run(run_name = "xgb_"+run_postfix) as run:
    for fold, (tr_idx, val_idx) in enumerate(cv.split(train_overall, y_train, groups=train_week_df)):
        print("Fold :", fold + 1)
        with mlflow.start_run(run_name='fold_'+str(fold+1), nested=True) as child_run:    
            xgb_model, xgb_val_output = treemodel.xgb_train_and_valid(train_overall.loc[tr_idx][selected_features], y_train[tr_idx],
                                                              train_overall.loc[val_idx][selected_features], y_train[val_idx],
                                                              config["lgb"]["params"])
        xgb_output[val_idx] = xgb_val_output
        
    mlflow.log_metric("overall score", roc_auc_score(tmp["target"].values, xgb_output))