## Baseline

https://www.kaggle.com/code/pxcai666/catboost-lightgbm-ensemble/notebook

### Configuration

In [1]:
import sys
from pathlib import Path
import subprocess
import os
import gc
from glob import glob

import numpy as np
import pandas as pd
import polars as pl
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import TimeSeriesSplit, GroupKFold, StratifiedGroupKFold
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier, Pool
import lightgbm as lgb

from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import KNNImputer



In [2]:
ROOT = "/kaggle/input/home-credit-credit-risk-model-stability"

TRAIN_DIR = os.path.join(ROOT, "parquet_files", "train")
TEST_DIR = os.path.join(ROOT, "parquet_files", "test")

In [3]:
device = "gpu"

In [4]:
class Pipeline:
    @staticmethod
    def set_table_dtypes(df: pl.DataFrame):
        for col in df.columns:
            if col in ["case_id", "WEEK_NUM", "num_group1", "num_group2"]:
                df = df.with_columns(pl.col(col).cast(pl.Int64))
            elif col in ["date_decision"] or col.endswith("D"):
                df = df.with_columns(pl.col(col).str.strptime(pl.Date, "%Y-%m-%d"))
            elif col.endswith(("P", "A")):
                df = df.with_columns(pl.col(col).cast(pl.Float64))
            elif col.endswith("M"):
                df = df.with_columns(pl.col(col).cast(pl.Utf8))

        return df

    @staticmethod
    def handle_dates(df: pl.DataFrame) -> pl.DataFrame:
        """日付関連の列を処理

        - date_decisionからの日数差分を取得
        """
        for col in df.columns:
            if col.endswith("D"):
                df = df.with_columns((pl.col(col) - pl.col("date_decision")).dt.days().alias(col))

        df = df.drop("date_decision", "MONTH")

        return df

    @staticmethod
    def filter_cols(df: pl.DataFrame):
        for col in df.columns:
            if col not in ["target", "case_id", "WEEK_NUM"]:
                isnull = df[col].is_null().mean()
                if isnull > 0.7:
                    df = df.drop(col)

        for col in df.columns:
            if (col not in ["target", "case_id", "WEEK_NUM"]) & (df[col].dtype == pl.Utf8):
                freq = df[col].n_unique()
                if (freq == 1) | (freq > 200):
                    df = df.drop(col)

        return df

In [5]:

class Aggregator:
    @staticmethod
    def get_exprs(df):
        exprs = (
            Aggregator.num_expr(df)
            + Aggregator.date_expr(df)
            + Aggregator.str_expr(df)
            + Aggregator.other_expr(df)
            + Aggregator.count_expr(df)
        )

        return exprs

    @staticmethod
    def num_expr(df):
        cols = [col for col in df.columns if col.endswith(("P", "A"))]

        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        expr_last = [pl.last(col).alias(f"last_{col}") for col in cols]
        #expr_first = [pl.first(col).alias(f"first_{col}") for col in cols]
        expr_mean = [pl.mean(col).alias(f"mean_{col}") for col in cols]

        return expr_max + expr_last + expr_mean

    @staticmethod
    def date_expr(df):
        cols = [col for col in df.columns if col.endswith("D")]

        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        #expr_min = [pl.min(col).alias(f"min_{col}") for col in cols]
        expr_last = [pl.last(col).alias(f"last_{col}") for col in cols]
        #expr_first = [pl.first(col).alias(f"first_{col}") for col in cols]
        expr_mean = [pl.mean(col).alias(f"mean_{col}") for col in cols]

        return expr_max + expr_last + expr_mean

    @staticmethod
    def str_expr(df):
        cols = [col for col in df.columns if col.endswith("M")]

        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        #expr_min = [pl.min(col).alias(f"min_{col}") for col in cols]
        expr_last = [pl.last(col).alias(f"last_{col}") for col in cols]
        #expr_first = [pl.first(col).alias(f"first_{col}") for col in cols]
        #expr_count = [pl.count(col).alias(f"count_{col}") for col in cols]

        return expr_max + expr_last

    @staticmethod
    def other_expr(df):
        cols = [col for col in df.columns if col.endswith(("T", "L"))]

        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        #expr_min = [pl.min(col).alias(f"min_{col}") for col in cols]
        expr_last = [pl.last(col).alias(f"last_{col}") for col in cols]
        #expr_first = [pl.first(col).alias(f"first_{col}") for col in cols]

        return expr_max + expr_last


    @staticmethod
    def count_expr(df):
        cols = [col for col in df.columns if "num_group" in col]

        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        #expr_min = [pl.min(col).alias(f"min_{col}") for col in cols]
        expr_last = [pl.last(col).alias(f"last_{col}") for col in cols]
        #expr_first = [pl.first(col).alias(f"first_{col}") for col in cols]

        return expr_max + expr_last

### Read data

In [6]:
def read_file(path, depth=None):
  df = pl.read_parquet(path)
  df = df.pipe(Pipeline.set_table_dtypes)

  # depth>0 は集計値を使わないと、left join時にOOMになる（RAM 64GBでも)
  if depth in [1, 2]:
    df = df.group_by("case_id").agg(Aggregator.get_exprs(df))

  return df

def read_files(regex_path, depth=None):
  chunks = []
  for path in glob(str(regex_path)):
      df = pl.read_parquet(path)
      df = df.pipe(Pipeline.set_table_dtypes)
      if depth in [1, 2]:
          df = df.group_by("case_id").agg(Aggregator.get_exprs(df))
      chunks.append(df)

  df = pl.concat(chunks, how="vertical_relaxed")
  df = df.unique(subset=["case_id"])

  return df

In [7]:
%%time
data_store = {
    "base": read_file(os.path.join(TRAIN_DIR, "train_base.parquet")),
    "depth_0": [
        read_file(os.path.join(TRAIN_DIR, "train_static_cb_0.parquet")),
        read_files(os.path.join(TRAIN_DIR, "train_static_0_*.parquet")),
    ],
    "depth_1": [
        read_files(os.path.join(TRAIN_DIR, "train_applprev_1_*.parquet"), 1),
        read_file(os.path.join(TRAIN_DIR, "train_tax_registry_a_1.parquet"), 1),
        read_file(os.path.join(TRAIN_DIR, "train_tax_registry_b_1.parquet"), 1),
        read_file(os.path.join(TRAIN_DIR, "train_tax_registry_c_1.parquet"), 1),
        read_files(os.path.join(TRAIN_DIR, "train_credit_bureau_a_1_*.parquet"), 1),
        read_file(os.path.join(TRAIN_DIR, "train_credit_bureau_b_1.parquet"), 1),
        read_file(os.path.join(TRAIN_DIR, "train_other_1.parquet"), 1),
        read_file(os.path.join(TRAIN_DIR, "train_person_1.parquet"), 1),
        read_file(os.path.join(TRAIN_DIR, "train_deposit_1.parquet"), 1),
        read_file(os.path.join(TRAIN_DIR, "train_debitcard_1.parquet"), 1),
    ],
    "depth_2": [
        read_file(os.path.join(TRAIN_DIR, "train_credit_bureau_b_2.parquet"), 2),
        read_files(os.path.join(TRAIN_DIR, "train_credit_bureau_a_2_*.parquet"), 2),
        read_file(os.path.join(TRAIN_DIR, "train_applprev_2.parquet"), 2),
        read_file(os.path.join(TRAIN_DIR, "train_person_2.parquet"), 2)
    ],
}

CPU times: user 2min 50s, sys: 25.3 s, total: 3min 15s
Wall time: 14.8 s


### Def utility functions

In [8]:
def to_pandas(df, cat_cols=None):
    df = df.to_pandas()
    if cat_cols is None:
        cat_cols = list(df.select_dtypes("object").columns)
    df[cat_cols] = df[cat_cols].astype("category")

    return df, cat_cols

def reduce_mem_usage(df):
    """This method reduces memory for numeric columns in the dataframe"""
    numerics = [
        "int16",
        "int32",
        "int64",
        "float16",
        "float32",
        "float64",
        "uint16",
        "uint32",
        "uint64",
    ]
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes

        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()

            if "int" in str(col_type):
                if (
                    c_min >= np.iinfo(np.int8).min
                    and c_max < np.iinfo(np.int8).max
                ):
                    df[col] = df[col].astype(np.int8)
                elif (
                    c_min >= np.iinfo(np.int16).min
                    and c_max < np.iinfo(np.int16).max
                ):
                    df[col] = df[col].astype(np.int16)
                elif (
                    c_min >= np.iinfo(np.int32).min
                    and c_max < np.iinfo(np.int32).max
                ):
                    df[col] = df[col].astype(np.int32)
                elif (
                    c_min >= np.iinfo(np.int64).min
                    and c_max < np.iinfo(np.int64).max
                ):
                    df[col] = df[col].astype(np.int64)
            else:
                if (
                    c_min >= np.finfo(np.float16).min
                    and c_max < np.finfo(np.float16).max
                ):
                    df[col] = df[col].astype(np.float16)
                if (
                    c_min >= np.finfo(np.float32).min
                    and c_max < np.finfo(np.float32).max
                ):
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2

    print(f"start - end memory:- {start_mem:5.2f} - {end_mem:5.2f} Mb")

    return df

### Feature engineering

In [9]:
def feature_eng(base, depth_0, depth_1, depth_2):
  base_df = (
    base.with_columns(
      month_decision=pl.col("date_decision").dt.month(),
      weekday_decision=pl.col("date_decision").dt.weekday(),
    )
  )

  # 全てのdfをleft join
  for i, df in enumerate(depth_0+depth_1+depth_2):
    base_df = base_df.join(df, how="left", on="case_id", suffix=f"_{i}")

  base_df = base_df.pipe(Pipeline.handle_dates)

  return base_df

In [11]:
train_df = feature_eng(**data_store)

del data_store
gc.collect()


train_df = train_df.pipe(Pipeline.filter_cols)
train_df, cat_cols = to_pandas(train_df)
train_df = reduce_mem_usage(train_df)

print(f"train data shape: {train_df.shape}")

start - end memory:- 4073.77 - 2099.53 Mb
train data shape: (1526659, 450)


In [12]:
nums = train_df.select_dtypes(exclude="category").columns
nan_df = train_df[nums].isna()

nan_groups = {}
for col in nums:
    cur_group = nan_df[col].sum()
    try:
        nan_groups[cur_group].append(col)
    except:
        nan_groups[cur_group] = [col]

del nan_df
gc.collect()

def reduce_group(grps):
    """
    Reduce the number of columns in each group by selecting the column with the highest number of unique values

    Args:
        grps: list of list of str
    Returns:
        list of str
    """
    use = []
    for g in grps:
        mx = 0
        vx = g[0]
        for gg in g:
            n = train_df[gg].nunique()
            if n > mx:
                mx = n
                vx = gg
        use.append(vx)

    return use

def group_columns_by_correlation(matrix, threshold=0.8):
    """
    Group columns that are highly correlated

    Args:
        - matrix: pd.DataFrame
        - threshold: float
    Returns:
        list of list of str
    """
    correlation_matrix = matrix.corr()
    groups = []
    remaining_cols = list(matrix.columns)
    while remaining_cols:
        col = remaining_cols.pop(0)
        group = [col]
        correlated_cols = [col]
        for c in remaining_cols:
            if correlation_matrix.loc[col, c] > threshold:
                group.append(c)
                correlated_cols.append(c)
        groups.append(group)
        remaining_cols = [c for c in remaining_cols if c not in correlated_cols]

    return groups

uses = []
for k, v in nan_groups.items():
    if len(v) > 1:
        Vs = nan_groups[k]
        grps = group_columns_by_correlation(train_df[Vs], threshold=0.8)
        use = reduce_group(grps)
        uses += use
    else:
        uses += v

uses += list(train_df.select_dtypes("category").columns)
train_df = train_df[uses]

In [13]:
pd.set_option('display.max_columns', 500)
train_df

Unnamed: 0,case_id,WEEK_NUM,target,month_decision,weekday_decision,credamount_770A,applicationcnt_361L,applications30d_658L,applicationscnt_1086L,applicationscnt_464L,applicationscnt_867L,clientscnt_1022L,clientscnt_100L,clientscnt_1071L,clientscnt_1130L,clientscnt_157L,clientscnt_257L,clientscnt_304L,clientscnt_360L,clientscnt_493L,clientscnt_533L,clientscnt_887L,clientscnt_946L,deferredmnthsnum_166L,disbursedcredamount_1113A,downpmt_116A,homephncnt_628L,isbidproduct_1095L,mobilephncnt_593L,numactivecreds_622L,numactivecredschannel_414L,numactiverelcontr_750L,numcontrs3months_479L,numnotactivated_1143L,numpmtchanneldd_318L,numrejects9m_859L,sellerplacecnt_915L,max_mainoccupationinc_384A,max_birth_259D,max_num_group1_9,birthdate_574D,dateofbirth_337D,days180_256L,days30_165L,days360_512L,firstquarter_103L,fourthquarter_440L,secondquarter_766L,thirdquarter_1082L,max_debtoutstand_525A,max_debtoverdue_47A,max_refreshdate_3813885D,pmtscount_423L,pmtssum_45A,responsedate_1012D,responsedate_4527233D,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,numinstlswithdpd5_4187116L,annuitynextmonth_57A,currdebt_22A,currdebtcredtyperange_828A,numinstls_657L,totalsettled_863A,mindbddpdlast24m_3658935P,avgdbddpdlast3m_4187120P,mindbdtollast24m_4525191P,avgdpdtolclosure24_3658938P,avginstallast24m_3658937A,maxinstallast24m_3658928A,avgmaxdpdlast9m_3716943P,avgoutstandbalancel6m_4187114A,avgpmtlast12m_4525200A,cntincpaycont9m_3716944L,cntpmts24_3658933L,commnoinclast6m_3546845L,maxdpdfrom6mto36m_3546853P,datefirstoffer_1144D,datelastunpaid_3546854D,daysoverduetolerancedd_3976961L,numinsttopaygr_769L,dtlastpmtallstes_4499206D,eir_270L,firstclxcampaign_1125D,firstdatedue_489D,lastactivateddate_801D,lastapplicationdate_877D,max_num_group1,max_num_group2_14,last_num_group1_14,last_num_group2_14,lastapprcredamount_781A,lastapprdate_640D,lastdelinqdate_224D,lastrejectcredamount_222A,lastrejectdate_50D,maininc_215A,mastercontrelectronic_519L,mastercontrexist_109L,maxannuity_159A,maxdebt4_972A,maxdpdlast24m_143P,maxdpdlast3m_392P,maxdpdtolerance_374P,maxdbddpdlast1m_3658939P,maxdbddpdtollast12m_3658940P,maxdbddpdtollast6m_4187119P,maxdpdinstldate_3546855D,maxdpdinstlnum_3546846P,maxlnamtstart6m_4525199A,maxoutstandbalancel12m_4187113A,numinstpaidearly_338L,numinstpaidearly5d_1087L,numinstpaidlate1d_3546852L,numincomingpmts_3546848L,numinstlsallpaid_934L,numinstlswithdpd10_728L,numinstlswithoutdpd_562L,numinstpaid_4499208L,numinstpaidearly3d_3546850L,numinstregularpaidest_4493210L,numinstpaidearly5dest_4493211L,sumoutstandtotalest_4493215A,numinstpaidlastcontr_4325080L,numinstregularpaid_973L,pctinstlsallpaidearl3d_427L,pctinstlsallpaidlate1d_3546856L,pctinstlsallpaidlat10d_839L,pctinstlsallpaidlate4d_3546849L,pctinstlsallpaidlate6d_3546844L,pmtnum_254L,posfpd10lastmonth_333P,posfpd30lastmonth_3976960P,posfstqpd30lastmonth_3976962P,price_1097A,sumoutstandtotal_3546847A,totaldebt_9A,mean_actualdpd_943P,max_annuity_853A,mean_annuity_853A,max_credacc_credlmt_575A,max_credamount_590A,max_downpmt_134A,mean_credacc_credlmt_575A,mean_credamount_590A,mean_downpmt_134A,max_currdebt_94A,mean_currdebt_94A,max_mainoccupationinc_437A,mean_mainoccupationinc_437A,mean_maxdpdtolerance_577P,max_outstandingdebt_522A,mean_outstandingdebt_522A,last_actualdpd_943P,last_annuity_853A,last_credacc_credlmt_575A,last_credamount_590A,last_downpmt_134A,last_currdebt_94A,last_mainoccupationinc_437A,last_maxdpdtolerance_577P,last_outstandingdebt_522A,max_approvaldate_319D,max_dateactivated_425D,max_dtlastpmt_581D,max_dtlastpmtallstes_3545839D,max_employedfrom_700D,max_firstnonzeroinstldate_307D,last_approvaldate_319D,last_creationdate_885D,last_dateactivated_425D,last_employedfrom_700D,last_firstnonzeroinstldate_307D,max_byoccupationinc_3656910L,max_childnum_21L,max_pmtnum_8L,last_childnum_21L,last_pmtnum_8L,max_pmtamount_36A,last_pmtamount_36A,max_processingdate_168D,last_processingdate_168D,max_num_group1_5,last_num_group1_5,mean_credlmt_230A,mean_credlmt_935A,mean_pmts_dpd_1073P,max_dpdmaxdatemonth_89T,max_dpdmaxdateyear_596T,max_pmts_dpd_303P,mean_dpdmax_757P,max_dpdmaxdatemonth_442T,max_dpdmaxdateyear_896T,mean_pmts_dpd_303P,mean_instlamount_768A,mean_monthlyinstlamount_332A,max_monthlyinstlamount_674A,mean_monthlyinstlamount_674A,mean_outstandingamount_354A,mean_outstandingamount_362A,mean_overdueamount_31A,mean_overdueamount_659A,max_numberofoverdueinstls_725L,mean_overdueamountmax2_14A,mean_totaloutstanddebtvalue_39A,max_dateofcredend_289D,max_dateofcredstart_739D,max_lastupdate_1112D,max_numberofcontrsvalue_258L,max_numberofoverdueinstlmax_1039L,max_overdueamountmaxdatemonth_365T,max_overdueamountmaxdateyear_2T,mean_pmts_overdue_1140A,max_pmts_month_158T,max_pmts_year_1139T,mean_overdueamountmax2_398A,max_dateofcredend_353D,max_dateofcredstart_181D,max_numberofoverdueinstlmax_1151L,mean_overdueamountmax_35A,max_overdueamountmaxdatemonth_284T,max_overdueamountmaxdateyear_994T,mean_pmts_overdue_1152A,max_residualamount_488A,mean_residualamount_856A,max_totalamount_6A,mean_totalamount_6A,mean_totalamount_996A,mean_totaldebtoverduevalue_718A,mean_totaloutstanddebtvalue_668A,max_numberofcontrsvalue_358L,max_dateofrealrepmt_138D,max_lastupdate_388D,max_numberofoverdueinstlmaxdat_148D,max_numberofoverdueinstlmaxdat_641D,max_overdueamountmax2date_1002D,max_overdueamountmax2date_1142D,last_refreshdate_3813885D,max_nominalrate_281L,max_nominalrate_498L,max_numberofinstls_229L,max_numberofinstls_320L,max_numberofoutstandinstls_520L,max_numberofoutstandinstls_59L,max_numberofoverdueinstls_834L,max_periodicityofpmts_1102L,max_periodicityofpmts_837L,max_num_group1_6,last_mainoccupationinc_384A,last_birth_259D,max_empl_employedfrom_271D,last_personindex_1023L,last_persontype_1072L,max_collater_valueofguarantee_1124L,max_collater_valueofguarantee_876L,max_pmts_month_706T,max_pmts_year_507T,last_pmts_month_158T,last_pmts_year_1139T,last_pmts_month_706T,last_pmts_year_507T,max_num_group1_13,max_num_group2_13,last_num_group2_13,max_num_group1_15,max_num_group2_15,description_5085714M,education_1103M,education_88M,maritalst_385M,maritalst_893M,requesttype_4525192L,credtype_322L,disbursementtype_67L,inittransactioncode_186L,lastapprcommoditycat_1041M,lastcancelreason_561M,lastrejectcommoditycat_161M,lastrejectcommodtypec_5251769M,lastrejectreason_759M,lastrejectreasonclient_4145040M,lastst_736L,opencred_647L,paytype1st_925L,paytype_783L,twobodfilling_608L,max_cancelreason_3545846M,max_education_1138M,max_postype_4733339M,max_rejectreason_755M,max_rejectreasonclient_4145042M,last_cancelreason_3545846M,last_education_1138M,last_postype_4733339M,last_rejectreason_755M,last_rejectreasonclient_4145042M,max_credtype_587L,max_familystate_726L,max_inittransactioncode_279L,max_isbidproduct_390L,max_status_219L,last_credtype_587L,last_familystate_726L,last_inittransactioncode_279L,last_isbidproduct_390L,last_status_219L,max_classificationofcontr_13M,max_classificationofcontr_400M,max_contractst_545M,max_contractst_964M,max_description_351M,max_financialinstitution_382M,max_financialinstitution_591M,max_purposeofcred_426M,max_purposeofcred_874M,max_subjectrole_182M,max_subjectrole_93M,last_classificationofcontr_13M,last_classificationofcontr_400M,last_contractst_545M,last_contractst_964M,last_description_351M,last_financialinstitution_382M,last_financialinstitution_591M,last_purposeofcred_426M,last_purposeofcred_874M,last_subjectrole_182M,last_subjectrole_93M,max_education_927M,max_empladdr_district_926M,max_empladdr_zipcode_114M,max_language1_981M,last_education_927M,last_empladdr_district_926M,last_empladdr_zipcode_114M,last_language1_981M,max_contaddr_matchlist_1032L,max_contaddr_smempladdr_334L,max_empl_employedtotal_800L,max_empl_industry_691L,max_familystate_447L,max_incometype_1044T,max_relationshiptoclient_415T,max_relationshiptoclient_642T,max_remitter_829L,max_role_1084L,max_safeguarantyflag_411L,max_sex_738L,max_type_25L,last_contaddr_matchlist_1032L,last_contaddr_smempladdr_334L,last_incometype_1044T,last_relationshiptoclient_642T,last_role_1084L,last_safeguarantyflag_411L,last_sex_738L,last_type_25L,max_collater_typofvalofguarant_298M,max_collater_typofvalofguarant_407M,max_collaterals_typeofguarante_359M,max_collaterals_typeofguarante_669M,max_subjectroles_name_541M,max_subjectroles_name_838M,last_collater_typofvalofguarant_298M,last_collater_typofvalofguarant_407M,last_collaterals_typeofguarante_359M,last_collaterals_typeofguarante_669M,last_subjectroles_name_541M,last_subjectroles_name_838M,max_cacccardblochreas_147M,last_cacccardblochreas_147M,max_conts_type_509L,last_conts_type_509L,max_conts_role_79M,max_empls_economicalst_849M,max_empls_employer_name_740M,last_conts_role_79M,last_empls_economicalst_849M,last_empls_employer_name_740M
0,0,0,0,1,4,30000.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30000.000000,0.0,0.0,False,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10800.0,-11874,3,,,,,,,,,,,,,,,,,,,,0.000000,0.000000,0.000000,0.0,0.000000,,,,,,,,,,,,0.0,0.0,,,,,,0.449951,,,,,,,,,,,,,,,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,24.0,0.0,0.0,,,,0.000000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-475.0,,5.0,,,,,,,,,,,,,,,,,,,,CAL,GBA,CASH,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,,,OTHER,OTHER,BO,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,False,False,MORE_FIVE,OTHER,MARRIED,SALARIED_GOVT,SPOUSE,SPOUSE,False,PE,True,F,PRIMARY_MOBILE,,,,COLLEAGUE,PE,,,PHONE,,,,,,,,,,,,,,,,,,,,,,
1,1,0,0,1,4,19999.800781,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19999.800781,0.0,0.0,False,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10000.0,-22435,4,,,,,,,,,,,,,,,,,,,,0.000000,0.000000,0.000000,0.0,0.000000,,,,,,,,,,,,0.0,0.0,,,,,,0.299805,,,,,,,,,,,,,,,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,18.0,0.0,0.0,,,,0.000000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-3718.0,,5.0,,,,,,,,,,,,,,,,,,,,CAL,GBA,CASH,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,,,OTHER,OTHER,BO,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,False,False,MORE_FIVE,OTHER,DIVORCED,SALARIED_GOVT,SIBLING,SIBLING,False,PE,True,M,PRIMARY_MOBILE,,,,OTHER_RELATIVE,PE,,,PHONE,,,,,,,,,,,,,,,,,,,,,,
2,2,0,0,1,5,78000.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,78000.000000,0.0,1.0,False,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14000.0,-16105,4,,,,,,,,,,,,,,,,,,,,0.000000,0.000000,0.000000,0.0,0.000000,,,,,,,,,,,,,,,,,,,0.449951,,,,-2102.0,1.0,1.0,1.0,0.0,,,,10000.000000,-2102.0,,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,36.0,0.0,0.0,,,,0.000000,0.0,1682.400024,1161.300049,0.0,16000.000000,0.0,0.0,13000.000000,0.0,,,8200.0,8200.000000,,,,0.0,1682.400024,0.0,16000.000000,0.0,,8200.0,,,,,,,-3244.0,-2072.0,,-2102.0,,-3244.0,-2072.0,,0.0,24.0,0.0,12.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-3244.0,,4.0,,,,,,,,,,,,,,,,,,,,CAL,GBA,CASH,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,D,False,OTHER,OTHER,BO,a55475b1,P97_36_170,a55475b1,a55475b1,a55475b1,a55475b1,P97_36_170,a55475b1,a55475b1,a55475b1,CAL,SINGLE,CASH,False,D,CAL,SINGLE,CASH,False,D,,,,,,,,,,,,,,,,,,,,,,,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,False,False,MORE_FIVE,OTHER,MARRIED,EMPLOYED,SPOUSE,SPOUSE,False,PE,True,F,PRIMARY_MOBILE,,,,SPOUSE,PE,,,PHONE,,,,,,,,,,,,,,,PRIMARY_MOBILE,PRIMARY_MOBILE,,,,,,
3,3,0,0,1,4,40000.000000,0.0,1.0,0.0,2.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40000.000000,0.0,0.0,False,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,10000.0,-9286,2,,,,,,,,,,,,,,,,,,,,0.000000,0.000000,0.000000,0.0,0.000000,,,,,,,,,,,,0.0,0.0,,,,,,0.419922,,,,4.0,0.0,2.0,0.0,0.0,,,,59999.800781,4.0,,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,12.0,0.0,0.0,,,,0.000000,0.0,6140.000000,6140.000000,0.0,59999.800781,0.0,0.0,59999.800781,0.0,,,11000.0,11000.000000,,,,0.0,6140.000000,0.0,59999.800781,0.0,,11000.0,,,,,,,-233.0,35.0,,4.0,,-233.0,35.0,,,12.0,,12.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-233.0,,4.0,,,,,,,,,,,,,,,,,,,,CAL,GBA,CASH,a55475b1,P94_109_143,a55475b1,a55475b1,P94_109_143,a55475b1,D,False,OTHER,OTHER,BO,P94_109_143,P97_36_170,a55475b1,P94_109_143,a55475b1,P94_109_143,P97_36_170,a55475b1,P94_109_143,a55475b1,CAL,MARRIED,CASH,False,D,CAL,MARRIED,CASH,False,D,,,,,,,,,,,,,,,,,,,,,,,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,False,False,MORE_FIVE,OTHER,MARRIED,EMPLOYED,SPOUSE,SPOUSE,False,PE,True,F,PRIMARY_MOBILE,,,,SPOUSE,PE,,,PHONE,,,,,,,,,,,,,,,PRIMARY_MOBILE,PHONE,,,,,,
4,4,0,1,1,5,44000.000000,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,44000.000000,0.0,1.0,False,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24000.0,-9134,3,,,,,,,,,,,,,,,,,,,,0.000000,0.000000,0.000000,0.0,0.000000,,,,,,,,,,,,0.0,0.0,,,,,,0.449951,,,,4.0,0.0,1.0,0.0,0.0,,,,,,,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,24.0,0.0,0.0,,,,0.000000,0.0,2556.600098,2556.600098,0.0,40000.000000,0.0,0.0,40000.000000,0.0,,,16000.0,16000.000000,,,,0.0,2556.600098,0.0,40000.000000,0.0,,16000.0,,,,,,,,35.0,,4.0,,,35.0,,,24.0,,24.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-1481.0,,5.0,,,,,,,,,,,,,,,,,,,,CAL,GBA,CASH,a55475b1,P24_27_36,a55475b1,a55475b1,a55475b1,a55475b1,T,False,OTHER,OTHER,BO,P24_27_36,a55475b1,a55475b1,a55475b1,a55475b1,P24_27_36,a55475b1,a55475b1,a55475b1,a55475b1,CAL,,CASH,False,T,CAL,,CASH,False,T,,,,,,,,,,,,,,,,,,,,,,,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,False,False,MORE_FIVE,OTHER,MARRIED,EMPLOYED,SIBLING,SIBLING,False,PE,True,F,PRIMARY_MOBILE,,,,SIBLING,PE,,,PHONE,,,,,,,,,,,,,,,PRIMARY_MOBILE,PRIMARY_MOBILE,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1526654,2703450,91,0,10,1,30000.000000,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30000.000000,0.0,0.0,True,3.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,40000.0,-22193,0,,-22192.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,10572.717773,0.0,14.0,,,,,0.0,176561.359375,1.0,0.000000,0.000000,0.000000,0.0,428159.656250,-144.0,-43.0,-144.0,0.0,7356.800293,46718.199219,0.0,16392.496094,6750.200195,15.0,24.0,0.0,0.0,-2784.0,-1481.0,8.0,0.0,,0.419922,-1610.0,-4832.0,-355.0,-362.0,12.0,2.0,11.0,0.0,20020.0,-362.0,-1481.0,150000.000000,-1040.0,36000.0,0.0,0.0,75521.906250,105019.789062,0.0,0.0,8.0,0.0,0.0,0.0,-1512.0,6.0,49651.402344,77533.757812,99.0,11.0,4.0,92.0,112.0,0.0,117.0,113.0,103.0,113.0,11.0,0.000000,12.0,113.0,0.911621,0.035400,0.026550,0.035400,0.035400,12.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,30875.000000,4890.250000,0.0,150000.000000,0.0,0.0,33377.500000,0.0,0.000000,0.000000,50000.0,23116.666016,0.545410,0.000000,0.000000,0.0,1267.800049,0.0,6380.000000,0.0,0.0,2400.0,0.0,0.0,-362.0,-355.0,3.0,3.0,-2090.0,-332.0,-4864.0,-4864.0,-4856.0,-8088.0,-4832.0,1.0,2.0,24.0,0.0,6.0,,,,,,,20600.0,,0.000000,1.0,2019.0,44.0,6.636364,12.0,2019.0,0.789551,,3773.600098,34925.253906,5634.094727,0.0,10572.717773,0.0,0.0,0.0,0.00000,10572.717773,94.0,-637.0,3.0,1.0,0.0,1.0,2019.0,0.000000,12.0,2021.0,430.945465,-27.0,-362.0,49.0,399.349091,12.0,2019.0,69.693535,0.0,,77607.804688,25252.519531,60602.0,0.0,0.0,11.0,-26.0,-26.0,-1482.0,,-1482.0,,,42.00000,45.0,20.0,24.0,0.0,3.0,0.0,30.0,30.0,11.0,40000.0,-22192.0,,0.0,1.0,0.0,0.0,12.0,2021.0,,,1.0,2021.0,10.0,35.0,23.0,0.0,0.0,2fc785b2,a55475b1,a55475b1,a55475b1,a55475b1,,CAL,GBA,CASH,P12_6_178,a55475b1,a55475b1,a55475b1,P94_109_143,P94_109_143,K,False,OTHER,OTHER,FO,a55475b1,a55475b1,P60_146_156,a55475b1,a55475b1,a55475b1,P97_36_170,P149_40_170,a55475b1,a55475b1,REL,MARRIED,POS,True,K,COL,MARRIED,POS,False,K,ea6782cc,ea6782cc,a55475b1,a55475b1,a55475b1,b619fa46,a55475b1,a55475b1,a55475b1,ab3c25cf,ab3c25cf,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,P209_127_106,a55475b1,a55475b1,a55475b1,P209_127_106,False,False,,,,RETIRED_PENSIONER,,,,CL,True,F,PRIMARY_MOBILE,False,False,RETIRED_PENSIONER,,CL,True,F,PRIMARY_MOBILE,a55475b1,a55475b1,c7a5ad39,c7a5ad39,ab3c25cf,ab3c25cf,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,PRIMARY_MOBILE,PRIMARY_MOBILE,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1
1526655,2703451,91,0,10,1,100000.000000,0.0,0.0,5.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40739.539062,0.0,2.0,True,2.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,36800.0,-25541,1,,-25536.0,0.0,0.0,0.0,1.0,0.0,1.0,2.0,64921.707031,0.0,14.0,,,,,0.0,301276.468750,0.0,6191.600098,68098.398438,68098.398438,24.0,701247.312500,-92.0,-12.0,-92.0,0.0,12553.200195,40499.800781,0.0,105129.312500,15780.400391,17.0,21.0,0.0,0.0,-1679.0,-128.0,2.0,11.0,,0.399902,-1679.0,-1876.0,-364.0,-371.0,5.0,3.0,1.0,2.0,0.0,-371.0,-128.0,,,,0.0,0.0,117251.601562,202775.546875,2.0,0.0,2.0,0.0,2.0,2.0,-128.0,7.0,116813.398438,250031.203125,70.0,0.0,1.0,69.0,73.0,0.0,76.0,75.0,70.0,75.0,0.0,68098.398438,0.0,75.0,0.945801,0.013512,0.000000,0.000000,0.000000,24.0,0.0,0.0,0.0,,68098.398438,68098.398438,0.0,12809.200195,6472.866699,0.0,114000.000000,0.0,0.0,74740.429688,0.0,59773.714844,9962.286133,50000.0,38960.000000,0.166626,68098.398438,11349.733398,0.0,5819.399902,0.0,60000.000000,0.0,0.0,24000.0,0.0,0.0,-371.0,-364.0,-7.0,8.0,,-341.0,-1907.0,-1907.0,-1907.0,,-1876.0,10340.0,0.0,24.0,0.0,18.0,,,,,,,,,0.000000,10.0,2019.0,0.0,0.000000,8.0,2019.0,0.000000,,6191.600098,34308.203125,20823.824219,0.0,64921.707031,0.0,0.0,0.0,0.00000,64921.707031,360.0,-371.0,3.0,1.0,0.0,10.0,2019.0,0.000000,12.0,2021.0,0.000000,-7.0,-616.0,0.0,0.000000,8.0,2019.0,0.000000,,,121868.601562,94845.796875,113980.0,0.0,0.0,4.0,-6.0,-6.0,,,,,-49.0,26.90625,45.0,21.0,24.0,0.0,12.0,0.0,30.0,30.0,8.0,,,,,1.0,0.0,0.0,12.0,2021.0,,,1.0,2021.0,3.0,23.0,23.0,0.0,0.0,2fc785b2,a55475b1,a55475b1,a55475b1,a55475b1,,CAL,GBA,CASH,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,A,False,OTHER,OTHER,FO,a55475b1,a55475b1,P46_145_78,a55475b1,a55475b1,a55475b1,P97_36_170,P177_117_192,a55475b1,a55475b1,REL,MARRIED,NDF,True,K,CAL,MARRIED,CASH,False,K,ea6782cc,ea6782cc,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,ab3c25cf,ab3c25cf,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,False,False,,,,RETIRED_PENSIONER,,,,CL,True,F,PRIMARY_MOBILE,,,,,CL,,,HOME_PHONE,a55475b1,a55475b1,c7a5ad39,c7a5ad39,ab3c25cf,ab3c25cf,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,PRIMARY_MOBILE,,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1
1526656,2703452,91,0,10,1,60000.000000,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60000.000000,0.0,0.0,True,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,30000.0,-15771,0,,-15768.0,2.0,0.0,3.0,0.0,1.0,0.0,4.0,20547.751953,0.0,14.0,,,,,0.0,14232.400391,0.0,0.000000,0.000000,0.000000,0.0,24002.000000,-27.0,,-55.0,1.0,2662.400146,3243.400146,,,1500.599976,0.0,7.0,0.0,4.0,,-569.0,4.0,0.0,-350.0,0.419922,,-720.0,-409.0,-45.0,2.0,1.0,1.0,1.0,3998.0,-413.0,-569.0,,,24000.0,0.0,0.0,6600.000000,17143.400391,4.0,0.0,4.0,,-27.0,,-597.0,5.0,4182.000000,0.000000,6.0,3.0,3.0,9.0,6.0,0.0,8.0,9.0,6.0,9.0,3.0,0.000000,3.0,9.0,0.666504,0.333252,0.000000,0.111084,0.000000,11.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,9048.000000,4597.466797,0.0,80000.000000,0.0,0.0,33232.000000,0.0,0.000000,0.000000,34000.0,29333.333984,0.500000,0.000000,0.000000,0.0,3243.400146,0.0,15698.000000,0.0,0.0,30000.0,1.0,0.0,-413.0,-409.0,-350.0,-350.0,-977.0,-14.0,-750.0,-750.0,-742.0,-977.0,-720.0,,,14.0,,6.0,,,,,,,4000.0,26500.0,0.444336,11.0,2020.0,0.0,0.000000,10.0,2019.0,0.000000,2248.766113,2248.766113,3242.800049,1721.000000,0.0,,0.0,0.0,0.0,2442.14917,20547.751953,153.0,-336.0,3.0,2.0,17.0,11.0,2020.0,135.674957,12.0,2021.0,0.000000,-321.0,-413.0,0.0,0.000000,10.0,2019.0,0.000000,0.0,10273.875977,17143.400391,10662.700195,,0.0,0.0,3.0,-321.0,-321.0,,-145.0,,-145.0,-49.0,,45.0,6.0,,0.0,,0.0,30.0,,8.0,30000.0,-15768.0,,0.0,1.0,0.0,0.0,12.0,2020.0,,,1.0,2020.0,2.0,35.0,11.0,0.0,0.0,2fc785b2,a55475b1,a55475b1,a55475b1,a55475b1,,CAL,GBA,CASH,P159_130_59,P180_60_137,a55475b1,a55475b1,a55475b1,a55475b1,T,False,OTHER,OTHER,BO,a55475b1,a55475b1,P67_102_161,a55475b1,a55475b1,a55475b1,P97_36_170,P177_117_192,a55475b1,a55475b1,COL,MARRIED,POS,True,T,COL,MARRIED,POS,False,K,ea6782cc,ea6782cc,a55475b1,a55475b1,a55475b1,b619fa46,b619fa46,a55475b1,a55475b1,ab3c25cf,ab3c25cf,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,P209_127_106,a55475b1,a55475b1,a55475b1,P209_127_106,False,False,,,,PRIVATE_SECTOR_EMPLOYEE,,,,CL,False,M,PRIMARY_MOBILE,False,False,PRIVATE_SECTOR_EMPLOYEE,,CL,False,M,PRIMARY_MOBILE,a55475b1,a55475b1,c7a5ad39,c7a5ad39,ab3c25cf,ab3c25cf,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,PRIMARY_MOBILE,,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1
1526657,2703453,91,0,10,1,6000.000000,0.0,0.0,36.0,0.0,9.0,1.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,6000.000000,0.0,1.0,True,2.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,30000.0,-25814,1,,-25808.0,2.0,1.0,4.0,1.0,3.0,2.0,1.0,42536.660156,0.0,14.0,,,,,0.0,197371.578125,9.0,2827.199951,46806.601562,46806.601562,30.0,440145.312500,-68.0,-64.0,-68.0,0.0,8212.600586,88740.804688,0.0,47943.062500,9921.200195,7.0,23.0,0.0,2.0,-2350.0,-994.0,38.0,17.0,,0.419922,-1202.0,-2624.0,-287.0,-292.0,8.0,3.0,0.0,0.0,0.0,-292.0,-994.0,2198.000000,-2656.0,,0.0,0.0,163202.000000,126780.000000,0.0,0.0,34.0,-66.0,0.0,-33.0,-1878.0,13.0,94265.203125,81604.601562,61.0,12.0,23.0,119.0,89.0,7.0,109.0,119.0,78.0,119.0,12.0,46806.601562,13.0,115.0,0.696289,0.205322,0.043488,0.108093,0.099121,6.0,0.0,0.0,0.0,0.0,46806.601562,46806.601562,0.0,5981.399902,1790.355591,0.0,123800.000000,0.0,0.0,28521.111328,0.0,34550.855469,3838.983887,76000.0,35625.000000,5.000000,46806.601562,5200.733398,0.0,416.200012,0.0,2198.000000,0.0,0.0,7000.0,,0.0,-292.0,-287.0,-293.0,4.0,,-261.0,,-2656.0,,,-2624.0,33059.0,0.0,48.0,0.0,6.0,,,,,,,0.0,,0.000000,1.0,2020.0,23.0,2.692308,12.0,2018.0,0.357910,,2827.199951,82759.375000,7102.968750,0.0,42536.660156,0.0,0.0,0.0,0.00000,42536.660156,621.0,-292.0,3.0,1.0,0.0,1.0,2020.0,0.000000,12.0,2021.0,766.815369,-293.0,-691.0,37.0,275.692322,12.0,2018.0,123.303925,0.0,,126780.000000,29781.183594,51996.0,0.0,0.0,13.0,-293.0,-293.0,-1840.0,,-1846.0,,,42.00000,45.0,48.0,30.0,0.0,21.0,0.0,30.0,30.0,13.0,,,,,1.0,0.0,0.0,12.0,2020.0,,,1.0,2020.0,12.0,35.0,23.0,0.0,0.0,2fc785b2,6b2ae0fa,a55475b1,3439d993,a55475b1,,CAL,GBA,CASH,a55475b1,a55475b1,P159_130_59,P174_113_42,a55475b1,a55475b1,A,False,OTHER,OTHER,BO,a55475b1,a55475b1,P46_145_78,a55475b1,a55475b1,a55475b1,P97_36_170,P177_117_192,a55475b1,a55475b1,REL,MARRIED,POS,False,K,COL,MARRIED,POS,False,D,ea6782cc,ea6782cc,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,ab3c25cf,ab3c25cf,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,False,False,,,,RETIRED_PENSIONER,,,,CL,False,F,PRIMARY_MOBILE,,,,,CL,,,HOME_PHONE,a55475b1,a55475b1,c7a5ad39,c7a5ad39,ab3c25cf,ab3c25cf,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,PRIMARY_MOBILE,PRIMARY_MOBILE,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1


### Train

In [14]:
y = train_df["target"]
weeks = train_df["WEEK_NUM"]

train_df = train_df.drop(columns=["target", "case_id", "WEEK_NUM"])
cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)

In [15]:
train_df[cat_cols] = train_df[cat_cols].astype(str)
# test_df[cat_cols] = test_df[cat_cols].astype(str)

In [16]:
params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 10,
    "learning_rate": 0.05,
    "n_estimators": 2000,
    "colsample_bytree": 0.8,
    "colsample_bynode": 0.8,
    "verbose": -1,
    "random_state": 42,
    "reg_alpha": 0.1,
    "reg_lambda": 10,
    "extra_trees":True,
    'num_leaves':64,
    "device": device,
    "verbose": -1,
}

In [17]:
params_2 = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 8,
    "learning_rate": 0.03,
    "n_estimators": 2000,
    "colsample_bytree": 0.8,
    "colsample_bynode": 0.8,
    "verbose": -1,
    "random_state": 42,
    "reg_alpha": 0.1,
    "reg_lambda": 10,
    "extra_trees":True,
    'num_leaves':50,
    "device": device,
    "verbose": -1,
}

In [18]:
%%time

fitted_models_cat = []
fitted_models_lgb = []

cv_scores_cat = []
cv_scores_lgb = []

iterator = 0
for idx_train, idx_valid in cv.split(train_df, y, groups=weeks):
    X_train, y_train = train_df.iloc[idx_train], y.iloc[idx_train]
    X_valid, y_valid = train_df.iloc[idx_valid], y.iloc[idx_valid]

    #######
    # cat #
    #######
    train_pool = Pool(X_train, y_train, cat_features=cat_cols)
    val_pool = Pool(X_valid, y_valid, cat_features=cat_cols)

    clf = CatBoostClassifier(
        eval_metric="AUC",
        task_type="GPU",
        learning_rate=0.03,
        iterations=6000,
    )

    clf.fit(train_pool, eval_set=val_pool, verbose=300)
    fitted_models_cat.append(clf)
    y_pred_valid = clf.predict_proba(X_valid)[:,1]
    auc_score = roc_auc_score(y_valid, y_pred_valid)
    cv_scores_cat.append(auc_score)

    #######
    # lgb #
    #######
    X_train[cat_cols] = X_train[cat_cols].astype("category")
    X_valid[cat_cols] = X_valid[cat_cols].astype("category")

    if iterator % 2 == 0:
        model = lgb.LGBMClassifier(**params)
    else:
        model = lgb.LGBMClassifier(**params_2)

    model.fit(
        X_train,
        y_train,
        eval_set = [(X_valid, y_valid)],
        callbacks = [lgb.log_evaluation(200), lgb.early_stopping(100)],
    )

    fitted_models_lgb.append(model)
    y_pred_valid = model.predict_proba(X_valid)[:,1]
    auc_score = roc_auc_score(y_valid, y_pred_valid)
    cv_scores_lgb.append(auc_score)

    iterator += 1


print("CV AUC scores: ", cv_scores_cat)
print("Maximum CV AUC score: ", max(cv_scores_cat))


print("CV AUC scores: ", cv_scores_lgb)
print("Maximum CV AUC score: ", max(cv_scores_lgb))

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6703970	best: 0.6703970 (0)	total: 192ms	remaining: 19m 11s
300:	test: 0.8427195	best: 0.8427195 (300)	total: 53.6s	remaining: 16m 55s
600:	test: 0.8487316	best: 0.8487316 (600)	total: 1m 45s	remaining: 15m 45s
900:	test: 0.8512102	best: 0.8512102 (900)	total: 2m 35s	remaining: 14m 41s
1200:	test: 0.8525793	best: 0.8525793 (1200)	total: 3m 26s	remaining: 13m 44s
1500:	test: 0.8536078	best: 0.8536078 (1500)	total: 4m 16s	remaining: 12m 49s
1800:	test: 0.8545480	best: 0.8545480 (1800)	total: 5m 7s	remaining: 11m 56s
2100:	test: 0.8552999	best: 0.8552999 (2100)	total: 5m 57s	remaining: 11m 3s
2400:	test: 0.8558693	best: 0.8558695 (2395)	total: 6m 47s	remaining: 10m 11s
2700:	test: 0.8563094	best: 0.8563094 (2700)	total: 7m 38s	remaining: 9m 19s
3000:	test: 0.8566366	best: 0.8566366 (3000)	total: 8m 28s	remaining: 8m 28s
3300:	test: 0.8570367	best: 0.8570367 (3300)	total: 9m 19s	remaining: 7m 37s
3600:	test: 0.8573840	best: 0.8573897 (3590)	total: 10m 9s	remaining: 6m 46s
3900:	

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.857446
[400]	valid_0's auc: 0.861689
[600]	valid_0's auc: 0.862849
[800]	valid_0's auc: 0.86312
Early stopping, best iteration is:
[739]	valid_0's auc: 0.863229


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6616252	best: 0.6616252 (0)	total: 186ms	remaining: 18m 34s
300:	test: 0.8303384	best: 0.8303384 (300)	total: 52.5s	remaining: 16m 33s
600:	test: 0.8373157	best: 0.8373157 (600)	total: 1m 43s	remaining: 15m 29s
900:	test: 0.8402025	best: 0.8402025 (900)	total: 2m 33s	remaining: 14m 29s
1200:	test: 0.8419309	best: 0.8419309 (1200)	total: 3m 23s	remaining: 13m 33s
1500:	test: 0.8430259	best: 0.8430259 (1500)	total: 4m 12s	remaining: 12m 38s
1800:	test: 0.8441120	best: 0.8441120 (1800)	total: 5m 2s	remaining: 11m 45s


KeyboardInterrupt: 

### Ensemble

In [None]:
class VotingModel(BaseEstimator, RegressorMixin):
    def __init__(self, estimators):
        super().__init__()
        self.estimators = estimators

    def fit(self, X, y=None):
        return self

    def predict(self, X):
        y_preds = [estimator.predict(X) for estimator in self.estimators]

        return np.mean(y_preds, axis=0)

    def predict_proba(self, X):
        y_preds = [estimator.predict_proba(X) for estimator in self.estimators[:5]]

        X[cat_cols] = X[cat_cols].astype("category")
        y_preds += [estimator.predict_proba(X) for estimator in self.estimators[5:]]

        return np.mean(y_preds, axis=0)

model = VotingModel(fitted_models_cat+fitted_models_lgb)

### Inference

In [None]:
data_store = {
    "base": read_file(os.path.join(TEST_DIR, "test_base.parquet")),
    "depth_0": [
        read_file(os.path.join(TEST_DIR, "test_static_cb_0.parquet")),
        read_files(os.path.join(TEST_DIR, "test_static_0_*.parquet")),
    ],
    "depth_1": [
        read_files(os.path.join(TEST_DIR, "test_applprev_1_*.parquet"), 1),
        read_file(os.path.join(TEST_DIR, "test_tax_registry_a_1.parquet"), 1),
        read_file(os.path.join(TEST_DIR, "test_tax_registry_b_1.parquet"), 1),
        read_file(os.path.join(TEST_DIR, "train_tax_registry_c_1.parquet"), 1),
        read_files(os.path.join(TEST_DIR, "train_credit_bureau_a_1_*.parquet"), 1),
        read_file(os.path.join(TEST_DIR, "train_credit_bureau_b_1.parquet"), 1),
        read_file(os.path.join(TEST_DIR, "train_other_1.parquet"), 1),
        read_file(os.path.join(TEST_DIR, "train_person_1.parquet"), 1),
        read_file(os.path.join(TEST_DIR, "train_deposit_1.parquet"), 1),
        read_file(os.path.join(TEST_DIR, "train_debitcard_1.parquet"), 1),
    ],
    "depth_2": [
        read_file(os.path.join(TEST_DIR, "train_credit_bureau_b_2.parquet"), 2),
        read_files(os.path.join(TEST_DIR, "train_credit_bureau_a_2_*.parquet"), 2),
        read_file(os.path.join(TEST_DIR, "train_applprev_2.parquet"), 2),
        read_file(os.path.join(TEST_DIR, "train_person_2.parquet"), 2)
    ],
}