In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# Show up to 100 columns when a wide frame is displayed
pd.options.display.max_columns = 100

# Parent of the current working directory; assumes the notebook is started
# from a direct subfolder of the project — TODO confirm
PROJECT_ROOT = Path.cwd().parent

# Main application table: raw input and the processed baseline feature file
APPLIC_PATH = PROJECT_ROOT / 'data' / 'raw' / 'application_train.csv'
PROCESSED_PATH = PROJECT_ROOT / 'data' / 'processed' / 'application_features_baseline.csv'

# Credit-bureau raw tables
BUREAU_PATH = PROJECT_ROOT / 'data' / 'raw' / 'bureau.csv'
BUREAU_BALANCE_PATH = PROJECT_ROOT / 'data' / 'raw' / 'bureau_balance.csv'

# Cached aggregation results
INTERIM_PATH = PROJECT_ROOT / 'data' / 'interim'
INTERIM_BUREAU = INTERIM_PATH / 'bureau_agg.csv'
INTERIM_INSTAL = INTERIM_PATH / 'installment_agg.csv'
INTERIM_CREDIT_CARD = INTERIM_PATH / 'credit_card.csv'

# Previous-application raw tables
PREVIOUS_APPLIC_PATH = PROJECT_ROOT / 'data' / 'raw' / 'previous_application.csv'
POS_CASH_BALANCE_PATH = PROJECT_ROOT / 'data' / 'raw' / 'POS_CASH_balance.csv'
INSTAL_PAYMENTS_PATH = PROJECT_ROOT / 'data' / 'raw' / 'installments_payments.csv'
CREDIT_CARD_BALANCE_PATH = PROJECT_ROOT / 'data' / 'raw' / 'credit_card_balance.csv'

# Number of most recent records used by the "recent behaviour" features
LAST_K = 3
# Length of the "recent applications" window in days
# NOTE(review): 360 rather than 365 — confirm this is intentional
YEAR = 360

In [12]:
# Fail fast, naming every missing raw file, before any expensive read starts
required_raw_files = (APPLIC_PATH, BUREAU_PATH, BUREAU_BALANCE_PATH)
missing_raw_files = [str(path) for path in required_raw_files if not path.exists()]
if missing_raw_files:
    raise FileNotFoundError(
        "Raw data not found: " + ", ".join(missing_raw_files)
        + ". See README.md for download instructions."
    )

# Main application table
df = pd.read_csv(APPLIC_PATH)

# Credit-bureau tables
bureau_df = pd.read_csv(BUREAU_PATH)
bureau_balance_df = pd.read_csv(BUREAU_BALANCE_PATH)


Aggregate the bureau balance table and merge the result into the bureau table on the SK_ID_BUREAU key.

In [13]:
# One 0/1 indicator column per STATUS bucket, computed once over the whole
# table. The groupby below can then use pandas' built-in reducers instead of
# calling a Python lambda for every SK_ID_BUREAU group, which is what made
# the original cell too slow to finish.
balance_status = bureau_balance_df['STATUS']
status_flags = pd.DataFrame(
    {
        'SK_ID_BUREAU': bureau_balance_df['SK_ID_BUREAU'],
        'MONTHS_BALANCE': bureau_balance_df['MONTHS_BALANCE'],
        'is_c': (balance_status == 'C').astype('int8'),
        'is_5': (balance_status == '5').astype('int8'),
        'is_bad_dpd': balance_status.isin(['2', '3', '4', '5']).astype('int8'),
        'is_1': (balance_status == '1').astype('int8'),
        'is_0': (balance_status == '0').astype('int8'),
        'is_x': (balance_status == 'X').astype('int8'),
    }
)

bureau_balance_agg = (
    status_flags
    .groupby('SK_ID_BUREAU')
    .agg(
        # number of monthly records per bureau loan
        loan_duration=('MONTHS_BALANCE', 'count'),
        # share of months with status 'C'
        prepaid_ratio=('is_c', 'mean'),
        # 1 if any month has status '5' (max of a 0/1 flag == any)
        default_ever=('is_5', 'max'),
        # share / number of months with status '2'..'5'
        bad_dpd_ratio=('is_bad_dpd', 'mean'),
        bad_dpd_count=('is_bad_dpd', 'sum'),
        # share / number of months with status '1'
        small_dpd_ratio=('is_1', 'mean'),
        small_dpd_count=('is_1', 'sum'),
        # share / number of months with status '0'
        paid_in_time_ratio=('is_0', 'mean'),
        paid_in_time_count=('is_0', 'sum'),
        # share of months with status 'X'
        unknown_ratio=('is_x', 'mean'),
    )
    # keep the integer outputs at int64, as the lambda-based version produced
    .astype({
        'default_ever': 'int64',
        'bad_dpd_count': 'int64',
        'small_dpd_count': 'int64',
        'paid_in_time_count': 'int64',
    })
    .reset_index()
)

KeyboardInterrupt: 

In [None]:
# Left join keeps every bureau loan; loans without monthly balance history
# get NaN in the aggregated columns
bureau_df = pd.merge(bureau_df, bureau_balance_agg, on='SK_ID_BUREAU', how='left')

Aggregate the bureau table and merge the result into the main application table on the SK_ID_CURR key. Some features are engineered in the bureau table before the aggregation and some on the aggregated table afterwards.

In [None]:
# Row-level helpers shared by several features below
is_active_credit = bureau_df['CREDIT_ACTIVE'].eq('Active')
bureau_credit_sum = bureau_df['AMT_CREDIT_SUM']

bureau_df = bureau_df.assign(
    # DAYS_CREDIT_ENDDATE minus DAYS_ENDDATE_FACT
    early_closure_days=bureau_df['DAYS_CREDIT_ENDDATE'].sub(bureau_df['DAYS_ENDDATE_FACT']),
    # absolute distance between DAYS_CREDIT and DAYS_ENDDATE_FACT
    credit_duration=bureau_df['DAYS_CREDIT'].sub(bureau_df['DAYS_ENDDATE_FACT']).abs(),
    # 1 for an active credit that currently has overdue days
    cnt_curent_overdue=(is_active_credit & bureau_df['CREDIT_DAY_OVERDUE'].gt(0)).astype(int),
    # overdue days counted for active credits only, 0 otherwise
    overdue_days_active=np.where(is_active_credit, bureau_df['CREDIT_DAY_OVERDUE'], 0),
    # 1 if a positive maximum overdue amount is recorded (NaN counts as 0)
    ever_overdue_flag=bureau_df['AMT_CREDIT_MAX_OVERDUE'].gt(0).astype(int),
    # overdue amount relative to the credit sum; 0 when the sum is missing or not positive
    overdue_ratio=np.where(
        bureau_credit_sum.gt(0),
        bureau_df['AMT_CREDIT_SUM_OVERDUE'].div(bureau_credit_sum),
        0,
    ),
    # credit sum and annuity counted for active credits only, 0 otherwise
    credit_sum_active=np.where(is_active_credit, bureau_credit_sum, 0),
    annuity_active=np.where(is_active_credit, bureau_df['AMT_ANNUITY'], 0),
)

In [None]:
# Named-aggregation spec: output column -> (source column, reducer).
# Dict order defines the column order of the result.
bureau_agg_spec = {
    # credit status mix
    'sold_times': ('CREDIT_ACTIVE', lambda s: s.isin(['Sold', 'Bad debt']).sum()),
    'closed_ratio': ('CREDIT_ACTIVE', lambda s: s.eq('Closed').mean()),
    'active_credits': ('CREDIT_ACTIVE', lambda s: s.eq('Active').sum()),
    'first_credit_time': ('DAYS_CREDIT', 'min'),

    # overdue days / amounts
    'overdue_days_mean': ('CREDIT_DAY_OVERDUE', 'mean'),
    'overdue_days_active_mean': ('overdue_days_active', 'mean'),
    # NOTE(review): named "active" but reads CREDIT_DAY_OVERDUE over all
    # credits, not overdue_days_active — confirm which was intended
    'overdue_active_max': ('CREDIT_DAY_OVERDUE', 'max'),
    'overdue_historical_max': ('AMT_CREDIT_MAX_OVERDUE', 'max'),
    'loans_ever_overdue': ('ever_overdue_flag', 'sum'),
    'loans_overdue_ratio': ('ever_overdue_flag', 'mean'),
    'overdue_credits_active': ('cnt_curent_overdue', 'sum'),
    'overdue_ammount_active': ('AMT_CREDIT_SUM_OVERDUE', 'sum'),
    'overdue_ratio_max': ('overdue_ratio', 'max'),

    # durations and prolongations
    'early_closure_days_ratio': ('early_closure_days', 'mean'),
    'credit_duration_mean': ('credit_duration', 'mean'),
    'prolonged_max': ('CNT_CREDIT_PROLONG', 'max'),
    'prolonged_times': ('CNT_CREDIT_PROLONG', 'sum'),

    # credit sums, debt and limits
    'credit_sum_mean': ('AMT_CREDIT_SUM', 'mean'),
    'active_credit_sum': ('credit_sum_active', 'sum'),
    'debt_max': ('AMT_CREDIT_SUM_DEBT', 'max'),
    'debt_mean': ('AMT_CREDIT_SUM_DEBT', 'mean'),
    'credit_limit_max': ('AMT_CREDIT_SUM_LIMIT', 'max'),

    # credit type mix
    'has_credit_card': ('CREDIT_TYPE', lambda s: int(s.eq('Credit card').any())),
    'credit_card_cnt': ('CREDIT_TYPE', lambda s: s.eq('Credit card').sum()),
    'low_risk_loans': (
        'CREDIT_TYPE',
        lambda s: s.isin(['Car loan', 'Mortgage', 'Loan for business development']).sum(),
    ),
    'has_microloan': ('CREDIT_TYPE', lambda s: int(s.eq('Microloan').any())),
    # despite the name this is a count of 'Consumer credit' rows
    'consumer_credit_sum': ('CREDIT_TYPE', lambda s: s.eq('Consumer credit').sum()),

    # record update times and annuities
    'last_credit_update': ('DAYS_CREDIT_UPDATE', 'max'),
    'first_credit_update': ('DAYS_CREDIT_UPDATE', 'min'),
    'current_annuity': ('annuity_active', 'sum'),
    'annuity_mean': ('AMT_ANNUITY', 'mean'),

    # roll-ups of the per-loan bureau_balance aggregates
    'loan_duration_avg': ('loan_duration', 'mean'),
    'loan_duration_max': ('loan_duration', 'max'),
    'prepaid_ratio_avg': ('prepaid_ratio', 'mean'),
    'defaults': ('default_ever', 'sum'),
    'worst_dpd': ('bad_dpd_ratio', 'max'),
    'bad_dpd_avg': ('bad_dpd_ratio', 'mean'),
    'bad_dpd_cnt': ('bad_dpd_count', 'sum'),
    'bad_dpd_times_max': ('bad_dpd_count', 'max'),
    'bigest_small_dpd': ('small_dpd_ratio', 'max'),
    'small_dpd_avg': ('small_dpd_ratio', 'mean'),
    'small_dpd_cnt': ('small_dpd_count', 'sum'),
    'paid_in_time_avg': ('paid_in_time_ratio', 'mean'),
    'paid_in_time_cnt': ('paid_in_time_count', 'sum'),
    'unkown_ratio_avg': ('unknown_ratio', 'mean'),
}

# One row per applicant (SK_ID_CURR)
bureau_agg = bureau_df.groupby('SK_ID_CURR').agg(**bureau_agg_spec).reset_index()

In [None]:
# Mean debt relative to mean credit sum; 0 when the mean credit sum is
# missing or not positive
bureau_agg['debt_ratio_mean'] = np.where(
    bureau_agg['credit_sum_mean'].gt(0),
    bureau_agg['debt_mean'].div(bureau_agg['credit_sum_mean']),
    0,
)

Save the aggregated table to the interim folder so that it can be conveniently reused in the future.

In [None]:
# to_csv does not create missing parent folders, so make sure the interim
# directory exists before writing the cache file
INTERIM_BUREAU.parent.mkdir(parents=True, exist_ok=True)

bureau_agg.to_csv(
    INTERIM_BUREAU,
    index=False
)

Load aggregated table from interim folder to save time due to long aggregation.

In [None]:
# Reload the cached per-applicant bureau aggregates written to INTERIM_BUREAU
bureau_agg = pd.read_csv(INTERIM_BUREAU)

In [None]:
# Left join keeps every application; applicants without bureau records get
# NaN in the bureau features
df = pd.merge(df, bureau_agg, on='SK_ID_CURR', how='left')

End of bureau feature engineering

---

In [None]:
# 365243 is the dataset's placeholder for a missing DAYS_EMPLOYED value.
# days_employed_missing is kept as its own feature because borrowers with
# the placeholder have a lower default rate.
employed_days = df['DAYS_EMPLOYED'].replace(365243, np.nan)

df['days_employed_missing'] = employed_days.isna().astype(int)
# NOTE(review): -1 may also be a genuine DAYS_EMPLOYED value — confirm the
# fill value cannot collide with real data
df['DAYS_EMPLOYED'] = employed_days.fillna(-1)

In [None]:
# Credit amount and annuity, each relative to the applicant's total income
df['credit_income_ratio'] = df['AMT_CREDIT'].div(df['AMT_INCOME_TOTAL'])
df['annuity_income_ratio'] = df['AMT_ANNUITY'].div(df['AMT_INCOME_TOTAL'])

In [None]:
# log(1 + x) compresses the long right tail of the monetary amounts
df['log_income'] = df['AMT_INCOME_TOTAL'].pipe(np.log1p)
df['log_credit'] = df['AMT_CREDIT'].pipe(np.log1p)

In [None]:
# Flag applicants with three or more children
df['has_many_children'] = df['CNT_CHILDREN'].ge(3).astype(int)

# include after baseline
# df['cnt_children_capped'] = df['CNT_CHILDREN'].clip(upper=3) 

In [None]:
EXT_SOURCE_COLS = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']

# Record missingness of sources 1 and 3 before the values are overwritten
df['ext_source_1_missing'] = df['EXT_SOURCE_1'].isnull().astype(int)
df['ext_source_3_missing'] = df['EXT_SOURCE_3'].isnull().astype(int)

# Missing scores in all three sources are replaced by the sentinel -1
df[EXT_SOURCE_COLS] = df[EXT_SOURCE_COLS].fillna(-1)

In [None]:
# Number of distinct non-null labels in every object-typed column
cat_features = df.select_dtypes('object').nunique()

# Columns with at most two labels become integer codes. cat.codes marks
# missing values as -1, which is turned back into NaN.
for name, num in cat_features.items():
    if num > 2:
        continue
    df[name] = df[name].astype('category').cat.codes.replace(-1, np.nan)

# Every remaining object column is one-hot encoded
df = pd.get_dummies(df)

Binary categories are replaced with integer codes (0, 1), with all NaN values preserved. Categorical features with more than two classes are one-hot encoded.

In [None]:
# Remove the applicant identifier in place before the feature table is saved
del df['SK_ID_CURR']

Drop the SK_ID_CURR identifier column, which is not used as a model feature.

In [None]:
# to_csv does not create missing parent folders, so make sure the processed
# directory exists before writing the feature table
PROCESSED_PATH.parent.mkdir(parents=True, exist_ok=True)

df.to_csv(
    PROCESSED_PATH,
    index=False
    )

---

## Previous application dataset related feature engineering

#### POS_CASH_balance aggregation

In [2]:
# Read the previous-application table and the POS/cash monthly balance table
previous_df, pos_cash_df = (
    pd.read_csv(csv_path)
    for csv_path in (PREVIOUS_APPLIC_PATH, POS_CASH_BALANCE_PATH)
)

In [3]:
# Frequency of each contract status in the POS/cash balance table
pos_cash_df['NAME_CONTRACT_STATUS'].value_counts()

NAME_CONTRACT_STATUS
Active                   9151119
Completed                 744883
Signed                     87260
Demand                      7065
Returned to the store       5461
Approved                    4917
Amortized debt               636
Canceled                      15
XNA                            2
Name: count, dtype: int64

In [4]:
# TODO: Add prepaid ratio and mark CNT_INSTALMENT reduction

# One row per previous application (SK_ID_PREV)
pos_cash_agg = (
    pos_cash_df
    .groupby('SK_ID_PREV')
    .agg(
        max_cnt_installment=pd.NamedAgg(column='CNT_INSTALMENT', aggfunc='max'),
        # 1 if any monthly record has status 'Completed'
        is_completed=pd.NamedAgg(
            column='NAME_CONTRACT_STATUS',
            aggfunc=lambda status: int(status.eq('Completed').any()),
        ),
        sk_dpd_days=pd.NamedAgg(column='SK_DPD', aggfunc='mean'),
        # sk_dpd_cnt=('SK_DPD', lambda x: (x > 0).sum()),
        sk_dpd_max=pd.NamedAgg(column='SK_DPD', aggfunc='max'),
        sk_dpd_def_days=pd.NamedAgg(column='SK_DPD_DEF', aggfunc='mean'),
        # sk_dpd_def_cnt=('SK_DPD_DEF', lambda x: (x > 0).sum()),
        sk_dpd_def_max=pd.NamedAgg(column='SK_DPD_DEF', aggfunc='max'),
    )
    .reset_index()
)

In [5]:
# Left join keeps every previous application; those without POS/cash
# records get NaN in the aggregated columns
previous_df = pd.merge(previous_df, pos_cash_agg, on='SK_ID_PREV', how='left')

---
#### Installments payments aggregation

In [6]:
# Read the raw installment payments table
installment_df = pd.read_csv(INSTAL_PAYMENTS_PATH)

In [7]:
# Ascending order: within each SK_ID_PREV the smallest DAYS_INSTALMENT comes
# first and the largest comes last
installment_df = installment_df.sort_values(
    by=['SK_ID_PREV', 'DAYS_INSTALMENT'],
    ascending=True,
)

In [12]:
# Row-level payment features. assign() evaluates the callables in order, so
# is_recent can read the inst_rank_recency column created just before it.
installment_df = (
    installment_df
    .assign(
        # payment day minus due day: positive when paid after the due date
        delay_days=lambda d: d['DAYS_ENTRY_PAYMENT'].sub(d['DAYS_INSTALMENT']),
        # amount due minus amount paid: positive when underpaid
        underpay_amt=lambda d: d['AMT_INSTALMENT'].sub(d['AMT_PAYMENT']),
        # 0 for the last row of each SK_ID_PREV group, counting up towards the first
        inst_rank_recency=lambda d: d.groupby('SK_ID_PREV').cumcount(ascending=False),
        payment_ratio=lambda d: d['AMT_PAYMENT'].div(d['AMT_INSTALMENT']),
        is_recent=lambda d: d['inst_rank_recency'].lt(LAST_K),
    )
    # division by a zero installment amount yields +/-inf; treat it as missing
    .replace([np.inf, -np.inf], np.nan)
)

In [None]:
# One row per previous application (SK_ID_PREV).
# Relies on installment_df being sorted ascending by
# ['SK_ID_PREV', 'DAYS_INSTALMENT'] (done in an earlier cell), so the most
# recent installments are the LAST rows of every group.
installment_agg = installment_df.groupby('SK_ID_PREV').agg(
    inst_cnt=('AMT_INSTALMENT', 'count'),

    # timing severity / frequency
    payments_duration=('DAYS_INSTALMENT', lambda x: x.max() - x.min()),
    delay_days_mean=('delay_days', 'mean'),
    delay_days_max=('delay_days', 'max'),
    delay_freq_mean=('delay_days', lambda x: (x > 0).mean()),

    # amount severity / frequency
    underpay_amt_max=('underpay_amt', 'max'),
    underpay_freq_mean=('underpay_amt', lambda x: (x > 0).mean()),
    # NOTE(review): `<= 0` also counts exact payments, not only overpayments
    overpay_freq_mean=('underpay_amt', lambda x: (x <= 0).mean()),
    payment_ratio_min=('payment_ratio', 'min'),

    # timing / amount consistency
    delay_days_std=('delay_days', 'std'),
    payment_ratio_std=('payment_ratio', 'std'),

    # recency behaviour: the tail of each group holds the LAST_K most recent
    # installments (the head, used previously, holds the oldest ones)
    delay_recent_max=('delay_days', lambda x: x.iloc[-LAST_K:].max()),
    delay_recent_mean=('delay_days', lambda x: x.iloc[-LAST_K:].mean()),
    underpay_recent_freq=('underpay_amt', lambda x: (x.iloc[-LAST_K:] > 0).mean()),

    # Missing data handling: installments with no recorded payment day
    unpaid_inst_cnt=('DAYS_ENTRY_PAYMENT', lambda x: x.isna().sum())
).reset_index()

KeyError: "Column(s) ['delay_days', 'payment_ratio', 'underpay_amt'] do not exist"

In [17]:
# to_csv does not create missing parent folders, so make sure the interim
# directory exists before writing the cache file
INTERIM_INSTAL.parent.mkdir(parents=True, exist_ok=True)
installment_agg.to_csv(INTERIM_INSTAL, index=False)

In [8]:
# Reload the cached per-application installment aggregates written to INTERIM_INSTAL
installment_agg = pd.read_csv(INTERIM_INSTAL)

In [9]:
# Left join keeps every previous application; those without installment
# records get NaN in the aggregated columns
previous_df = pd.merge(previous_df, installment_agg, on='SK_ID_PREV', how='left')

---
#### Credit card aggregation

In [10]:
# Read the raw credit card monthly balance table
credit_card_df = pd.read_csv(CREDIT_CARD_BALANCE_PATH)

In [11]:
# Descending order: within each SK_ID_PREV the largest MONTHS_BALANCE comes
# first, so head slices of a group pick its latest months
credit_card_df.sort_values(
    by=['SK_ID_PREV', 'MONTHS_BALANCE'],
    ascending=False,
    inplace=True,
)

In [12]:
# Columns reused by several ratios below
cc_balance = credit_card_df['AMT_BALANCE']
cc_limit = credit_card_df['AMT_CREDIT_LIMIT_ACTUAL']
cc_principal = credit_card_df['AMT_RECEIVABLE_PRINCIPAL']
cc_total_receivable = credit_card_df['AMT_TOTAL_RECEIVABLE']

# Active contract rows whose MONTHS_BALANCE is -1
cc_is_current = (
    credit_card_df['NAME_CONTRACT_STATUS'].eq('Active')
    & credit_card_df['MONTHS_BALANCE'].eq(-1)
)

credit_card_df = credit_card_df.assign(
    balance_rate=cc_balance.div(cc_limit),
    spend_rate=credit_card_df['AMT_DRAWINGS_CURRENT'].div(cc_balance),
    # receivable above the principal, relative to the principal
    interest_rate=credit_card_df['AMT_RECIVABLE'].sub(cc_principal).div(cc_principal),
    cost_ratio=cc_total_receivable.div(cc_principal),
    loan_rate=cc_balance.div(cc_principal),
    # total receivable on the current active row, 0 on every other row
    current_debt=np.where(cc_is_current, cc_total_receivable, 0),
    has_credit_limit=cc_limit.gt(0).astype(int),
    zero_balance=cc_balance.eq(0).astype(int),
    zero_principal=cc_principal.eq(0).astype(int),
)

# Ratios with a zero denominator come out as +/-inf; treat them as missing
credit_card_df.replace([np.inf, -np.inf], np.nan, inplace=True)

In [None]:
# Named-aggregation spec: output column -> (source column, reducer).
# Dict order defines the column order of the result. The head slices
# (iloc[:LAST_K]) read the first rows of each group, which are the latest
# months when the frame is sorted by MONTHS_BALANCE descending.
credit_card_agg_spec = {
    # number of monthly records; despite the name this is a count
    'has_credit_card': ('MONTHS_BALANCE', 'count'),

    # amount related aggregation
    'cc_limit_max': ('AMT_CREDIT_LIMIT_ACTUAL', 'max'),
    'cc_spend_mean': ('spend_rate', 'mean'),
    'cc_balance_rate': ('balance_rate', 'mean'),
    'cc_payment_mean': ('AMT_PAYMENT_TOTAL_CURRENT', 'mean'),

    # credit principal (interest / cost / loan rate)
    'cc_interest_mean': ('interest_rate', 'mean'),
    'cc_cost_mean': ('cost_ratio', 'mean'),
    'cc_cost_worst': ('cost_ratio', 'max'),
    'cc_loan_rate_mean': ('loan_rate', 'mean'),
    'cc_loan_rate_worst': ('loan_rate', 'min'),

    # days past due aggregation
    'cc_dpd_max': ('SK_DPD', 'max'),
    'cc_dpd': ('SK_DPD', 'sum'),
    'cc_dpd_freq': ('SK_DPD', lambda s: s.gt(0).mean()),
    'cc_dpd_def_max': ('SK_DPD_DEF', 'max'),
    'cc_dpd_def': ('SK_DPD_DEF', 'sum'),
    'cc_dpd_def_freq': ('SK_DPD_DEF', lambda s: s.gt(0).mean()),

    # draw / installment times
    'cc_draw_mean': ('CNT_DRAWINGS_CURRENT', 'mean'),
    'cc_inst_mean': ('CNT_INSTALMENT_MATURE_CUM', 'mean'),

    # shares of months with a limit / zero balance / zero principal
    'cc_has_limit_mean': ('has_credit_limit', 'mean'),
    'cc_zero_balance_mean': ('zero_balance', 'mean'),
    'cc_zero_principal_mean': ('zero_principal', 'mean'),

    # recency behaviour
    'cc_recent_limit_max': ('AMT_CREDIT_LIMIT_ACTUAL', lambda s: s.iloc[:LAST_K].max()),
    'cc_recent_dpd_freq': ('SK_DPD', lambda s: s.iloc[:LAST_K].gt(0).mean()),
    'cc_recent_dpd_max': ('SK_DPD', lambda s: s.iloc[:LAST_K].max()),
    'cc_recent_balance_rate': ('balance_rate', lambda s: s.iloc[:LAST_K].mean()),
    'cc_recent_loan_rate': ('loan_rate', lambda s: s.iloc[:LAST_K].mean()),
    'cc_current_debt': ('current_debt', 'sum'),
}

# One row per previous application (SK_ID_PREV)
credit_card_agg = (
    credit_card_df
    .groupby('SK_ID_PREV')
    .agg(**credit_card_agg_spec)
    .reset_index()
)

In [14]:
# Left join keeps every previous application; those without credit card
# records get NaN in the aggregated columns
previous_df = pd.merge(previous_df, credit_card_agg, on='SK_ID_PREV', how='left')

---
#### Previous application aggregation

In [15]:
# Descending order: within each SK_ID_CURR the largest DAYS_DECISION comes
# first, so head slices of a group pick its latest decisions
previous_df.sort_values(
    by=['SK_ID_CURR', 'DAYS_DECISION'],
    ascending=False,
    inplace=True,
)

In [16]:
# Day-offset columns in which the value 365243 is replaced by NaN
DATE_COLS = [
    'DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE',
    'DAYS_LAST_DUE_1ST_VERSION', 'DAYS_LAST_DUE',
    'DAYS_TERMINATION',
]

previous_df[DATE_COLS] = previous_df[DATE_COLS].replace({365243: np.nan})

In [None]:
# 0/1 outcome flags built from the boolean mask itself. The previous version
# cast a row-filtered DataFrame to int, which cannot produce a flag column.
contract_status = previous_df['NAME_CONTRACT_STATUS']
previous_df['is_approved'] = contract_status.isin(['Approved', 'Unused offer']).astype(int)
previous_df['is_refused'] = (contract_status == 'Refused').astype(int)

is_approved_mask = previous_df['is_approved'] == 1
is_refused_mask = previous_df['is_refused'] == 1

previous_df = previous_df.assign(
    # amounts are kept only for the matching outcome and are NaN elsewhere, so
    # group means are not diluted; approved_annuity used to be zero-filled
    approved_annuity=previous_df['AMT_ANNUITY'].where(is_approved_mask),
    approved_credit=previous_df['AMT_CREDIT'].where(is_approved_mask),
    refused_credit=previous_df['AMT_CREDIT'].where(is_refused_mask),
    # requested amount minus granted credit amount
    request_diff=previous_df['AMT_APPLICATION'] - previous_df['AMT_CREDIT'],
    # 1 when a non-refused application has a first drawing day recorded
    # (the old expression indexed the frame with the constant False)
    was_disbursed=previous_df['DAYS_FIRST_DRAWING'].where(~is_refused_mask).notna().astype(int),
    grace_period=previous_df['DAYS_FIRST_DUE'] - previous_df['DAYS_FIRST_DRAWING'],
    delay=previous_df['DAYS_TERMINATION'] - previous_df['DAYS_LAST_DUE_1ST_VERSION'],
    prolongation=previous_df['DAYS_LAST_DUE'] - previous_df['DAYS_LAST_DUE_1ST_VERSION'],
    loan_duration=previous_df['DAYS_LAST_DUE_1ST_VERSION'] - previous_df['DAYS_FIRST_DUE'],
    # a zero installment count would give +/-inf; treat it as missing
    unpaid_ratio=(
        previous_df['unpaid_inst_cnt'] / previous_df['inst_cnt']
    ).replace([np.inf, -np.inf], np.nan)
)

In [24]:
# Frequency of each reject reason code, shown as a two-column table
previous_df['CODE_REJECT_REASON'].value_counts().reset_index()

Unnamed: 0,CODE_REJECT_REASON,count
0,XAP,1353093
1,HC,175231
2,LIMIT,55680
3,SCO,37467
4,CLIENT,26436
5,SCOFR,12811
6,XNA,5244
7,VERIF,3535
8,SYSTEM,717


In [14]:
# Frequency of each yield group label
previous_df['NAME_YIELD_GROUP'].value_counts()

NAME_YIELD_GROUP
XNA           517215
middle        385532
high          353331
low_normal    322095
low_action     92041
Name: count, dtype: int64

In [None]:
# One row per applicant (SK_ID_CURR), rolling up every previous application.
# Relies on previous_df being sorted descending by
# ['SK_ID_CURR', 'DAYS_DECISION'] (done in an earlier cell), so the first
# rows of every group are the most recent applications.
previous_agg = previous_df.groupby('SK_ID_CURR').agg(
    # POS_CASH_balance related aggregation
    prev_PC_installment_mean=('max_cnt_installment', 'mean'),
    prev_dpd_max=('sk_dpd_max', 'max'),
    prev_PC_dpd_mean=('sk_dpd_days', 'mean'),
    prev_PC_dpd_def_max=('sk_dpd_def_max', 'max'),
    prev_PC_dpd_def_mean=('sk_dpd_def_days', 'mean'),
    prev_PC_completed_mean=('is_completed', 'mean'),

    # Installments payments related aggregation
        # timing severity / frequency
    prev_inst_mean=('inst_cnt', 'mean'),
    prev_inst_duration_max=('payments_duration', 'max'),
    prev_inst_duration_mean=('payments_duration', 'mean'),
    prev_inst_delay_max=('delay_days_max', 'max'),
    prev_inst_delay_mean=('delay_days_mean', 'mean'),
    prev_inst_delay_freq_mean=('delay_freq_mean', 'mean'),
        # amount severity / frequency
    prev_inst_underpay_amt_max=('underpay_amt_max', 'max'),
    prev_inst_underpay_freq_mean=('underpay_freq_mean', 'mean'),
    prev_inst_overpay_freq_mean=('overpay_freq_mean', 'mean'),
    prev_inst_payment_ratio_mean=('payment_ratio_min', 'mean'),
    prev_inst_payment_ratio_worst=('payment_ratio_min', 'min'),
        # timing / amount consistency
    prev_inst_delay_std_max=('delay_days_std', 'max'),
    prev_inst_delay_std_mean=('delay_days_std', 'mean'),
    prev_inst_delay_std_std=('delay_days_std', 'std'),
    prev_inst_pay_ratio_std_max=('payment_ratio_std', 'max'),
    prev_inst_pay_ratio_std_mean=('payment_ratio_std', 'mean'),
    prev_inst_pay_ratio_std_std=('payment_ratio_std', 'std'),
        # recency behaviour
    prev_inst_delay_recent_max=('delay_recent_max', 'max'),
    prev_inst_delay_recent_mean=('delay_recent_mean', 'mean'),
    prev_inst_underpay_recent_freq=('underpay_recent_freq', 'mean'),
        # Missing data handling
    prev_inst_unpaid_ratio_max=('unpaid_ratio', 'max'),
    prev_inst_unpaid_ratio_mean=('unpaid_ratio', 'mean'),

    # Credit card related aggregation
        # 1 if any previous application has credit card balance records
    has_credit_card=('has_credit_card', lambda x: int((x > 0).any())),
        # amount related aggregation
    prev_cc_limit_max=('cc_limit_max', 'max'),
    prev_cc_spend_mean=('cc_spend_mean', 'mean'),
    prev_cc_balance_rate_mean=('cc_balance_rate', 'mean'),
    prev_cc_balance_rate_max=('cc_balance_rate', 'max'),
    prev_cc_payment_mean=('cc_payment_mean', 'mean'),
        # credit principal (interest / cost / loan rate)
    prev_cc_interest_mean=('cc_interest_mean', 'mean'),
    prev_cc_cost_mean=('cc_cost_mean', 'mean'),
        # was a second `prev_cc_cost_mean` keyword (SyntaxError: keyword repeated)
    prev_cc_cost_worst=('cc_cost_worst', 'max'),
    prev_cc_loan_rate_mean=('cc_loan_rate_mean', 'mean'),
    prev_cc_loan_rate_worst=('cc_loan_rate_worst', 'min'),
        # days past due aggregation
    prev_cc_dpd_max=('cc_dpd_max', 'max'),
    prev_cc_dpd_mean=('cc_dpd', 'mean'),
    prev_cc_dpd_freq=('cc_dpd_freq', 'mean'),
    prev_cc_dpd_def_max=('cc_dpd_def_max', 'max'),
    prev_cc_dpd_def=('cc_dpd_def', 'mean'),
    prev_cc_dpd_def_freq=('cc_dpd_def_freq', 'mean'),
        # draw / installment times
    prev_cc_draw_mean=('cc_draw_mean', 'mean'),
    prev_cc_inst_mean=('cc_inst_mean', 'mean'),
    prev_cc_has_limit_mean=('cc_has_limit_mean', 'mean'),
        # the reducer was missing here; the spec must be a (column, aggfunc) pair
    prev_cc_zero_balance_mean=('cc_zero_balance_mean', 'mean'),
    prev_cc_zero_principal_mean=('cc_zero_principal_mean', 'mean'),
        # recency behaviour
    prev_cc_recent_limit_max=('cc_recent_limit_max', 'max'),
    prev_cc_recent_dpd_freq=('cc_recent_dpd_freq', 'mean'),
    prev_cc_recent_dpd_max=('cc_recent_dpd_max', 'max'),
    prev_cc_recent_balance_rate=('cc_recent_balance_rate', 'mean'),
    prev_cc_recent_loan_rate=('cc_recent_loan_rate', 'mean'),
    cc_current_debt=('cc_current_debt', 'sum'),


    # Annuity / credit amt
    prev_annuity_mean=('approved_annuity', 'mean'),
    prev_annuity_max=('approved_annuity', 'max'),
    prev_credit_mean=('approved_credit', 'mean'),
    prev_credit_max=('approved_credit', 'max'),
    prev_refused_credit_mean=('refused_credit', 'mean'),
    prev_refused_credit_max=('refused_credit', 'max'),
    prev_request_diff_mean=('request_diff', 'mean'),

    # Approved / refused times
    prev_applic_cnt=('SK_ID_PREV', 'count'),
    prev_approved_cnt=('is_approved', 'sum'),
    prev_refused_cnt=('is_refused', 'sum'),

    # Downpayment aggregation
    prev_downpayment_mean=('AMT_DOWN_PAYMENT', 'mean'),
    prev_downpayment_max=('AMT_DOWN_PAYMENT', 'max'),
    prev_no_downpayment_mean=('AMT_DOWN_PAYMENT', lambda x: x.isna().mean()),
    prev_downpayment_rate_mean=('RATE_DOWN_PAYMENT', 'mean'),
    prev_downpayment_rate_max=('RATE_DOWN_PAYMENT', 'max'),

    # Reject reason binning
    prev_refused_sco_cnt=('CODE_REJECT_REASON', lambda x: x.isin(['SCO', 'SCOFR']).sum()),
    prev_refused_limit_cnt=('CODE_REJECT_REASON', lambda x: (x == 'LIMIT').sum()),
    prev_refused_hc_cnt=('CODE_REJECT_REASON', lambda x: x.isin(['HC', 'SYSTEM']).sum()),
    prev_refused_client_cnt=('CODE_REJECT_REASON', lambda x: (x == 'CLIENT').sum()),


    # Recent behaviour
        # DAYS_DECISION values are treated as day offsets at or below 0 (see the
        # `>= -YEAR` window), so the latest decision is the maximum; 'min'
        # returned the earliest one
    prev_last_application=('DAYS_DECISION', 'max'),
        # .sum() counts rows inside the window; .count() counted every row
    prev_recent_applications_cnt=('DAYS_DECISION', lambda x: (x >= -YEAR).sum()),
        # explicit positional slice of the LAST_K most recent applications
    prev_recent_approved=('is_approved', lambda x: x.iloc[:LAST_K].sum()),

    # Yield group binning
    prev_high_yield_cnt=('NAME_YIELD_GROUP', lambda x: (x == 'high').sum()),
    prev_middle_yield_cnt=('NAME_YIELD_GROUP', lambda x: (x == 'middle').sum()),
    prev_low_yeild_cnt=('NAME_YIELD_GROUP', lambda x: x.isin(['low_normal', 'low_action']).sum()),

    # Time related aggregation
    prev_was_disbursed=('was_disbursed', 'mean'),
    prev_grace_period_mean=('grace_period', 'mean'),
    prev_delay_mean=('delay', 'mean'),
    prev_delay_max=('delay', 'max'),
    prev_prolong_mean=('prolongation', 'mean'),
        # was 'mean', which duplicated prev_prolong_mean
    prev_prolong_max=('prolongation', 'max'),
    prev_loan_duration_mean=('loan_duration', 'mean'),
    prev_loan_duration_max=('loan_duration', 'max'),

    # Other aggregation
    prev_insured_mean=('NFLAG_INSURED_ON_APPROVAL', 'mean'),

    prev_type_pos=('NAME_PORTFOLIO', lambda x: (x == 'POS').sum()),
    prev_type_cash=('NAME_PORTFOLIO', lambda x: (x == 'Cash').sum()),

    prev_cnt_payment_max=('CNT_PAYMENT', 'max'),
    prev_cnt_paymnet_mean=('CNT_PAYMENT', 'mean'),
).reset_index()