In [15]:
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings('ignore')

# 1. PROCESS BUREAU AND BUREAU_BALANCE DATA
def process_bureau_and_balance(df):
    """Processes bureau.csv and bureau_balance.csv to create aggregated features."""
    bureau = pd.read_csv('../data/raw/bureau.csv')
    bureau_balance = pd.read_csv('../data/raw/bureau_balance.csv')

    # Aggregate bureau_balance by SK_ID_BUREAU
    bb_agg = bureau_balance.groupby('SK_ID_BUREAU').agg({
        'MONTHS_BALANCE': ['min', 'max', 'count'],
        'STATUS': ['first', 'last']
    }).reset_index()
    bb_agg.columns = ['_'.join(col).strip('_') for col in bb_agg.columns.values]
    
    # Merge aggregated bureau_balance into bureau
    bureau = bureau.merge(bb_agg, how='left', on='SK_ID_BUREAU')
    
    # Aggregate bureau by SK_ID_CURR
    bureau_agg = bureau.groupby('SK_ID_CURR').agg({
        'SK_ID_BUREAU': 'count',
        'CREDIT_DAY_OVERDUE': ['mean', 'max'],
        'AMT_CREDIT_SUM_DEBT': 'sum',
        'AMT_CREDIT_SUM_OVERDUE': 'sum',
        'CREDIT_TYPE': 'nunique'
    }).reset_index()
    bureau_agg.columns = ['SK_ID_CURR'] + [f'BUREAU_{col[0]}_{col[1].upper()}' for col in bureau_agg.columns[1:]]
    
    return df.merge(bureau_agg, on='SK_ID_CURR', how='left')

# 2. PROCESS PREVIOUS APPLICATIONS DATA
def process_previous_applications(df):
    """Processes previous_application.csv for historical application features."""
    prev_app = pd.read_csv('../data/raw/previous_application.csv')
    
    # Feature engineering for previous applications
    prev_app['APP_CREDIT_PERC'] = prev_app['AMT_APPLICATION'] / prev_app['AMT_CREDIT']
    
    # Aggregate by SK_ID_CURR
    prev_app_agg = prev_app.groupby('SK_ID_CURR').agg({
        'SK_ID_PREV': 'count',
        'AMT_ANNUITY': ['mean', 'max'],
        'AMT_APPLICATION': ['mean', 'max'],
        'APP_CREDIT_PERC': ['mean', 'max', 'min']
    }).reset_index()
    prev_app_agg.columns = ['SK_ID_CURR'] + [f'PREV_APP_{col[0]}_{col[1].upper()}' for col in prev_app_agg.columns[1:]]

    return df.merge(prev_app_agg, on='SK_ID_CURR', how='left')

# 3. PROCESS INSTALLMENTS PAYMENTS DATA (CRUCIAL FOR DPD)
def process_installments_payments(df):
    """Processes installments_payments.csv to get DPD and payment history."""
    installments = pd.read_csv('../data/raw/installments_payments.csv')

    # Calculate DPD and payment difference
    installments['DPD'] = installments['DAYS_ENTRY_PAYMENT'] - installments['DAYS_INSTALMENT']
    installments['DBD'] = installments['DAYS_INSTALMENT'] - installments['DAYS_ENTRY_PAYMENT']
    installments['PAYMENT_PERC'] = installments['AMT_PAYMENT'] / installments['AMT_INSTALMENT']
    installments['PAYMENT_DIFF'] = installments['AMT_INSTALMENT'] - installments['AMT_PAYMENT']
    
    # Flag late payments
    installments['DPD'] = installments['DPD'].apply(lambda x: x if x > 0 else 0)
    installments['DBD'] = installments['DBD'].apply(lambda x: x if x > 0 else 0)

    # Aggregate by SK_ID_CURR
    installments_agg = installments.groupby('SK_ID_CURR').agg({
        'DPD': ['mean', 'max', 'sum'],
        'DBD': ['mean', 'max', 'sum'],
        'PAYMENT_PERC': ['mean', 'max', 'min'],
        'PAYMENT_DIFF': ['mean', 'max', 'sum']
    }).reset_index()
    installments_agg.columns = ['SK_ID_CURR'] + [f'INSTALLMENTS_{col[0]}_{col[1].upper()}' for col in installments_agg.columns[1:]]

    return df.merge(installments_agg, on='SK_ID_CURR', how='left')

# MAIN FUNCTION TO BUILD THE DATASET
def build_complete_dataset():
    """Loads base data and merges all engineered features."""
    df_train = pd.read_csv('../data/raw/application_train.csv')
    df_test = pd.read_csv('../data/raw/application_test.csv')
    df = pd.concat([df_train, df_test], ignore_index=True)
    df = df[['SK_ID_CURR','TARGET','NAME_CONTRACT_TYPE','AMT_INCOME_TOTAL','AMT_CREDIT','NAME_INCOME_TYPE','NAME_EDUCATION_TYPE','NAME_FAMILY_STATUS','NAME_HOUSING_TYPE','OCCUPATION_TYPE','EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3']]

    print("Processing Bureau Data...")
    df = process_bureau_and_balance(df)
    
    print("Processing Previous Applications...")
    df = process_previous_applications(df)
    
    print("Processing Installments Payments...")
    df = process_installments_payments(df)
    
    # You can add calls to process credit_card_balance and POS_CASH_balance here as well
    
    print("Dataset build complete.")
    return df

# Run the pipeline
complete_df = build_complete_dataset()
print("Shape of the final dataset:", complete_df.shape)
print("Columns created:", [col for col in complete_df.columns if 'BUREAU_' in col or 'PREV_APP_' in col or 'INSTALLMENTS_' in col])

Processing Bureau Data...
Processing Previous Applications...
Processing Installments Payments...
Dataset build complete.
Shape of the final dataset: (356255, 39)
Columns created: ['BUREAU_SK_ID_BUREAU_COUNT', 'BUREAU_CREDIT_DAY_OVERDUE_MEAN', 'BUREAU_CREDIT_DAY_OVERDUE_MAX', 'BUREAU_AMT_CREDIT_SUM_DEBT_SUM', 'BUREAU_AMT_CREDIT_SUM_OVERDUE_SUM', 'BUREAU_CREDIT_TYPE_NUNIQUE', 'PREV_APP_SK_ID_PREV_COUNT', 'PREV_APP_AMT_ANNUITY_MEAN', 'PREV_APP_AMT_ANNUITY_MAX', 'PREV_APP_AMT_APPLICATION_MEAN', 'PREV_APP_AMT_APPLICATION_MAX', 'PREV_APP_APP_CREDIT_PERC_MEAN', 'PREV_APP_APP_CREDIT_PERC_MAX', 'PREV_APP_APP_CREDIT_PERC_MIN', 'INSTALLMENTS_DPD_MEAN', 'INSTALLMENTS_DPD_MAX', 'INSTALLMENTS_DPD_SUM', 'INSTALLMENTS_DBD_MEAN', 'INSTALLMENTS_DBD_MAX', 'INSTALLMENTS_DBD_SUM', 'INSTALLMENTS_PAYMENT_PERC_MEAN', 'INSTALLMENTS_PAYMENT_PERC_MAX', 'INSTALLMENTS_PAYMENT_PERC_MIN', 'INSTALLMENTS_PAYMENT_DIFF_MEAN', 'INSTALLMENTS_PAYMENT_DIFF_MAX', 'INSTALLMENTS_PAYMENT_DIFF_SUM']


Cleaning

In [16]:
# Create c_EXT_SOURCE using coalesce logic
complete_df['c_EXT_SOURCE'] = complete_df['EXT_SOURCE_1'].fillna(
    complete_df['EXT_SOURCE_2'].fillna(
        complete_df['EXT_SOURCE_3']
    )
)

# Reorder columns to place c_EXT_SOURCE before EXT_SOURCE_1
cols = list(complete_df.columns)
ext_source_1_idx = cols.index('EXT_SOURCE_1')
c_ext_source_idx = cols.index('c_EXT_SOURCE')
cols.pop(c_ext_source_idx)
cols.insert(ext_source_1_idx, 'c_EXT_SOURCE')
complete_df = complete_df[cols]

Issue Monitoring

In [93]:
# issue1: if no Target then remove
def issue_rate(issue):
    rate = round(len(issue)/len(complete_df),2)
    return print(f"Issue rate: {rate}% ({len(issue)}/{len(complete_df)})")

a = complete_df['TARGET'].isna()
issue1 = complete_df[a]
issue_rate(issue1)
complete_df = complete_df[~a]

# if all three EXT_SOURCE are NaN then remove
a = complete_df['c_EXT_SOURCE'].isna()
issue2 = complete_df[a]
issue_rate(issue2)
complete_df = complete_df[~a]

# if BUREAU_CREDIT_DAY_OVERDUE_MEAN is NaN then remove
a = complete_df['BUREAU_CREDIT_DAY_OVERDUE_MEAN'].isna()
issue3 = complete_df[a]
issue_rate(issue3)
complete_df = complete_df[~a]

# if INSTALLMENTS_DPD_MEAN is NaN then remove
a = complete_df['INSTALLMENTS_DPD_MEAN'].isna()
issue4 = complete_df[a]
issue_rate(issue4)
complete_df = complete_df[~a]

Issue rate: 0.14% (48744/356255)
Issue rate: 0.0% (172/307511)
Issue rate: 0.14% (43932/307339)
Issue rate: 0.14% (43932/307339)
Issue rate: 0.05% (13436/263407)
Issue rate: 0.05% (13436/263407)


clean.py Sanity Check

In [12]:
from mlport.common.merge import merge_all_features

# Load raw data (paths can be adjusted)
app = pd.read_csv("../data/raw/application_train.csv")
bureau = pd.read_csv("../data/raw/bureau.csv")
prev = pd.read_csv("../data/raw/previous_application.csv")
inst = pd.read_csv("../data/raw/installments_payments.csv")

# Merge
merged = merge_all_features(app, bureau, prev, inst)

# Sanity Check
# 1. One row per applicant
assert merged["SK_ID_CURR"].is_unique

# 2. Quick look at missingness in engineered blocks
print("Bureau null ratio:")
print(merged.filter(like="BUREAU_").isna().mean().head())

print("Previous application null ratio:")
print(merged.filter(like="PREV_APP_").isna().mean().head())

print("Installments null ratio:")
print(merged.filter(like="INSTALLMENTS_").isna().mean().head())

# 3. Preview some rows
merged.head()

Bureau null ratio:
AMT_REQ_CREDIT_BUREAU_HOUR    0.135016
AMT_REQ_CREDIT_BUREAU_DAY     0.135016
AMT_REQ_CREDIT_BUREAU_WEEK    0.135016
AMT_REQ_CREDIT_BUREAU_MON     0.135016
AMT_REQ_CREDIT_BUREAU_QRT     0.135016
dtype: float64
Previous application null ratio:
PREV_APP_SK_ID_PREV_COUNT        0.053507
PREV_APP_AMT_ANNUITY_MEAN        0.054863
PREV_APP_AMT_ANNUITY_MAX         0.054863
PREV_APP_AMT_APPLICATION_MEAN    0.053507
PREV_APP_AMT_APPLICATION_MAX     0.053507
dtype: float64
Installments null ratio:
INSTALLMENTS_DPD_MEAN    0.051627
INSTALLMENTS_DPD_MAX     0.051627
INSTALLMENTS_DPD_SUM     0.051601
INSTALLMENTS_DBD_MEAN    0.051627
INSTALLMENTS_DBD_MAX     0.051627
dtype: float64


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,INSTALLMENTS_DPD_SUM,INSTALLMENTS_DBD_MEAN,INSTALLMENTS_DBD_MAX,INSTALLMENTS_DBD_SUM,INSTALLMENTS_PAYMENT_PERC_MEAN,INSTALLMENTS_PAYMENT_PERC_MAX,INSTALLMENTS_PAYMENT_PERC_MIN,INSTALLMENTS_PAYMENT_DIFF_MEAN,INSTALLMENTS_PAYMENT_DIFF_MAX,INSTALLMENTS_PAYMENT_DIFF_SUM
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0.0,20.421053,31.0,388.0,1.0,1.0,1.0,0.0,0.0,0.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0.0,7.16,14.0,179.0,1.0,1.0,1.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0.0,7.666667,11.0,23.0,1.0,1.0,1.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0.0,19.375,77.0,310.0,1.0,1.0,1.0,0.0,0.0,0.0
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,63.0,4.590909,31.0,303.0,0.954545,1.0,5e-05,452.384318,22655.655,29857.365


## Merge

In [1]:
import pandas as pd
import numpy as np
import warnings
from mlport.common.data import load_any
from mlport.common.merge import merge_all_features
from mlport.common.clean import clean_for_model

In [2]:
app = load_any("../data/raw/application_train.csv")
bureau = load_any("../data/raw/bureau.csv")
prev = load_any("../data/raw/previous_application.csv")
inst = load_any("../data/raw/installments_payments.csv")

In [3]:
df = merge_all_features(app, bureau, prev, inst)
df = clean_for_model(df)

In [4]:
df

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,AMT_INCOME_TOTAL,AMT_CREDIT,NAME_INCOME_TYPE,DAYS_BIRTH,c_EXT_SOURCE,BUREAU_SK_ID_BUREAU_COUNT,BUREAU_CREDIT_DAY_OVERDUE_MEAN,...,INSTALLMENTS_DPD_SUM,INSTALLMENTS_DBD_MEAN,INSTALLMENTS_DBD_MAX,INSTALLMENTS_DBD_SUM,INSTALLMENTS_PAYMENT_PERC_MEAN,INSTALLMENTS_PAYMENT_PERC_MAX,INSTALLMENTS_PAYMENT_PERC_MIN,INSTALLMENTS_PAYMENT_DIFF_MEAN,INSTALLMENTS_PAYMENT_DIFF_MAX,INSTALLMENTS_PAYMENT_DIFF_SUM
0,100002,1,cash loans,202500.0,406597.5,working,-9461,0.083037,8.0,0.0,...,0.0,20.421053,31.0,388.0,1.000000,1.00000,1.000000,0.000000,0.000,0.000
1,100003,0,cash loans,270000.0,1293502.5,state servant,-16765,0.311267,4.0,0.0,...,0.0,7.160000,14.0,179.0,1.000000,1.00000,1.000000,0.000000,0.000,0.000
2,100004,0,revolving loans,67500.0,135000.0,working,-19046,0.555912,2.0,0.0,...,0.0,7.666667,11.0,23.0,1.000000,1.00000,1.000000,0.000000,0.000,0.000
3,100006,0,cash loans,135000.0,312682.5,working,-19005,0.650442,,,...,0.0,19.375000,77.0,310.0,1.000000,1.00000,1.000000,0.000000,0.000,0.000
4,100007,0,cash loans,121500.0,513000.0,working,-19932,0.322738,1.0,0.0,...,63.0,4.590909,31.0,303.0,0.954545,1.00000,0.000050,452.384318,22655.655,29857.365
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,456251,0,cash loans,157500.0,254700.0,working,-9327,0.145570,,,...,0.0,36.285714,46.0,254.0,1.000000,1.00000,1.000000,0.000000,0.000,0.000
307507,456252,0,cash loans,72000.0,269550.0,pensioner,-20775,0.115992,,,...,3.0,3.333333,11.0,20.0,1.000000,1.00000,1.000000,0.000000,0.000,0.000
307508,456253,0,cash loans,153000.0,677664.0,working,-14966,0.744026,4.0,0.0,...,9.0,15.142857,51.0,212.0,0.928571,1.00000,0.006864,283.792500,3945.825,3973.095
307509,456254,1,cash loans,171000.0,370107.0,commercial associate,-11961,0.514163,1.0,0.0,...,0.0,19.000000,31.0,361.0,1.000000,1.00000,1.000000,0.000000,0.000,0.000


## make_dataset

In [8]:
import pandas as pd
train = pd.read_parquet("../data/processed/train.parquet", engine="fastparquet")
test  = pd.read_parquet("../data/processed/test.parquet",  engine="fastparquet")


In [10]:
test

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_INCOME_TOTAL,AMT_CREDIT,NAME_INCOME_TYPE,DAYS_BIRTH,c_EXT_SOURCE,BUREAU_SK_ID_BUREAU_COUNT,BUREAU_CREDIT_DAY_OVERDUE_MEAN,BUREAU_CREDIT_DAY_OVERDUE_MAX,...,INSTALLMENTS_DPD_SUM,INSTALLMENTS_DBD_MEAN,INSTALLMENTS_DBD_MAX,INSTALLMENTS_DBD_SUM,INSTALLMENTS_PAYMENT_PERC_MEAN,INSTALLMENTS_PAYMENT_PERC_MAX,INSTALLMENTS_PAYMENT_PERC_MIN,INSTALLMENTS_PAYMENT_DIFF_MEAN,INSTALLMENTS_PAYMENT_DIFF_MAX,INSTALLMENTS_PAYMENT_DIFF_SUM
0,100001,cash loans,135000.0,568800.0,working,-19241,0.752614,7.0,0.0,0.0,...,11.0,8.857143,36.0,62.0,1.000000,1.0,1.000000,0.000000,0.00,0.000
1,100005,cash loans,99000.0,222768.0,working,-18064,0.564990,3.0,0.0,0.0,...,1.0,23.666667,37.0,213.0,1.000000,1.0,1.000000,0.000000,0.00,0.000
2,100013,cash loans,202500.0,663264.0,working,-20038,0.699787,4.0,0.0,0.0,...,84.0,5.722581,38.0,887.0,0.935484,1.0,0.000266,1157.662742,23147.82,179437.725
3,100028,cash loans,315000.0,1575000.0,working,-13976,0.525734,12.0,0.0,0.0,...,30.0,3.265487,19.0,369.0,0.911504,1.0,0.030496,622.550708,8505.00,70348.230
4,100038,cash loans,180000.0,625500.0,working,-13040,0.202145,,,,...,0.0,12.250000,18.0,147.0,1.000000,1.0,1.000000,0.000000,0.00,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48732,456221,cash loans,121500.0,412560.0,working,-19970,0.648575,5.0,0.0,0.0,...,0.0,7.333333,8.0,22.0,1.000000,1.0,1.000000,0.000000,0.00,0.000
48733,456222,cash loans,157500.0,622413.0,commercial associate,-11186,0.684596,,,,...,38.0,6.000000,26.0,438.0,0.958904,1.0,0.000643,314.714589,9680.49,22974.165
48734,456223,cash loans,202500.0,315000.0,commercial associate,-15922,0.733503,5.0,0.0,0.0,...,0.0,31.625000,76.0,253.0,1.000000,1.0,1.000000,0.000000,0.00,0.000
48735,456224,cash loans,225000.0,450000.0,commercial associate,-13968,0.373090,17.0,0.0,0.0,...,1.0,7.275862,15.0,211.0,1.000000,1.0,1.000000,0.000000,0.00,0.000
