In [3]:
from pathlib import Path
from IPython.display import display
from pyspark.sql import SparkSession

# ── settings you can tweak ───────────────────────────────────────────────
SNAPSHOT    = "2024-01-01"          # the month you want to preview
ROWS_TO_SHOW = 20                   # how many rows per table to display
TABLES = [
    "lms_loan_daily",
    "feature_clickstream",
    "features_attributes",
    "features_financials",
]

# base path is relative to the notebook (we assume the notebook is
# somewhere *inside* code_artefacts or above it)
SILVER_ROOT = Path("data_mart", "silver")

# ── Spark session (reuse an existing one) ────────────────────────────────
spark = spark if "spark" in globals() else SparkSession.builder.getOrCreate()

# ── loop & preview ───────────────────────────────────────────────────────
for tbl in TABLES:
    # code_artefacts/data_mart/silver/<table>/snapshot_date=YYYY-MM-DD/
    folder = SILVER_ROOT / tbl / f"snapshot_date={SNAPSHOT}"

    if not folder.exists():
        print(f"⚠️  {tbl}: {folder} does not exist (snapshot not generated?)")
        continue

    # recursive read pulls in all part-*.parquet files
    sdf = spark.read.option("recursiveFileLookup", "true").parquet(str(folder))
    pdf = sdf.limit(ROWS_TO_SHOW).toPandas()

    print(f"\n=== {tbl}  (total rows: {sdf.count()}, showing {len(pdf)}) ===")
    display(pdf)


=== lms_loan_daily  (total rows: 5412, showing 20) ===


Unnamed: 0,loan_id,Customer_ID,loan_start_date,tenure,installment_num,loan_amt,due_amt,paid_amt,overdue_amt,balance,snapshot_date,mob,installments_missed,first_missed_date,dpd
0,CUS_0x1000_2023_05_01,CUS_0x1000,2023-05-01,10,8,10000,1000.0,0.0,4000.0,6000.0,2024-01-01,8,4,2023-09-01,122
1,CUS_0x1011_2023_11_01,CUS_0x1011,2023-11-01,10,2,10000,1000.0,1000.0,0.0,8000.0,2024-01-01,2,0,,0
2,CUS_0x1013_2023_12_01,CUS_0x1013,2023-12-01,10,1,10000,1000.0,1000.0,0.0,9000.0,2024-01-01,1,0,,0
3,CUS_0x1015_2023_08_01,CUS_0x1015,2023-08-01,10,5,10000,1000.0,1000.0,0.0,5000.0,2024-01-01,5,0,,0
4,CUS_0x1018_2023_11_01,CUS_0x1018,2023-11-01,10,2,10000,1000.0,2000.0,0.0,8000.0,2024-01-01,2,0,,0
5,CUS_0x1026_2023_10_01,CUS_0x1026,2023-10-01,10,3,10000,1000.0,1000.0,0.0,7000.0,2024-01-01,3,0,,0
6,CUS_0x102d_2024_01_01,CUS_0x102d,2024-01-01,10,0,10000,0.0,0.0,0.0,10000.0,2024-01-01,0,0,,0
7,CUS_0x1032_2023_08_01,CUS_0x1032,2023-08-01,10,5,10000,1000.0,1000.0,0.0,5000.0,2024-01-01,5,0,,0
8,CUS_0x1041_2023_11_01,CUS_0x1041,2023-11-01,10,2,10000,1000.0,1000.0,0.0,8000.0,2024-01-01,2,0,,0
9,CUS_0x1044_2023_06_01,CUS_0x1044,2023-06-01,10,7,10000,1000.0,1000.0,0.0,3000.0,2024-01-01,7,0,,0



=== feature_clickstream  (total rows: 8974, showing 20) ===


Unnamed: 0,fe_1,fe_2,fe_3,fe_4,fe_5,fe_6,fe_7,fe_8,fe_9,fe_10,...,fe_14,fe_15,fe_16,fe_17,fe_18,fe_19,fe_20,Customer_ID,snapshot_date,clickstream_mean
0,239.0,140.0,-24.0,265.0,2.0,-32.0,147.0,-38.0,280.0,210.0,...,140.0,45.0,179.0,61.0,38.0,68.0,171.0,CUS_0x1037,2024-01-01,117.8
1,-15.0,137.0,35.0,-9.0,124.0,23.0,-33.0,-66.0,134.0,118.0,...,52.0,30.0,214.0,-125.0,176.0,128.0,209.0,CUS_0x1069,2024-01-01,89.6
2,361.0,58.0,-37.0,150.0,17.0,40.0,189.0,-64.0,119.0,32.0,...,-32.0,-15.0,34.0,162.0,94.0,-55.0,103.0,CUS_0x114a,2024-01-01,88.0
3,96.0,231.0,-41.0,20.0,147.0,217.0,158.0,-45.0,8.0,300.0,...,-4.0,259.0,190.0,97.0,163.0,-40.0,3.0,CUS_0x1184,2024-01-01,109.15
4,54.0,294.0,-89.0,-43.0,288.0,355.0,57.0,-33.0,157.0,216.0,...,191.0,0.0,-19.0,-51.0,80.0,69.0,-26.0,CUS_0x1297,2024-01-01,105.25
5,308.0,-48.0,188.0,25.0,220.0,115.0,-5.0,205.0,158.0,166.0,...,-54.0,160.0,173.0,84.0,102.0,78.0,98.0,CUS_0x12fb,2024-01-01,118.75
6,16.0,-9.0,45.0,50.0,-47.0,-25.0,23.0,30.0,84.0,239.0,...,-23.0,80.0,247.0,-8.0,224.0,79.0,123.0,CUS_0x1325,2024-01-01,82.65
7,9.0,145.0,57.0,163.0,140.0,237.0,92.0,114.0,72.0,34.0,...,80.0,69.0,68.0,19.0,223.0,155.0,-43.0,CUS_0x1341,2024-01-01,95.45
8,55.0,57.0,49.0,107.0,171.0,53.0,118.0,119.0,25.0,180.0,...,-93.0,-18.0,159.0,202.0,163.0,9.0,-45.0,CUS_0x1375,2024-01-01,67.8
9,-8.0,222.0,-44.0,79.0,112.0,50.0,35.0,125.0,282.0,301.0,...,9.0,164.0,176.0,122.0,59.0,174.0,21.0,CUS_0x13a8,2024-01-01,121.85



=== features_attributes  (total rows: 485, showing 20) ===


Unnamed: 0,Customer_ID,Name,Age,SSN,Occupation,snapshot_date,age_band,has_valid_ssn
0,CUS_0x102d,Neil Chatterjeex,31,692-71-7552,Entrepreneur,2024-01-01,25-34,1
1,CUS_0x1051,Lucia Mutikanin,42,232-33-7638,Engineer,2024-01-01,35-44,1
2,CUS_0x1269,Chadbournk,22,755-70-8952,Manager,2024-01-01,18-24,1
3,CUS_0x1290,Pattanaikb,31,922-15-4739,Architect,2024-01-01,25-34,1
4,CUS_0x12d1,Hirschlers,41,618-45-3370,Accountant,2024-01-01,35-44,1
5,CUS_0x12d5,ita Bosed,28,518-01-2262,Scientist,2024-01-01,25-34,1
6,CUS_0x133e,Carews,41,914-49-0079,Entrepreneur,2024-01-01,35-44,1
7,CUS_0x1383,Niklasb,19,000-58-0568,Developer,2024-01-01,18-24,1
8,CUS_0x143c,Kumar Singhj,35,422-18-4190,Accountant,2024-01-01,35-44,1
9,CUS_0x14d0,Huwk,22,271-41-2506,Musician,2024-01-01,18-24,1



=== features_financials  (total rows: 485, showing 20) ===


Unnamed: 0,Customer_ID,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,...,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,snapshot_date,is_min_pay_only,min_pay_info_missing,credit_history_months,credit_history_years,dti
0,CUS_0x102d,89064.523438,7256.043333333334,5,3,1.0,1.0,Mortgage Loan,6,5,...,37.57275,296.094116,High_spent_Medium_value_payments,641.937439,2024-01-01,0.0,0,363,30.25,0.00728
1,CUS_0x1051,35022.21875,2859.518333333333,3,5,4.0,1.0,Student Loan,6,11,...,21.214577,259.345123,Low_spent_Small_value_payments,295.392151,2024-01-01,0.0,0,342,28.5,0.028566
2,CUS_0x1269,42031.089844,3762.590833333333,2,1,8.0,3.0,"Not Specified, Debt Consolidation Loan, and Au...",1,5,...,66.50618,108.855858,High_spent_Medium_value_payments,450.897034,2024-01-01,0.0,0,207,17.25,0.001988
3,CUS_0x1290,10455.875,729.3229166666665,6,5,12.0,3.0,"Personal Loan, Payday Loan, and Credit-Builder...",11,19,...,25.748447,24.490505,Low_spent_Medium_value_payments,302.693329,2024-01-01,0.0,0,279,23.25,0.007953
4,CUS_0x12d1,21384.939453,1646.0783333333331,4,4,15.0,2.0,"Credit-Builder Loan, and Debt Consolidation Loan",24,15,...,22.095625,168.245499,Low_spent_Medium_value_payments,254.266708,2024-01-01,1.0,0,324,27.0,0.032337
5,CUS_0x12d5,62320.558594,5068.38,9,9,18.0,5.0,"Debt Consolidation Loan, Home Equity Loan, Aut...",23,15,...,171.345566,171.510864,High_spent_Small_value_payments,423.981567,2024-01-01,,1,23,1.916667,0.057598
6,CUS_0x133e,16626.25,1096.5208333333333,0,2,5.0,0.0,,13,7,...,0.0,53.386192,Low_spent_Small_value_payments,346.2659,2024-01-01,0.0,0,274,22.833334,0.066958
7,CUS_0x1383,80206.640625,6766.886666666666,5,4,20.0,5.0,"Student Loan, Payday Loan, Not Specified, Payd...",11,16,...,260.418976,522.977051,Low_spent_Medium_value_payments,173.292633,2024-01-01,1.0,0,104,8.666667,0.014087
8,CUS_0x143c,18578.880859,1325.1134734621792,1,3,1.0,,"Auto Loan, Credit-Builder Loan, and Auto Loan",15,12,...,140.106873,76.613594,Low_spent_Small_value_payments,317.03006,2024-01-01,,1,378,31.5,0.041956
9,CUS_0x14d0,48089.160156,3902.43,10,8,18.0,5.0,"Auto Loan, Auto Loan, Auto Loan, Not Specified...",59,19,...,192.740204,61.60207,High_spent_Large_value_payments,375.900726,2024-01-01,1.0,0,95,7.916667,0.055202


In [7]:
from pathlib import Path
import shutil

# define the path relative to your notebook
data_mart_path = Path("data_mart/gold/label_store")

if data_mart_path.exists():
    shutil.rmtree(data_mart_path)
    print(f"✅ Cleared: {data_mart_path}")
    data_mart_path.mkdir(parents=True)  # recreate the empty folder
else:
    print(f"⚠️ Folder does not exist: {data_mart_path}")

✅ Cleared: data_mart/gold/label_store


In [6]:
from pathlib import Path

LABEL_PATH = Path("data_mart/gold/label_store")

for p in sorted(LABEL_PATH.glob("snapshot_date=*")):
    parquet_files = list(p.glob("*.parquet"))
    print(f"{p.name}: {len(parquet_files)} file(s)")

In [8]:
from pathlib import Path
from IPython.display import display
import pandas as pd

# ── Settings ─────────────────────────────────────────────────────────────
ROWS_TO_SHOW = 20
GOLD_ROOT = Path("data_mart", "gold", "feature_store")

FOLDERS = {
    "train": GOLD_ROOT / "train",
    "test": GOLD_ROOT / "test"
}

# ── Loop through train and test folders ──────────────────────────────────
for split, folder in FOLDERS.items():
    print(f"\n📂 {split.upper()} SET — from {folder}")

    if not folder.exists():
        print(f"⚠️  Folder not found: {folder}")
        continue

    # Pick first .parquet file alphabetically (e.g., gold_train_2023-07.parquet)
    parquet_files = sorted(folder.glob("*.parquet"))

    if not parquet_files:
        print("⚠️  No .parquet files found.")
        continue

    # Preview the first file
    file_path = parquet_files[0]
    df = pd.read_parquet(file_path)
    print(f"✓ Previewing: {file_path.name} — {df.shape[0]} rows")
    display(df.head(ROWS_TO_SHOW))



📂 TRAIN SET — from data_mart/gold/feature_store/train
✓ Previewing: gold_train_2023-01.parquet — 530 rows


Unnamed: 0,loan_id,Customer_ID,loan_start_date,tenure,installment_num,overdue_amt,balance,snapshot_date,mob,installments_missed,...,Num_of_Delayed_Payment_outlier_flag,payment_ratio,shortfall,full_payment,overpayment,missed_payment,rolling_avg_payment_ratio_3m,rolling_sum_shortfall_3m,rolling_max_dpd_3m,consecutive_missed_payments
0,CUS_0x1037_2023_01_01,cus_0x1037,2023-01-01,10,0,0.0,10000.0,2023-01-01,0,0,...,0,,0.0,1,0.0,0,,0.0,0.0,0
1,CUS_0x1069_2023_01_01,cus_0x1069,2023-01-01,10,0,0.0,10000.0,2023-01-01,0,0,...,0,,0.0,1,0.0,0,,0.0,0.0,0
2,CUS_0x114a_2023_01_01,cus_0x114a,2023-01-01,10,0,0.0,10000.0,2023-01-01,0,0,...,0,,0.0,1,0.0,0,,0.0,0.0,0
3,CUS_0x1184_2023_01_01,cus_0x1184,2023-01-01,10,0,0.0,10000.0,2023-01-01,0,0,...,0,,0.0,1,0.0,0,,0.0,0.0,0
4,CUS_0x1297_2023_01_01,cus_0x1297,2023-01-01,10,0,0.0,10000.0,2023-01-01,0,0,...,0,,0.0,1,0.0,0,,0.0,0.0,0
5,CUS_0x12fb_2023_01_01,cus_0x12fb,2023-01-01,10,0,0.0,10000.0,2023-01-01,0,0,...,0,,0.0,1,0.0,0,,0.0,0.0,0
6,CUS_0x1325_2023_01_01,cus_0x1325,2023-01-01,10,0,0.0,10000.0,2023-01-01,0,0,...,0,,0.0,1,0.0,0,,0.0,0.0,0
7,CUS_0x1341_2023_01_01,cus_0x1341,2023-01-01,10,0,0.0,10000.0,2023-01-01,0,0,...,0,,0.0,1,0.0,0,,0.0,0.0,0
8,CUS_0x1375_2023_01_01,cus_0x1375,2023-01-01,10,0,0.0,10000.0,2023-01-01,0,0,...,0,,0.0,1,0.0,0,,0.0,0.0,0
9,CUS_0x13a8_2023_01_01,cus_0x13a8,2023-01-01,10,0,0.0,10000.0,2023-01-01,0,0,...,0,,0.0,1,0.0,0,,0.0,0.0,0



📂 TEST SET — from data_mart/gold/feature_store/test
✓ Previewing: gold_test_2025-01.parquet — 5539 rows


Unnamed: 0,loan_id,Customer_ID,loan_start_date,tenure,installment_num,overdue_amt,balance,snapshot_date,mob,installments_missed,...,Num_of_Delayed_Payment_outlier_flag,payment_ratio,shortfall,full_payment,overpayment,missed_payment,rolling_avg_payment_ratio_3m,rolling_sum_shortfall_3m,rolling_max_dpd_3m,consecutive_missed_payments
0,CUS_0x1009_2025_01_01,cus_0x1009,2025-01-01,10,0,0.0,10000.0,2025-01-01,0,0,...,0,,0.0,1,0.0,0,,0.0,0.0,0
1,CUS_0x100b_2024_03_01,cus_0x100b,2024-03-01,10,10,0.0,0.0,2025-01-01,10,0,...,0,1.0,0.0,1,0.0,0,1.0,0.0,0.0,0
2,CUS_0x102e_2024_04_01,cus_0x102e,2024-04-01,10,9,7000.0,8000.0,2025-01-01,9,7,...,0,0.0,1000.0,0,0.0,1,0.0,3000.0,214.0,7
3,CUS_0x1038_2024_10_01,cus_0x1038,2024-10-01,10,3,0.0,7000.0,2025-01-01,3,0,...,0,1.0,0.0,1,0.0,0,1.0,0.0,0.0,0
4,CUS_0x103e_2024_12_01,cus_0x103e,2024-12-01,10,1,0.0,9000.0,2025-01-01,1,0,...,0,1.0,0.0,1,0.0,0,1.0,0.0,0.0,0
5,CUS_0x104f_2024_10_01,cus_0x104f,2024-10-01,10,3,0.0,7000.0,2025-01-01,3,0,...,0,1.0,0.0,1,0.0,0,1.0,0.0,0.0,0
6,CUS_0x1075_2024_05_01,cus_0x1075,2024-05-01,10,8,4000.0,6000.0,2025-01-01,8,4,...,0,0.0,1000.0,0,0.0,1,0.0,3000.0,122.0,4
7,CUS_0x107e_2024_11_01,cus_0x107e,2024-11-01,10,2,0.0,8000.0,2025-01-01,2,0,...,0,1.0,0.0,1,0.0,0,1.0,0.0,0.0,0
8,CUS_0x1087_2024_09_01,cus_0x1087,2024-09-01,10,4,0.0,6000.0,2025-01-01,4,0,...,0,1.0,0.0,1,0.0,0,1.0,0.0,0.0,0
9,CUS_0x1096_2024_03_01,cus_0x1096,2024-03-01,10,10,7000.0,7000.0,2025-01-01,10,7,...,0,0.0,1000.0,0,0.0,1,0.0,3000.0,214.0,7


In [9]:
from pathlib import Path
import pandas as pd

# ── Settings ─────────────────────────────────────────────────────────────
GOLD_ROOT = Path("data_mart", "gold", "feature_store")
FOLDERS = {
    "train": GOLD_ROOT / "train",
    "test": GOLD_ROOT / "test"
}

# ── Loop through train and test folders ──────────────────────────────────
for split, folder in FOLDERS.items():
    print(f"\n📂 {split.upper()} SET — {folder}")

    if not folder.exists():
        print(f"❌ Folder not found: {folder}")
        continue

    # Find one parquet file
    parquet_files = sorted(folder.glob("*.parquet"))
    if not parquet_files:
        print("⚠️  No .parquet files found.")
        continue

    file_path = parquet_files[0]
    df = pd.read_parquet(file_path)

    print(f"✓ Loaded {file_path.name} — {df.shape[0]} rows\n")
    print("🧬 Schema:")
    print(df.dtypes)

    print("\n📊 Summary stats (numeric only):")
    display(df.describe(include='number'))

    print("\n🔠 Sample rows:")
    display(df.head(5))



📂 TRAIN SET — data_mart/gold/feature_store/train
✓ Loaded gold_train_2023-01.parquet — 530 rows

🧬 Schema:
loan_id                          object
Customer_ID                      object
loan_start_date                  object
tenure                           object
installment_num                  object
                                 ...   
missed_payment                    int32
rolling_avg_payment_ratio_3m    float64
rolling_sum_shortfall_3m        float64
rolling_max_dpd_3m              float64
consecutive_missed_payments       int64
Length: 73, dtype: object

📊 Summary stats (numeric only):


Unnamed: 0,mob,installments_missed,dpd,fe_1,fe_2,fe_3,fe_4,fe_5,fe_6,fe_7,...,Num_of_Delayed_Payment_outlier_flag,payment_ratio,shortfall,full_payment,overpayment,missed_payment,rolling_avg_payment_ratio_3m,rolling_sum_shortfall_3m,rolling_max_dpd_3m,consecutive_missed_payments
count,530.0,530.0,530.0,530.0,530.0,530.0,530.0,530.0,530.0,530.0,...,530.0,0.0,530.0,530.0,530.0,530.0,0.0,530.0,530.0,530.0
mean,0.0,0.0,0.0,99.241508,103.083015,105.503777,95.013206,103.213211,91.416985,112.069809,...,0.003774,,0.0,1.0,0.0,0.0,,0.0,0.0,0.0
std,0.0,0.0,0.0,97.67733,100.502373,101.320229,102.762848,92.393005,101.188446,103.047997,...,0.061371,,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
min,0.0,0.0,0.0,-191.0,-186.0,-181.0,-200.0,-191.0,-203.0,-236.0,...,0.0,,0.0,1.0,0.0,0.0,,0.0,0.0,0.0
25%,0.0,0.0,0.0,33.25,37.0,32.25,22.25,39.25,23.25,38.25,...,0.0,,0.0,1.0,0.0,0.0,,0.0,0.0,0.0
50%,0.0,0.0,0.0,102.0,103.5,102.0,91.0,101.0,91.5,109.0,...,0.0,,0.0,1.0,0.0,0.0,,0.0,0.0,0.0
75%,0.0,0.0,0.0,162.0,175.0,178.75,167.0,167.0,158.75,186.0,...,0.0,,0.0,1.0,0.0,0.0,,0.0,0.0,0.0
max,0.0,0.0,0.0,359.0,418.0,374.0,413.0,378.0,432.0,452.0,...,1.0,,0.0,1.0,0.0,0.0,,0.0,0.0,0.0



🔠 Sample rows:


Unnamed: 0,loan_id,Customer_ID,loan_start_date,tenure,installment_num,overdue_amt,balance,snapshot_date,mob,installments_missed,...,Num_of_Delayed_Payment_outlier_flag,payment_ratio,shortfall,full_payment,overpayment,missed_payment,rolling_avg_payment_ratio_3m,rolling_sum_shortfall_3m,rolling_max_dpd_3m,consecutive_missed_payments
0,CUS_0x1037_2023_01_01,cus_0x1037,2023-01-01,10,0,0.0,10000.0,2023-01-01,0,0,...,0,,0.0,1,0.0,0,,0.0,0.0,0
1,CUS_0x1069_2023_01_01,cus_0x1069,2023-01-01,10,0,0.0,10000.0,2023-01-01,0,0,...,0,,0.0,1,0.0,0,,0.0,0.0,0
2,CUS_0x114a_2023_01_01,cus_0x114a,2023-01-01,10,0,0.0,10000.0,2023-01-01,0,0,...,0,,0.0,1,0.0,0,,0.0,0.0,0
3,CUS_0x1184_2023_01_01,cus_0x1184,2023-01-01,10,0,0.0,10000.0,2023-01-01,0,0,...,0,,0.0,1,0.0,0,,0.0,0.0,0
4,CUS_0x1297_2023_01_01,cus_0x1297,2023-01-01,10,0,0.0,10000.0,2023-01-01,0,0,...,0,,0.0,1,0.0,0,,0.0,0.0,0



📂 TEST SET — data_mart/gold/feature_store/test
✓ Loaded gold_test_2025-01.parquet — 5539 rows

🧬 Schema:
loan_id                          object
Customer_ID                      object
loan_start_date                  object
tenure                           object
installment_num                  object
                                 ...   
missed_payment                    int32
rolling_avg_payment_ratio_3m    float64
rolling_sum_shortfall_3m        float64
rolling_max_dpd_3m              float64
consecutive_missed_payments       int64
Length: 73, dtype: object

📊 Summary stats (numeric only):


Unnamed: 0,mob,installments_missed,dpd,fe_1,fe_2,fe_3,fe_4,fe_5,fe_6,fe_7,...,Num_of_Delayed_Payment_outlier_flag,payment_ratio,shortfall,full_payment,overpayment,missed_payment,rolling_avg_payment_ratio_3m,rolling_sum_shortfall_3m,rolling_max_dpd_3m,consecutive_missed_payments
count,5539.0,5539.0,5539.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5539.0,5013.0,5539.0,5539.0,5539.0,5539.0,5013.0,5539.0,5539.0,5539.0
mean,5.003972,0.92291,28.248601,,,,,,,,...,0.004152,0.770995,218.450984,0.781549,11.193356,0.218451,0.796994,575.91623,28.74147,0.92291
std,3.179871,2.05346,62.833087,,,,,,,,...,0.064311,0.45445,413.232356,0.413232,125.55835,0.413232,0.387283,1114.177588,62.803556,2.05346
min,0.0,0.0,0.0,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,0.0,,,,,,,,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
50%,5.0,0.0,0.0,,,,,,,,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
75%,8.0,0.0,0.0,,,,,,,,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
max,10.0,10.0,306.0,,,,,,,,...,1.0,4.0,1000.0,1.0,3000.0,1.0,1.333333,3000.0,306.0,10.0



🔠 Sample rows:


Unnamed: 0,loan_id,Customer_ID,loan_start_date,tenure,installment_num,overdue_amt,balance,snapshot_date,mob,installments_missed,...,Num_of_Delayed_Payment_outlier_flag,payment_ratio,shortfall,full_payment,overpayment,missed_payment,rolling_avg_payment_ratio_3m,rolling_sum_shortfall_3m,rolling_max_dpd_3m,consecutive_missed_payments
0,CUS_0x1009_2025_01_01,cus_0x1009,2025-01-01,10,0,0.0,10000.0,2025-01-01,0,0,...,0,,0.0,1,0.0,0,,0.0,0.0,0
1,CUS_0x100b_2024_03_01,cus_0x100b,2024-03-01,10,10,0.0,0.0,2025-01-01,10,0,...,0,1.0,0.0,1,0.0,0,1.0,0.0,0.0,0
2,CUS_0x102e_2024_04_01,cus_0x102e,2024-04-01,10,9,7000.0,8000.0,2025-01-01,9,7,...,0,0.0,1000.0,0,0.0,1,0.0,3000.0,214.0,7
3,CUS_0x1038_2024_10_01,cus_0x1038,2024-10-01,10,3,0.0,7000.0,2025-01-01,3,0,...,0,1.0,0.0,1,0.0,0,1.0,0.0,0.0,0
4,CUS_0x103e_2024_12_01,cus_0x103e,2024-12-01,10,1,0.0,9000.0,2025-01-01,1,0,...,0,1.0,0.0,1,0.0,0,1.0,0.0,0.0,0


In [10]:
# Assume `df` is your loaded DataFrame from parquet
for col, dtype in df.dtypes.items():
    print(f"{col:<35} {dtype}")

loan_id                             object
Customer_ID                         object
loan_start_date                     object
tenure                              object
installment_num                     object
overdue_amt                         object
balance                             object
snapshot_date                       datetime64[ns]
mob                                 int32
installments_missed                 int32
first_missed_date                   object
dpd                                 int32
fe_1                                float32
fe_2                                float32
fe_3                                float32
fe_4                                float32
fe_5                                float32
fe_6                                float32
fe_7                                float32
fe_8                                float32
fe_9                                float32
fe_10                               float32
fe_11                               flo