In [None]:
"""
1.  Load the dataset:

        import pandas as pd
        from src.eda_borrower import BorrowerProfileEDA, run_borrower_eda_pipeline

        df = pd.read_csv("data/loan_sample.csv")  # or the correct path in your repo


2.  Instantiate the EDA class:

        eda = BorrowerProfileEDA(df, target_col="loan_status")


3.  Run the pipeline and inspect the results:

        report = run_borrower_eda_pipeline(eda)


4.  Display at least:

        report["structure"]          # table of borrower column structure
        report["income"]             # income stats
        report["freqs"]              # categorical frequencies
        report["default_by_home_ownership"]  # default rate by home_ownership
        report["default_by_purpose"]         # default rate by purpose



You can add markdown cells explaining what each result means in plain language (e.g., class imbalance, missingness, etc.).

Acceptance Criteria ✅
---------------------

-   `BorrowerProfileEDA`:

    -   Initializes correctly with a DataFrame.
    -   `structure_summary()` returns a DataFrame with the requested columns/metrics.
    -   `income_summary()` returns a DataFrame with stats for `annual_inc` and `annual_inc_joint`.
    -   `categorical_freqs()` returns a dict of Series with top categories.
    -   `default_rate_by_category(col)` returns a Series of default rates per category.
-   Functional pipeline:

    -   `borrower_eda_steps(eda)` returns a dict of callables.
    -   `run_borrower_eda_pipeline(eda)` iterates over that dict, calls each function, and returns a dict of results.
-   Notebook:

    -   Runs top-to-bottom without errors.
    -   Shows the structure summary, income summary, categorical frequencies, and default-rate-by-category analysis.
    -   Contains only EDA (no model training).
    """

In [12]:
import os

if not os.path.exists("../src/eda_borrower.py"):
    raise FileNotFoundError("⚠️ eda_borrower.py missing in src/. Check your repo structure!")


In [1]:
import sys
import os

repo_root = r"/Users/dv/Documents/cloned_repos/ml-model-git-lab"
sys.path.insert(0, repo_root)

In [2]:
import importlib
import src.eda_borrower
importlib.reload(src.eda_borrower)

<module 'src.eda_borrower' from '/Users/dv/Documents/cloned_repos/ml-model-git-lab/src/eda_borrower.py'>

In [3]:
import pandas as pd
import numpy as np
from src.eda_borrower import BorrowerProfileEDA, run_borrower_eda_pipeline

In [4]:
print(src.eda_borrower.__file__)

/Users/dv/Documents/cloned_repos/ml-model-git-lab/src/eda_borrower.py


In [5]:
import pandas as pd
from src.eda_borrower import BorrowerProfileEDA, run_borrower_eda_pipeline
df = pd.read_csv("/Users/dv/Documents/cloned_repos/ml-model-git-lab/notebooks/data/loan.csv")

  df = pd.read_csv("/Users/dv/Documents/cloned_repos/ml-model-git-lab/notebooks/data/loan.csv")


In [13]:
default_map = {
    "Fully Paid": 0,
    "Current": 0,
    "In Grace Period": 0,
    "Issued": 0,
    "Does not meet the credit policy. Status:Fully Paid": 0,

    "Charged Off": 1,
    "Default": 1,
    "Late (31-120 days)": 1,
    "Late (16-30 days)": 1,
    "Does not meet the credit policy. Status:Charged Off": 1
}

df["loan_status_binary"] = df["loan_status"].map(default_map)


In [14]:
eda = BorrowerProfileEDA(df, target_col="loan_status_binary")


In [16]:
report = run_borrower_eda_pipeline(eda)

In [17]:
report["structure"]

Unnamed: 0,column,dtype,n_missing,missing_pct,n_unique
0,id,int64,0,0.0,887379
1,member_id,int64,0,0.0,887379
2,emp_title,object,51462,5.799326,299271
3,emp_length,object,44825,5.051393,11
4,home_ownership,object,0,0.0,6
5,annual_inc,float64,4,0.000451,49384
6,annual_inc_joint,float64,886868,99.942415,308
7,verification_status,object,0,0.0,3
8,verification_status_joint,object,886868,99.942415,3
9,zip_code,object,0,0.0,935


In [18]:
report["income"]

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
annual_inc,887375.0,75027.587761,64698.300142,0.0,45000.0,65000.0,90000.0,9500000.0
annual_inc_joint,511.0,109981.011585,52730.379847,17950.0,76032.5,101771.0,132800.0,500000.0


In [19]:
report["freqs"]

{'home_ownership': home_ownership
 MORTGAGE    443557
 RENT        356117
 OWN          87470
 OTHER          182
 NONE            50
 ANY              3
 Name: count, dtype: int64,
 'addr_state': addr_state
 CA    129517
 NY     74086
 TX     71138
 FL     60935
 IL     35476
 NJ     33256
 PA     31393
 OH     29631
 GA     29085
 VA     26255
 Name: count, dtype: int64,
 'purpose': purpose
 debt_consolidation    524215
 credit_card           206182
 home_improvement       51829
 other                  42894
 major_purchase         17277
 small_business         10377
 car                     8863
 medical                 8540
 moving                  5414
 vacation                4736
 Name: count, dtype: int64}

In [20]:
report["default_by_home_ownership"]

home_ownership
ANY         0.000000
MORTGAGE    0.060520
NONE        0.160000
OTHER       0.208791
OWN         0.064662
RENT        0.080395
Name: loan_status_binary, dtype: float64

In [21]:
report["default_by_purpose"]

purpose
car                   0.062733
credit_card           0.051435
debt_consolidation    0.071745
educational           0.208038
home_improvement      0.061471
house                 0.102509
major_purchase        0.067662
medical               0.087588
moving                0.104174
other                 0.089826
renewable_energy      0.111304
small_business        0.164017
vacation              0.077069
wedding               0.121858
Name: loan_status_binary, dtype: float64