### 1: Load Clean Data & Imports

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from ydata_profiling import ProfileReport 
import warnings

# --- Setup ---
# Notebook-wide presentation settings; nothing in this section touches data.

# Silence all warnings for the remainder of the session.
warnings.simplefilter('ignore')

# pandas display limits: no cap on columns, at most 100 rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

# Plot defaults: seaborn "whitegrid" style, then a 10x6 default figure size.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

print("Libraries imported.")

# --- Load Processed Data ---
# The project root is taken to be the parent of the current working directory,
# and the processed CSVs are expected under <root>/data/processed.
ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))
PROCESSED_DATA_DIR = os.path.join(ROOT_DIR, 'data', 'processed')

X_path = os.path.join(PROCESSED_DATA_DIR, 'X_cleaned.csv')
y_path = os.path.join(PROCESSED_DATA_DIR, 'y_target.csv')

print(f"Loading features from: {X_path}")
print(f"Loading target from: {y_path}")

try:
    # Load features, using PROSPECTID as the index.
    X = pd.read_csv(X_path, index_col='PROSPECTID')

    # Load target with the same index; .squeeze('columns') turns a
    # single-column frame into a Series.
    y = pd.read_csv(y_path, index_col='PROSPECTID').squeeze('columns')
except FileNotFoundError as e:
    print("\n[ERROR] Files not found. Did you complete Phase 1?")
    print(e)
    # Re-raise after the hint: swallowing the error would leave X and y
    # undefined and let later cells fail with an unrelated NameError.
    raise

print("\n--- Data Loaded Successfully ---")

# --- Verification Step ---
# Kept outside the try block so only the file reads are covered by the
# FileNotFoundError handler.
print("\n--- X (Features) Info ---")
X.info()

# Class proportions of the target, ordered by class label.
print("\n--- y (Target) Imbalance (Finding 4) ---")
print(y.value_counts(normalize=True).sort_index())

Libraries imported.
Loading features from: /Users/yogeshdhaliya/Desktop/DS Learning/11. Projects/Credit-Risk-Prediction/data/processed/X_cleaned.csv
Loading target from: /Users/yogeshdhaliya/Desktop/DS Learning/11. Projects/Credit-Risk-Prediction/data/processed/y_target.csv

--- Data Loaded Successfully ---

--- X (Features) Info ---
<class 'pandas.core.frame.DataFrame'>
Index: 51336 entries, 1 to 51336
Data columns (total 58 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tot_enq                     45015 non-null  float64
 1   CC_enq                      45015 non-null  float64
 2   CC_enq_L6m                  45015 non-null  float64
 3   CC_enq_L12m                 45015 non-null  float64
 4   PL_enq                      45015 non-null  float64
 5   PL_enq_L6m                  45015 non-null  float64
 6   PL_enq_L12m                 45015 non-null  float64
 7   time_since_recent_enq       45015 non-null

### 2: Full Data Profile

In [2]:
# --- 1. Combine X and y for Profiling ---
# Build a new frame holding the features plus the target as one extra column
# (named after the target Series), so the profiler sees both together.
# X itself is not modified.
df_for_profiling = X.assign(**{y.name: y})

print(f"Combined dataframe for profiling. Shape: {df_for_profiling.shape}")

# --- 2. Generate the Profile Report ---
profile = ProfileReport(
    df_for_profiling,
    explorative=True,
    title="Credit Risk Data Profile (Phase 2)",
)

# --- 3. Save Report to File ---
# The report is written as HTML into the current working directory instead of
# being rendered inline, which keeps the notebook output small.
profile_path = os.path.join(os.getcwd(), 'credit_risk_profile.html')
profile.to_file(profile_path)

# Emit the confirmation lines one per print-equivalent line.
print(
    "\n--- SUCCESS! ---",
    "Full data profile has been generated and saved to:",
    profile_path,
    "\nPlease open this file in your browser to explore the data.",
    sep="\n",
)

Combined dataframe for profiling. Shape: (51336, 59)


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]


--- SUCCESS! ---
Full data profile has been generated and saved to:
/Users/yogeshdhaliya/Desktop/DS Learning/11. Projects/Credit-Risk-Prediction/notebooks/credit_risk_profile.html

Please open this file in your browser to explore the data.


In [3]:
# Preview the first five rows of the combined features + target frame.
df_for_profiling.head(n=5)

Unnamed: 0_level_0,tot_enq,CC_enq,CC_enq_L6m,CC_enq_L12m,PL_enq,PL_enq_L6m,PL_enq_L12m,time_since_recent_enq,enq_L12m,enq_L6m,enq_L3m,MARITALSTATUS,EDUCATION,AGE,GENDER,NETMONTHLYINCOME,Time_With_Curr_Empr,pct_of_active_TLs_ever,pct_opened_TLs_L6m_of_L12m,pct_currentBal_all_TL,CC_utilization,CC_Flag,PL_utilization,PL_Flag,pct_PL_enq_L6m_of_L12m,pct_CC_enq_L6m_of_L12m,pct_PL_enq_L6m_of_ever,pct_CC_enq_L6m_of_ever,max_unsec_exposure_inPct,HL_Flag,GL_Flag,last_prod_enq2,first_prod_enq2,Total_TL,Tot_Closed_TL,Tot_Active_TL,Total_TL_opened_L6M,Tot_TL_closed_L6M,pct_tl_open_L6M,pct_tl_closed_L6M,pct_active_tl,pct_closed_tl,Total_TL_opened_L12M,Tot_TL_closed_L12M,pct_tl_open_L12M,pct_tl_closed_L12M,Tot_Missed_Pmnt,Auto_TL,CC_TL,Consumer_TL,Gold_TL,Home_TL,PL_TL,Secured_TL,Unsecured_TL,Other_TL,Age_Oldest_TL,Age_Newest_TL,Approved_Flag
PROSPECTID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1
1,6.0,0.0,0.0,0.0,6.0,0.0,0.0,566.0,0.0,0.0,0.0,Married,12TH,48,M,51000,114,0.2,0.0,0.798,,0,0.798,1,0.0,0.0,0.0,0.0,13.333,1,0,PL,PL,5,4,1,0,0,0.0,0.0,0.2,0.8,0,0,0.0,0.0,0,0,0,0,1,0,4,1,4,0,72.0,18.0,P2
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,209.0,1.0,0.0,0.0,Single,GRADUATE,23,F,19000,50,1.0,0.0,0.37,,0,,0,0.0,0.0,0.0,0.0,0.86,0,0,ConsumerLoan,ConsumerLoan,1,0,1,0,0,0.0,0.0,1.0,0.0,1,0,1.0,0.0,0,0,0,1,0,0,0,0,1,0,7.0,7.0,P2
3,4.0,0.0,0.0,0.0,0.0,0.0,0.0,587.0,0.0,0.0,0.0,Married,SSC,40,M,18,191,1.0,0.5,0.585,,0,,0,0.0,0.0,0.0,0.0,5741.667,1,0,ConsumerLoan,others,8,0,8,1,0,0.125,0.0,1.0,0.0,2,0,0.25,0.0,1,1,0,6,1,0,0,2,6,0,47.0,2.0,P2
4,,,,,,,,,,,,Married,SSC,34,M,10000,246,1.0,1.0,0.99,,0,,0,0.0,0.0,0.0,0.0,9.9,0,0,others,others,1,0,1,1,0,1.0,0.0,1.0,0.0,1,0,1.0,0.0,1,0,0,0,0,0,0,0,1,1,5.0,5.0,P2
5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,3951.0,0.0,0.0,0.0,Married,POST-GRADUATE,48,M,15000,75,0.333,0.0,0.0,,0,,0,0.0,0.0,0.0,0.0,,0,0,AL,AL,3,2,1,0,0,0.0,0.0,0.333,0.667,0,0,0.0,0.0,0,1,0,0,0,0,0,3,0,2,131.0,32.0,P1
