### 1: Load Clean Data & Imports

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from ydata_profiling import ProfileReport 
import warnings

# --- Setup ---
# NOTE(review): this silences *every* warning for the rest of the kernel session,
# including pandas deprecation/dtype warnings. Consider narrowing it to specific
# categories once the noisy sources are known.
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (10, 6)

print("Libraries imported.")

# --- Load Processed Data ---
# The project root is taken to be the parent of the current working directory,
# so this cell assumes the kernel was started from inside the project's
# notebooks folder.
ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))
PROCESSED_DATA_DIR = os.path.join(ROOT_DIR, 'data', 'processed')

X_path = os.path.join(PROCESSED_DATA_DIR, 'X_cleaned.csv')
y_path = os.path.join(PROCESSED_DATA_DIR, 'y_target.csv')

print(f"Loading features from: {X_path}")
print(f"Loading target from: {y_path}")

# Only the two reads live inside the try block, so the handler cannot mask
# errors coming from the verification code further down.
try:
    # Load features, ensuring PROSPECTID is our index
    X = pd.read_csv(X_path, index_col='PROSPECTID')

    # Load target, set index, and use .squeeze() to turn the single remaining
    # column into a Series
    y = pd.read_csv(y_path, index_col='PROSPECTID').squeeze('columns')
except FileNotFoundError as e:
    print("\n[ERROR] Files not found. Did you complete Phase 1?")
    print(e)
    # Re-raise so the notebook stops here. Swallowing the error would leave
    # X / y undefined (or stale from an earlier run) and push the failure into
    # a later cell as a confusing NameError.
    raise

print("\n--- Data Loaded Successfully ---")

# --- Verification Step ---
print("\n--- X (Features) Info ---")
X.info()

# Class proportions of the target, ordered by class label
print("\n--- y (Target) Imbalance (Finding 4) ---")
print(y.value_counts(normalize=True).sort_index())

Libraries imported.
Loading features from: /Users/yogeshdhaliya/Desktop/DS Learning/11. Projects/Credit-Risk-Prediction/data/processed/X_cleaned.csv
Loading target from: /Users/yogeshdhaliya/Desktop/DS Learning/11. Projects/Credit-Risk-Prediction/data/processed/y_target.csv

--- Data Loaded Successfully ---

--- X (Features) Info ---
<class 'pandas.core.frame.DataFrame'>
Index: 51336 entries, 1 to 51336
Data columns (total 58 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tot_enq                     45015 non-null  float64
 1   CC_enq                      45015 non-null  float64
 2   CC_enq_L6m                  45015 non-null  float64
 3   CC_enq_L12m                 45015 non-null  float64
 4   PL_enq                      45015 non-null  float64
 5   PL_enq_L6m                  45015 non-null  float64
 6   PL_enq_L12m                 45015 non-null  float64
 7   time_since_recent_enq       45015 non-null

### 2: Full Data Profile

In [2]:
# --- 1. Combine X and y for Profiling ---
# The profiler works best when it can see features and target together.
# Column assignment aligns y to X on the index, so a mismatch would silently
# produce NaN targets (and a name clash would silently overwrite a feature).
# The checks below fail fast instead of profiling corrupted data.
if not isinstance(y, pd.Series) or y.name is None:
    raise TypeError("Expected y to be a named pandas Series (single target column).")
if y.name in X.columns:
    raise ValueError(
        f"Target column {y.name!r} already exists in X; refusing to overwrite it."
    )
ids_without_target = X.index.difference(y.index)
if len(ids_without_target) > 0:
    raise ValueError(
        f"{len(ids_without_target)} index values in X have no matching target in y."
    )

# Work on a copy so the feature frame X is left untouched for later cells
df_for_profiling = X.copy()
df_for_profiling[y.name] = y

print(f"Combined dataframe for profiling. Shape: {df_for_profiling.shape}")

# --- 2. Generate the Profile Report ---
profile = ProfileReport(
    df_for_profiling,
    title="Credit Risk Data Profile (Phase 2)",
    explorative=True
)

# --- 3. Save Report to File ---
# The report is written as HTML into the current working directory (the
# notebooks folder when the kernel is started from there) rather than rendered
# inline. This avoids flooding the notebook with a huge output.
profile_path = os.path.join(os.getcwd(), 'credit_risk_profile.html')
profile.to_file(profile_path)

print("\n--- SUCCESS! ---")
print("Full data profile has been generated and saved to:")
print(f"{profile_path}")
print("\nPlease open this file in your browser to explore the data.")

Combined dataframe for profiling. Shape: (51336, 59)


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]


--- SUCCESS! ---
Full data profile has been generated and saved to:
/Users/yogeshdhaliya/Desktop/DS Learning/11. Projects/Credit-Risk-Prediction/notebooks/credit_risk_profile.html

Please open this file in your browser to explore the data.
