### 1: Load Clean Data & Imports

In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from ydata_profiling import ProfileReport 
import warnings

# --- Setup ---
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (10, 6)

print("Libraries imported.")

# --- Load Processed Data ---

ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))
PROCESSED_DATA_DIR = os.path.join(ROOT_DIR, 'data', 'processed')

X_path = os.path.join(PROCESSED_DATA_DIR, 'X_cleaned.csv')
y_path = os.path.join(PROCESSED_DATA_DIR, 'y_target.csv')

print(f"Loading features from: {X_path}")
print(f"Loading target from: {y_path}")

try:
    # Load features, ensuring PROSPECTID is our index
    X = pd.read_csv(X_path, index_col='PROSPECTID')
    
    # Load target, set index, and use .squeeze() to turn it into a Series
    y = pd.read_csv(y_path, index_col='PROSPECTID').squeeze('columns')
    
    print("\n--- Data Loaded Successfully ---")
    
    # --- Verification Step ---
    print("\n--- X (Features) Info ---")
    X.info()
    
    print(f"\n--- y (Target) Imbalance (Finding 4) ---")
    print(y.value_counts(normalize=True).sort_index())

except FileNotFoundError as e:
    print(f"\n[ERROR] Files not found. Did you complete Phase 1?")
    print(e)

Libraries imported.
Loading features from: /Users/yogeshdhaliya/Desktop/DS Learning/11. Projects/Credit-Risk-Prediction/data/processed/X_cleaned.csv
Loading target from: /Users/yogeshdhaliya/Desktop/DS Learning/11. Projects/Credit-Risk-Prediction/data/processed/y_target.csv

--- Data Loaded Successfully ---

--- X (Features) Info ---
<class 'pandas.core.frame.DataFrame'>
Index: 51336 entries, 1 to 51336
Data columns (total 58 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tot_enq                     45015 non-null  float64
 1   CC_enq                      45015 non-null  float64
 2   CC_enq_L6m                  45015 non-null  float64
 3   CC_enq_L12m                 45015 non-null  float64
 4   PL_enq                      45015 non-null  float64
 5   PL_enq_L6m                  45015 non-null  float64
 6   PL_enq_L12m                 45015 non-null  float64
 7   time_since_recent_enq       45015 non-null

### 2: Full Data Profile

In [3]:
# --- 1. Combine X and y for Profiling ---
# The profiler works best when it can see features and target together
df_for_profiling = X.copy()
df_for_profiling[y.name] = y

print(f"Combined dataframe for profiling. Shape: {df_for_profiling.shape}")

# --- 2. Generate the Profile Report ---
profile = ProfileReport(
    df_for_profiling, 
    title="Credit Risk Data Profile (Phase 2)",
    explorative=True
)

# --- 3. Save Report to File ---
# We save to an HTML file in the *current* /notebooks folder.
# This avoids flooding the notebook with a huge output.
profile_path = os.path.join(os.getcwd(), 'credit_risk_profile.html')
profile.to_file(profile_path)

print(f"\n--- SUCCESS! ---")
print(f"Full data profile has been generated and saved to:")
print(f"{profile_path}")
print("\nPlease open this file in your browser to explore the data.")

Combined dataframe for profiling. Shape: (51336, 59)


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Exception in thread Thread-5:
Traceback (most recent call last):
  File "/Users/yogeshdhaliya/anaconda3/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/Users/yogeshdhaliya/Desktop/DS Learning/11. Projects/Credit-Risk-Prediction/venv/lib/python3.11/site-packages/tqdm/_monitor.py", line 84, in run
    instance.refresh(nolock=True)
  File "/Users/yogeshdhaliya/Desktop/DS Learning/11. Projects/Credit-Risk-Prediction/venv/lib/python3.11/site-packages/tqdm/std.py", line 1347, in refresh
    self.display()
  File "/Users/yogeshdhaliya/Desktop/DS Learning/11. Projects/Credit-Risk-Prediction/venv/lib/python3.11/site-packages/tqdm/notebook.py", line 171, in display
    rtext.value = right
    ^^^^^^^^^^^
  File "/Users/yogeshdhaliya/Desktop/DS Learning/11. Projects/Credit-Risk-Prediction/venv/lib/python3.11/site-packages/traitlets/traitlets.py", line 716, in __set__
    self.set(obj, value)
  File "/Users/yogeshdhaliya/Desktop/DS Learning/11. Projects/Credit-

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]


--- SUCCESS! ---
Full data profile has been generated and saved to:
/Users/yogeshdhaliya/Desktop/DS Learning/11. Projects/Credit-Risk-Prediction/notebooks/credit_risk_profile.html

Please open this file in your browser to explore the data.


### 3: Categorical Feature Analysis

In [5]:
# --- 1. Define our column lists based on X.info() ---
categorical_cols = [
    'MARITALSTATUS', 
    'EDUCATION', 
    'GENDER', 
    'last_prod_enq2', 
    'first_prod_enq2'
]

# Numerical cols are everything else
numerical_cols = [col for col in X.columns if col not in categorical_cols]

print(f"Identified {len(categorical_cols)} categorical columns.")
print(f"Identified {len(numerical_cols)} numerical columns.")

# --- 2. Validate Finding 3: Ordinal vs. Nominal ---

# First, inspect 'EDUCATION' to confirm its ordinal nature
print("\n--- Finding 3: Ordinal Feature Check ('EDUCATION') ---")
print(X['EDUCATION'].value_counts())
print("=> CONFIRMED: 'EDUCATION' has a clear order (SSC/OTHERS < 12TH < GRADUATE < POST GRADUATE < PROFESSIONAL). This is ordinal.")

# Next, inspect the remaining nominal features
print("\n--- Nominal Feature Check (Others) ---")
for col in categorical_cols:
    if col != 'EDUCATION':
        print(f"\n--- Feature: '{col}' ---")
        print(X[col].value_counts())
        print(f"=> CONFIRMED: '{col}' has no inherent order. This is nominal.")

Identified 5 categorical columns.
Identified 53 numerical columns.

--- Finding 3: Ordinal Feature Check ('EDUCATION') ---
EDUCATION
GRADUATE          16673
12TH              14467
SSC                9276
UNDER GRADUATE     5492
OTHERS             2917
POST-GRADUATE      2242
PROFESSIONAL        269
Name: count, dtype: int64
=> CONFIRMED: 'EDUCATION' has a clear order (OTHERS < 12TH < SSC < GRADUATE < PROFESSIONAL). This is ordinal.

--- Nominal Feature Check (Others) ---

--- Feature: 'MARITALSTATUS' ---
MARITALSTATUS
Married    37752
Single     13584
Name: count, dtype: int64
=> CONFIRMED: 'MARITALSTATUS' has no inherent order. This is nominal.

--- Feature: 'GENDER' ---
GENDER
M    45245
F     6091
Name: count, dtype: int64
=> CONFIRMED: 'GENDER' has no inherent order. This is nominal.

--- Feature: 'last_prod_enq2' ---
last_prod_enq2
others          20831
ConsumerLoan    17793
PL               7959
CC               2339
AL               1511
HL                903
Name: count, dtype

### 4: Statistical Test (Categorical vs. Target)

In [6]:
from scipy.stats import chi2_contingency

print("\n--- Chi-Square Test (Categorical vs. Target) ---")
print("Null Hypothesis: Feature is independent of the Target (Approved_Flag)")
print("We are looking for a p-value < 0.05 to reject the null.\n")

chi2_results = []
for col in categorical_cols:
    # Create a contingency table (crosstab)
    contingency_table = pd.crosstab(X[col], y)
    
    # Run the test
    chi2_stat, p_value, _, _ = chi2_contingency(contingency_table)
    
    chi2_results.append({
        'Feature': col,
        'Chi2 Statistic': chi2_stat,
        'p-value': p_value
    })

# Display results
df_chi2 = pd.DataFrame(chi2_results).sort_values(by='p-value', ascending=True)
print(df_chi2)
print("\n=> Interpretation: Features with very low p-values are statistically significant predictors.")


--- Chi-Square Test (Categorical vs. Target) ---
Null Hypothesis: Feature is independent of the Target (Approved_Flag)
We are looking for a p-value < 0.05 to reject the null.

           Feature  Chi2 Statistic        p-value
3   last_prod_enq2     3728.514629   0.000000e+00
4  first_prod_enq2     1833.116465   0.000000e+00
0    MARITALSTATUS     1188.129358  2.758830e-257
1        EDUCATION      225.246136   8.464676e-38
2           GENDER       19.229726   2.450668e-04

=> Interpretation: Features with very low p-values are statistically significant predictors.


### 5: Numerical Feature Analysis (Stats & Distribution)

In [7]:
# --- 1. Descriptive Statistics ---
print("\n--- Numerical Features: Descriptive Statistics ---")
# .T transposes the output to be more readable
print(X[numerical_cols].describe().T)

# --- 2. Visualize Key Distributions ---
print("\n--- Visualizing Key Numerical Distributions ---")

# Plot AGE
plt.figure(figsize=(12, 5))
sns.kdeplot(data=X, x='AGE', hue=y, common_norm=False, fill=True)
plt.title('Distribution of AGE by Risk Category', fontsize=16)
plt.xlabel('Age')
plt.show()

# Plot NETMONTHLYINCOME
# This will be very skewed, so we'll plot its log-transform
plt.figure(figsize=(12, 5))
# Use np.log1p(x) which is log(1+x) to handle any zero incomes
sns.kdeplot(data=X, x=np.log1p(X['NETMONTHLYINCOME']), hue=y, common_norm=False, fill=True)
plt.title('Distribution of Log(NETMONTHLYINCOME) by Risk Category', fontsize=16)
plt.xlabel('Log(Net Monthly Income)')
plt.show()


--- Numerical Features: Descriptive Statistics ---
                              count          mean           std   min  \
tot_enq                     45015.0      5.291036      6.178414   1.0   
CC_enq                      45015.0      0.467266      1.666951   0.0   
CC_enq_L6m                  45015.0      0.158903      0.673070   0.0   
CC_enq_L12m                 45015.0      0.260846      0.998968   0.0   
PL_enq                      45015.0      1.135177      2.330629   0.0   
PL_enq_L6m                  45015.0      0.506720      1.353613   0.0   
PL_enq_L12m                 45015.0      0.757659      1.769592   0.0   
time_since_recent_enq       45015.0    260.051649    462.042665   0.0   
enq_L12m                    45015.0      3.011107      4.218634   0.0   
enq_L6m                     45015.0      1.995290      3.121818   0.0   
enq_L3m                     45015.0      1.241719      2.063165   0.0   
AGE                         51336.0     33.758532      8.816364  21.0   

### 6: Statistical Test (Numerical vs. Target)

In [8]:
from scipy.stats import f_oneway

print("\n--- ANOVA F-Test (Numerical vs. Target) ---")
print("Null Hypothesis: The mean of the feature is the same for all target classes (P1-P4)")
print("We are looking for a p-value < 0.05 to reject the null.\n")

anova_results = []
for col in numerical_cols:
    # ANOVA cannot handle NaN values, so we must drop them *for the test*
    # We create a list of Series, one for each target class
    groups = [
        X[y == target_class][col].dropna() 
        for target_class in y.unique()
    ]
    
    # Check if we have data after dropping NaNs
    if all(len(g) > 0 for g in groups):
        f_stat, p_value = f_oneway(*groups)
        anova_results.append({
            'Feature': col,
            'F-Statistic': f_stat,
            'p-value': p_value
        })
    else:
        anova_results.append({
            'Feature': col,
            'F-Statistic': np.nan,
            'p-value': np.nan  # Not enough data to test
        })

# Display results
df_anova = pd.DataFrame(anova_results).sort_values(by='p-value', ascending=True)
print(df_anova)
print("\n=> Interpretation: Features with very low p-values have means that are")
print("   statistically different across the P1-P4 classes.")


--- ANOVA F-Test (Numerical vs. Target) ---
Null Hypothesis: The mean of the feature is the same for all target classes (P1-P4)
We are looking for a p-value < 0.05 to reject the null.

                       Feature  F-Statistic        p-value
0                      tot_enq  1690.485944   0.000000e+00
28                    Total_TL  1190.662645   0.000000e+00
24      pct_CC_enq_L6m_of_ever   770.204842   0.000000e+00
23      pct_PL_enq_L6m_of_ever  3672.838162   0.000000e+00
22      pct_CC_enq_L6m_of_L12m   777.634453   0.000000e+00
21      pct_PL_enq_L6m_of_L12m  3530.225183   0.000000e+00
29               Tot_Closed_TL  1019.098298   0.000000e+00
30               Tot_Active_TL   649.567886   0.000000e+00
33             pct_tl_open_L6M   941.447777   0.000000e+00
35               pct_active_tl   513.974136   0.000000e+00
36               pct_closed_tl   513.974136   0.000000e+00
39            pct_tl_open_L12M   967.962159   0.000000e+00
14      pct_of_active_TLs_ever   513.974136   0

### 7: Manual Missing Data Analysis

In [9]:
print("\n--- Missing Data Analysis ---")
missing_pct = X.isna().mean() * 100
missing_pct_sorted = missing_pct[missing_pct > 0].sort_values(ascending=False)

print("Percentage of missing values per feature (showing only > 0%):")
print(missing_pct_sorted)

print("\n--- Missing Data Groups Identified ---")
print(f"Major Missingness (> 80%): {missing_pct_sorted[missing_pct_sorted > 80].index.tolist()}")
print(f"Moderate Missingness (20-80%): {missing_pct_sorted[(missing_pct_sorted > 20) & (missing_pct_sorted < 80)].index.tolist()}")
print(f"Systematic Missingness (~12%): {missing_pct_sorted[(missing_pct_sorted > 10) & (missing_pct_sorted < 20)].index.tolist()}")


--- Missing Data Analysis ---
Percentage of missing values per feature (showing only > 0%):
CC_utilization              92.792582
PL_utilization              86.557192
max_unsec_exposure_inPct    45.149603
tot_enq                     12.312997
CC_enq                      12.312997
enq_L3m                     12.312997
enq_L6m                     12.312997
enq_L12m                    12.312997
time_since_recent_enq       12.312997
PL_enq_L12m                 12.312997
PL_enq_L6m                  12.312997
PL_enq                      12.312997
CC_enq_L12m                 12.312997
CC_enq_L6m                  12.312997
pct_currentBal_all_TL        0.140252
Age_Oldest_TL                0.077918
Age_Newest_TL                0.077918
dtype: float64

--- Missing Data Groups Identified ---
Major Missingness (> 80%): ['CC_utilization', 'PL_utilization']
Moderate Missingness (20-80%): ['max_unsec_exposure_inPct']
Systematic Missingness (~12%): ['tot_enq', 'CC_enq', 'enq_L3m', 'enq_L6m', 'enq_L1

### 8: Feature Pruning

In [10]:
# --- 1. Define Columns to Drop (Based on Phase 2 EDA) ---
cols_to_drop = [
    # Group 1: Major Missingness (>80%)
    'CC_utilization',
    'PL_utilization',
    
    # Group 2: Not Statistically Significant (High p-value)
    'max_unsec_exposure_inPct',
    'pct_currentBal_all_TL'
]

print(f"Original X shape: {X.shape}")
print(f"Dropping {len(cols_to_drop)} columns identified in EDA...")

# --- 2. Drop the Columns ---
X_pruned = X.drop(columns=cols_to_drop)

print(f"New pruned X shape: {X_pruned.shape}")

# --- 3. Overwrite Processed Files (Our New Checkpoint) ---
# We are overwriting our "cleaned" file with this new,
# even cleaner, pruned version.
X_pruned_path = os.path.join(PROCESSED_DATA_DIR, 'X_cleaned.csv')
y_path = os.path.join(PROCESSED_DATA_DIR, 'y_target.csv')

X_pruned.to_csv(X_pruned_path)
y.to_csv(y_path) # Re-save y as well to keep them paired

print(f"\n--- PHASE 2 COMPLETE ---")
print(f"Pruned X (54 features) saved to: {X_pruned_path}")
print(f"Target y saved to: {y_path}")

Original X shape: (51336, 58)
Dropping 4 columns identified in EDA...
New pruned X shape: (51336, 54)

--- PHASE 2 COMPLETE ---
Pruned X (54 features) saved to: /Users/yogeshdhaliya/Desktop/DS Learning/11. Projects/Credit-Risk-Prediction/data/processed/X_cleaned.csv
Target y saved to: /Users/yogeshdhaliya/Desktop/DS Learning/11. Projects/Credit-Risk-Prediction/data/processed/y_target.csv


**Phase:2 EDA**.

Our goal was to deeply analyze the leak-free data, validate our project's Key Findings, and build a clear plan for preprocessing.

1.  **Categorical Feature Analysis (Finding 3):**
    * We manually inspected all 5 `object` columns (`EDUCATION`, `MARITALSTATUS`, `GENDER`, etc.).
    * We validated `EDUCATION` has a clear ordinal structure (which we will custom-map) and the rest are nominal (which we will one-hot encode).
    * A **Chi-Square test** proved that all 5 categorical features are **statistically significant** predictors of the target, so we will keep all of them.

2.  **Numerical Feature Analysis:**
    * We analyzed all 53 numerical features and found **massive outliers** (e.g., in `NETMONTHLYINCOME`). This confirms our decision to use `RobustScaler` (which is not sensitive to outliers) in our pipeline.
    * An **ANOVA test** showed that almost all numerical features are **statistically significant**.

3.  **Missing Data Strategy & Feature Pruning (Our Main Action):**
    * We identified three distinct groups of missing data and created a clear strategy:
    * **DROP (Major Missingness):** We dropped `CC_utilization` and `PL_utilization` because they were over 86% missing and unusable.
    * **DROP (Not Significant):** We dropped `max_unsec_exposure_inPct` and `pct_currentBal_all_TL` because our ANOVA test showed they had **no statistical relationship** with the target (high p-values).
    * **IMPUTE PLAN (for Phase 3):** We confirmed the 11 "enquiry" columns should be imputed with `0` (as `NaN` means "zero enquiries") and 2 minor "Age" columns will be imputed with their `median`.

**Final Outcome:** We concluded Phase 2 by saving a new, pruned `X_cleaned.csv` file with 54 high-quality features, ready for **Phase 3: Feature Engineering**.