In [1]:
# ==============================================================================
# PROJECT SETUP & LIBRARY IMPORT
# ==============================================================================
# This cell adds the project root to the Python path to ensure that the
# `risk_governance_kit` library can be imported correctly.

import os
import sys

# The notebook is run from the notebooks folder, so the project root is the
# parent of the current working directory.
project_root = os.path.abspath(os.pardir)

# Register the project root at the front of the import search path, but only
# once, so re-running this cell does not stack duplicate entries.
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)
    print(f"✅ Project root added to path: {project_root}")

# Now that the path is set, we can import our library
from risk_governance_kit import data_profiling_report

✅ Project root added to path: c:\Users\Christina\Dropbox\PERSONAL\YUVARAJ\PROJECTS\credit_risk_major


In [2]:
# ===================================================================================
# SECTION 1: SETUP AND DATA LOADING
# ===================================================================================

# Core libraries

import os
import openpyxl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from matplotlib.ticker import PercentFormatter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# --- PANDAS DISPLAY OPTIONS ---
# Raise pandas' display limits so wide and long frames are printed in full
# rather than being truncated.
pd.options.display.max_columns = 500  # show up to 500 columns
pd.options.display.max_rows = 500     # show up to 500 rows
pd.options.display.width = 1000       # display width, in characters

# 1. Locate all input files
# The path is relative to the notebook's working directory. Check it up front:
# assigning a string can never raise FileNotFoundError, so the existence test
# has to be explicit for the message below to be reachable.
at = '../data/application_train.csv'
if not os.path.isfile(at):
    raise FileNotFoundError(
        f"application_train.csv not found at '{os.path.abspath(at)}'. "
        "Please check the file path."
    )
print("File paths set successfully.")



# 2. Read from the input files and convert them to dataframes
application_train = pd.read_csv(at)

# ==============================================================================
# STEP 1: THE 30,000-FOOT VIEW
# ==============================================================================

# Headline facts about the raw table: its dimensions, the class balance of
# TARGET (as proportions), and a peek at the first rows.
print(
    "\n--- STEP 1: Macro Sanity Check ---",
    f"Shape of the dataset: {application_train.shape}",
    sep="\n",
)
print(
    "\nOverall Default Rate:",
    application_train['TARGET'].value_counts(normalize=True),
    sep="\n",
)
print(
    "\nFirst 5 rows of data:",
    application_train.head(),
    sep="\n",
)



# ==============================================================================
# STEP 2: AUTOMATED DATA PROFILING & UNIVARIATE REPORT (with Excel Export)
# ==============================================================================

print("\n--- STEP 2: Generating Full Automated Data Quality Report ---")


# Same import as in the first cell; a harmless repeat that keeps this step's
# dependency visible next to its use.
from risk_governance_kit import data_profiling_report

# --- EXECUTION & EXPORT ---

# 1. Generate the full report for every column of the application data.
# NOTE(review): the result is used below via .to_excel() / .head() / len(), so it
# is presumably a pandas DataFrame - confirm against risk_governance_kit.
quality_report = data_profiling_report(application_train, df_name="Application Train Data")

# 2. Define the output path for the Excel file
# Built relative to the project root (the parent of the notebook's working
# directory) so the notebook is not tied to one machine's folder layout.
output_path = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir, "outputs", "reports", "Data_Quality_Report.xlsx")
)

# 3. Export the DataFrame to an Excel file
# pandas writes .xlsx files through the openpyxl package, so it must be installed:
#   python -m pip install openpyxl
try:
    # Create the reports folder on first run; to_excel does not create it.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    quality_report.to_excel(output_path, sheet_name='Data Quality Summary', index=True)
except ImportError as e:
    # Only a missing Excel engine should produce the "install openpyxl" hint.
    print(f"\nERROR: Could not export to Excel. Please ensure 'openpyxl' is installed.")
    print(f"You can install it by running this in your terminal: python -m pip install openpyxl")
    print(f"The error was: {e}")
except Exception as e:
    # Anything else (file open in Excel, read-only folder, bad path, ...) is
    # reported as such; the export stays best-effort and the cell keeps running.
    print(f"\nERROR: Could not export to Excel at: {output_path}")
    print("Check that the file is not open in another program and that the folder is writable.")
    print(f"The error was: {e}")
else:
    print(f"\nSUCCESS: Full data quality report for all {len(quality_report)} columns has been exported to:")
    print(output_path)


# 4. (Optional) Display a snippet in the notebook for a quick preview
print("\nShowing a preview of the first 10 rows of the full report:")
print(quality_report.head(10))




# ==============================================================================
# STEP 3: BIVARIATE REPORT of each feature vs. default rate (one plot per feature)
# ==============================================================================

# Default chart folder: <project root>/outputs/charts/bivariate_plots, where the
# project root is taken to be the parent of the current working directory (the
# notebooks folder). This replaces a machine-specific absolute path.
output_dir = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir, "outputs", "charts", "bivariate_plots")
)


def _categorical_default_rates(df, col, target_col, top_n=20):
    """Mean of `target_col` per category of `col`, highest first.

    Missing values of `col` are kept as their own category. Returns a tuple
    `(rates, truncated)` where `rates` holds at most `top_n` categories and
    `truncated` tells whether any categories were cut off.
    """
    rates = df.groupby(col, dropna=False)[target_col].mean().sort_values(ascending=False)
    truncated = len(rates) > top_n
    return (rates.head(top_n) if truncated else rates), truncated


def _decile_default_rates(df, col, target_col, n_bins=10):
    """Mean of `target_col` per quantile bin of `col`, without modifying `df`.

    Bins are integer codes from `pd.qcut`; duplicate bin edges are dropped, so
    fewer than `n_bins` bins may come back. Rows where `col` is missing get no
    bin and are left out of the result.
    """
    bin_codes = pd.qcut(df[col], n_bins, labels=False, duplicates='drop')
    return df[target_col].groupby(bin_codes).mean()


def generate_bivariate_plots(df, target_col, output_dir = output_dir):
    """
    Automates the creation and saving of bivariate analysis plots for all features.

    For every predictor column one PNG named `<column>_bivariate_plot.png` is
    written to `output_dir`:

    * non-numeric columns, and columns with fewer than 20 distinct values, get a
      bar chart of the mean of `target_col` per category (top 20 categories);
    * all other columns get a line chart of the mean of `target_col` per decile.

    The input DataFrame is only read; no helper columns are added to it.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the predictors and the target column.
    target_col : str
        Name of the target column whose mean is plotted (the default rate for a
        0/1 target).
    output_dir : str, optional
        Folder the charts are saved to; created if it does not exist.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If `target_col` is not a column of `df`.
    """
    # Fail once, clearly, instead of printing one swallowed error per feature.
    if target_col not in df.columns:
        raise KeyError(f"Target column '{target_col}' not found in the dataframe.")

    print("--- Starting Bivariate Plot Generation ---")
    os.makedirs(output_dir, exist_ok=True)
    print(f"Charts will be saved to: {output_dir}")

    # The '_Binned' filter is kept for backward compatibility: frames processed by
    # earlier versions of this function may still carry '<col>_Binned' helpers.
    predictors = [col for col in df.columns if col not in ['SK_ID_CURR', target_col] and '_Binned' not in col]
    total_cols = len(predictors)

    for i, col in enumerate(predictors):
        print(f"Processing ({i+1}/{total_cols}): {col}")
        fig, ax = plt.subplots(figsize=(12, 7))

        try:
            feature = df[col]
            # Anything that is not numeric (object, category, string, datetime)
            # cannot be quantile-binned, so it is treated as categorical too.
            is_categorical = (not pd.api.types.is_numeric_dtype(feature)) or feature.nunique() < 20

            if is_categorical:
                # --- CATEGORICAL PLOT ---
                analysis, truncated = _categorical_default_rates(df, col, target_col, top_n=20)
                if truncated:
                    plot_title = f'Default Rate by Top 20 Categories of {col}'
                else:
                    plot_title = f'Default Rate by {col}'

                # hue mirrors x so the palette can be applied without a legend.
                category_labels = analysis.index.astype(str)
                sns.barplot(x=category_labels, y=analysis.values, ax=ax, palette="viridis", hue=category_labels, legend=False)

                ax.tick_params(axis='x', rotation=45)
                ax.set_ylabel('Default Rate')

            else:
                # --- NUMERIC PLOT ---
                analysis = _decile_default_rates(df, col, target_col, n_bins=10)

                sns.lineplot(x=analysis.index, y=analysis.values, marker='o', ax=ax, color='navy')
                ax.set_xlabel(f'{col} Decile (0 = Lowest, 9 = Highest)')
                ax.set_ylabel('Default Rate')
                plot_title = f'Default Rate by Decile of {col}'
                ax.grid(True)

            ax.set_title(plot_title, fontsize=16, fontweight='bold', pad=20)
            fig.tight_layout()

            file_path = os.path.join(output_dir, f'{col}_bivariate_plot.png')
            fig.savefig(file_path, dpi=150)

        except Exception as e:
            # Best effort: one bad column must not stop the remaining plots.
            print(f"    --> Could not generate plot for {col}. Error: {e}")
        finally:
            # Always release the figure; many open figures exhaust memory.
            plt.close(fig)

    print("\n--- Bivariate Plot Generation Complete ---")

# Run the bivariate analysis on the application data; charts are written to disk.
generate_bivariate_plots(df=application_train, target_col='TARGET')

# --- FINAL SUMMARY ---
# Headline EDA findings followed by pointers to the generated report and charts,
# printed one item per line.
print(
    "\n--- Initial EDA Summary ---",
    "1. The portfolio default rate is approximately {:.2%}".format(application_train['TARGET'].mean()),
    "2. Several columns related to apartment/building specifics have >50% missing data and are likely unusable without significant imputation.",
    "3. External bureau scores (like EXT_SOURCE_3) show a strong, monotonic relationship with default, confirming they are powerful predictors.",
    "4. Education level shows a clear risk ordering, with lower levels of education corresponding to higher default rates.",
    f"5. Data sanity and EDA analysis of each feature (univariate analysis) is saved as an excel file in '{output_path}'",
    f"6. Relationships of all the features with default rate are saved as 120 plots in '{output_dir}'",
    sep="\n",
)






File paths set successfully.

--- STEP 1: Macro Sanity Check ---
Shape of the dataset: (307511, 122)

Overall Default Rate:
TARGET
0    0.919271
1    0.080729
Name: proportion, dtype: float64

First 5 rows of data:
   SK_ID_CURR  TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY  CNT_CHILDREN  AMT_INCOME_TOTAL  AMT_CREDIT  AMT_ANNUITY  AMT_GOODS_PRICE NAME_TYPE_SUITE NAME_INCOME_TYPE            NAME_EDUCATION_TYPE    NAME_FAMILY_STATUS  NAME_HOUSING_TYPE  REGION_POPULATION_RELATIVE  DAYS_BIRTH  DAYS_EMPLOYED  DAYS_REGISTRATION  DAYS_ID_PUBLISH  OWN_CAR_AGE  FLAG_MOBIL  FLAG_EMP_PHONE  FLAG_WORK_PHONE  FLAG_CONT_MOBILE  FLAG_PHONE  FLAG_EMAIL OCCUPATION_TYPE  CNT_FAM_MEMBERS  REGION_RATING_CLIENT  REGION_RATING_CLIENT_W_CITY WEEKDAY_APPR_PROCESS_START  HOUR_APPR_PROCESS_START  REG_REGION_NOT_LIVE_REGION  REG_REGION_NOT_WORK_REGION  LIVE_REGION_NOT_WORK_REGION  REG_CITY_NOT_LIVE_CITY  REG_CITY_NOT_WORK_CITY  LIVE_CITY_NOT_WORK_CITY       ORGANIZATION_TYPE  EXT_SOURCE_1  

In [None]:
# Diagnostic cell: checks which names the `risk_governance_kit` package exposes.
# Run this cell after restarting the kernel
import risk_governance_kit
import importlib

# This forces a reload, bypassing the cache: the package is re-executed so that
# edits made on disk are picked up.
# NOTE(review): names bound earlier with `from risk_governance_kit import ...`
# keep pointing at the old objects and must be re-imported after a reload.
importlib.reload(risk_governance_kit)

# This will print a list of everything the package is exposing
# We are looking for 'data_profiling_report' in this list
print(dir(risk_governance_kit))