# In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.impute import SimpleImputer

# Read the raw data from disk.
df = pd.read_csv("Dataset.csv")

# Columns that are expected to hold numbers. Re-parse each one so that any
# entry which cannot be read as a number becomes NaN instead of raising.
numeric_fix = [
    'Client_Income', 'Credit_Amount', 'Loan_Annuity', 'Age_Days',
    'Employed_Days', 'Registration_Days', 'ID_Days', 'Score_Source_3',
]
df[numeric_fix] = df[numeric_fix].apply(pd.to_numeric, errors='coerce')

# Separate features by dtype.
# Numeric features: the 64-bit integer/float columns.
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
# Categorical (text) features. Older pandas stores text as object dtype, while
# pandas 3 reads text columns as the dedicated string dtype by default, so both
# are requested explicitly to keep this list populated on either version.
categorical_cols = df.select_dtypes(include=['object', 'string']).columns.tolist()

# --- 1. Missing Value Summary ---
# Count missing entries per column, keep only the columns that have at least
# one, and order them from most to fewest missing.
null_counts = df.isna().sum()
has_missing = null_counts > 0
missing_summary = null_counts[has_missing].sort_values(ascending=False)

print("===== MISSING VALUES =====")
print(missing_summary)

# --- 2. Correlation Matrix (Numerical Only) ---
# Pairwise correlation between the numeric columns, using pandas' default
# correlation method.
numeric_frame = df.loc[:, numerical_cols]
correlation_matrix = numeric_frame.corr()

# Collect the most strongly correlated column pairs
def top_corr_pairs(corr_matrix, threshold=0.8):
    """Return column pairs whose absolute correlation exceeds *threshold*.

    Only the cells strictly below the diagonal of ``corr_matrix`` are read, so
    each pair is reported once and the diagonal is skipped.

    Args:
        corr_matrix: Square DataFrame of pairwise correlations.
        threshold: Cut-off; a pair is kept when ``abs(value) > threshold``.

    Returns:
        A list of ``(row_label, column_label, value)`` tuples ordered by
        descending absolute value.
    """
    labels = corr_matrix.columns
    size = len(labels)

    # Walk the strict lower triangle row by row.
    lower_triangle = (
        (row, col, corr_matrix.iloc[row, col])
        for row in range(size)
        for col in range(row)
    )
    strong = [
        (labels[row], labels[col], value)
        for row, col, value in lower_triangle
        if abs(value) > threshold
    ]
    # Stable sort: pairs with equal |value| keep their traversal order.
    strong.sort(key=lambda pair: abs(pair[2]), reverse=True)
    return strong

# Strongly correlated pairs at the function's default threshold.
high_corr = top_corr_pairs(correlation_matrix)

print("\n===== HIGH CORRELATIONS (|r| > 0.8) =====")
for feature_a, feature_b, r_value in high_corr:
    print(f"{feature_a} ↔ {feature_b} = {r_value:.2f}")

# --- 3. Multicollinearity Check (VIF) ---
# Keep only columns with at least two distinct non-missing values:
#   * a constant column has no variance to inflate, so its VIF is undefined;
#   * an entirely-missing column is discarded by SimpleImputer, which would
#     leave fewer output columns than labels and break the DataFrame below.
vif_features = [col for col in numerical_cols if df[col].nunique() > 1]

# Impute missing numeric values for VIF calc
imputer = SimpleImputer(strategy='median')
if vif_features:
    X_num = pd.DataFrame(imputer.fit_transform(df[vif_features]), columns=vif_features)
else:
    # Nothing usable; the imputer cannot be fitted on zero columns.
    X_num = pd.DataFrame(index=range(len(df)))

# variance_inflation_factor regresses each column on the remaining columns
# exactly as given and does not add an intercept itself. Prepend a column of
# ones so the reported values are the usual (centred) VIFs rather than being
# inflated for every feature whose mean is far from zero.
design = np.column_stack([np.ones(len(X_num)), X_num.to_numpy()])

vif_df = pd.DataFrame({
    "feature": vif_features,
    # Column 0 of `design` is the intercept, so features start at index 1.
    "VIF": [variance_inflation_factor(design, i) for i in range(1, design.shape[1])],
})
vif_df = vif_df.sort_values("VIF", ascending=False)

print("\n===== MULTICOLLINEARITY (VIF > 10 Suggests Redundancy) =====")
print(vif_df[vif_df["VIF"] > 10])

# --- 4. Categorical Cardinality ---
# Number of distinct values per categorical column, largest first. The sort is
# stable, so columns with equal counts keep their original relative order.
unique_counts = [(name, df[name].nunique()) for name in categorical_cols]
unique_counts.sort(key=lambda item: item[1], reverse=True)
cat_cardinality = dict(unique_counts)

print("\n===== HIGH-CARDINALITY CATEGORICAL FEATURES =====")
for col, count in cat_cardinality.items():
    # Only columns with more than 20 distinct values are reported.
    if count <= 20:
        continue
    print(f"{col}: {count} unique categories")

# Optional: plot correlation heatmap
# sns.heatmap(correlation_matrix, cmap='coolwarm', annot=False)
# plt.title("Correlation Matrix")
# plt.show()