# 01_EDA_Statistics
This notebook performs exploratory data analysis and basic cleaning.

In [None]:
# Common imports for the project
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's 'whitegrid' style for the plots drawn in this notebook.
sns.set(style='whitegrid')
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline


In [None]:
# Load the merged dataset; the following cells of this notebook read `df`.
data_path = '../data/merged_data.csv'
if not os.path.exists(data_path):
    # Fail fast: merely printing a message would leave `df` undefined (or stale
    # from an earlier run in the same kernel), so later cells would either die
    # with an unrelated NameError or silently analyse old data.
    raise FileNotFoundError(
        f'File not found: {data_path}. Please place your merged_data.csv in data/'
    )

df = pd.read_csv(data_path)
print('Loaded', df.shape)
display(df.head())

In [None]:
# Print column names, dtypes, non-null counts and memory usage of the loaded frame.
df.info()

In [None]:
# Summary statistics over all columns (numeric and non-numeric alike),
# transposed so that each original column becomes one row.
summary_stats = df.describe(include='all')
display(summary_stats.T)

In [None]:
# Share of missing values per column, ranked from most to least incomplete.
null_fraction = df.isna().mean()
ranked = null_fraction.sort_values(ascending=False)
# Keep only the columns that actually have gaps, then show at most 30 of them.
missing = ranked.loc[ranked > 0]
display(missing.iloc[:30])

In [None]:
# Class balance of the `default` column: relative frequencies plus a count plot.
if 'default' not in df.columns:
    print('No column named default found. Check your dataset.')
else:
    class_share = df['default'].value_counts(normalize=True)
    display(class_share)
    ax = sns.countplot(data=df, x='default')
    ax.set_title('Default class distribution')

In [None]:
# Rank numeric columns by absolute correlation with `default`, then plot the
# pairwise correlation matrix of the 20 highest-ranked columns.
num = df.select_dtypes(include=[np.number]).copy()
if 'default' not in num.columns:
    # Mirrors the guard in the class-distribution cell: without a numeric
    # `default` column, `num.corr()['default']` would raise a KeyError.
    print('No numeric column named default found. Check your dataset.')
else:
    # `default` is part of `num`, so it appears in its own ranking and
    # therefore also in the heatmap below.
    corr = num.corr()['default'].abs().sort_values(ascending=False).head(20)
    display(corr)
    # Explicit figure/axes so the heatmap and its title target the same axes
    # and no stray `Text(...)` repr is emitted as the cell's output.
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(num[corr.index].corr(), annot=False, cmap='coolwarm', center=0, ax=ax)
    ax.set_title('Correlation matrix (top features)')

In [None]:
# Persist a reproducible 20% random sample of the data (fixed seed) for quick experiments.
sample_path = '../data/merged_sample.csv'
sample_df = df.sample(frac=0.2, random_state=42)
sample_df.to_csv(sample_path, index=False)
print('Saved sample to', sample_path)