## 1. Import Packages

In [0]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Display charts inline
%matplotlib inline


## 2. Load Dataset

In [0]:


# Path of the raw kidney-disease CSV
file_path = "/dbfs/FileStore/tables/kidney_disease.csv"

# Read the file into a DataFrame, then preview its first five rows
df = pd.read_csv(filepath_or_buffer=file_path)
df.head(n=5)


## 3. Basic Overview of Data

In [0]:
# Dimensions of the frame as a (rows, columns) tuple
frame_shape = df.shape
print("Shape:", frame_shape)

# Column dtypes together with per-column non-null counts
df.info()

# Descriptive statistics for every column, not only the numeric ones
df.describe(include='all')


In [0]:
# Remove the 'id' column. The result is reassigned instead of mutating in
# place, and errors='ignore' makes the cell safe to re-run: once 'id' is gone,
# running it again is a no-op rather than a KeyError.
df = df.drop(columns='id', errors='ignore')


In [0]:
# Count NaN cells per column first, then add the column totals together
missing_per_column = df.isna().sum()
print("Total missing values:", missing_per_column.sum())


## 4. Handle Missing & Inconsistent Values

In [0]:
# Treat "?" as a missing-value marker. The pattern also matches a "?" padded
# with whitespace (e.g. "\t?"), which an exact-match replace would leave behind
# as an ordinary string in the non-numeric columns.
df = df.replace(r'^\s*\?\s*$', np.nan, regex=True)

# Columns that must be numeric; any entry that cannot be parsed becomes NaN
cols_to_convert = ['age', 'bp', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc']
df[cols_to_convert] = df[cols_to_convert].apply(pd.to_numeric, errors='coerce')

# Missing values per column, most incomplete column first
df.isnull().sum().sort_values(ascending=False)


In [0]:
# Fix label issues before looking at the class distribution: trim surrounding
# whitespace and lower-case every label so variants collapse into one class.
raw_labels = df['classification']
trimmed_labels = raw_labels.str.strip()
df['classification'] = trimmed_labels.str.lower()

## 5. Class Distribution

In [0]:
# Bar chart of how many rows fall into each class (CKD vs Not CKD)
class_counts = df['classification'].value_counts()
class_counts.plot.bar(title='Class Distribution')




From the chart, the dataset has roughly 250 `ckd` and 150 `notckd` samples. This is a slight imbalance — not extreme, but it may influence model results. We can address it later.

## 6. Visualizations

In [0]:
# Names of all float64 / int64 columns
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns

# One histogram with a KDE overlay per numeric column, each on its own figure
for col in numerical_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(data=df, x=col, kde=True, ax=ax)
    ax.set_title(f"Distribution of {col}")
    plt.show()


In [0]:
# Write the current state of the frame to CSV, leaving out the index column
cleaned_path = "/dbfs/FileStore/tables/kidney_disease_cleaned.csv"
df.to_csv(cleaned_path, index=False)


In [0]:
# Pairwise correlations, restricted to the numeric columns
corr_matrix = df[numerical_cols].corr()

# Annotated heatmap of the correlation matrix
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Correlation Between Numerical Features")
plt.show()


## Key Insights from EDA
- Dataset contains 400 samples with multiple clinical indicators
- Around 15–20 features contain missing values, which will be imputed later
- Class distribution is slightly imbalanced (~60% CKD, ~40% Not CKD)
- Some numerical features are skewed (e.g., `su`, `bu`, `sc`) and may need transformation
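- Strong correlations among `pcv`, `hemo`, and `rc` – potential key features. Their relationship with `classification` is not shown by the numeric-only heatmap (the label is a string column) and still needs to be checked after encoding it.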
