# Exploratory Data Analysis (EDA)

This notebook performs exploratory data analysis (EDA) on the diabetes dataset. The goal is to visualize the data and gain insights into the factors that affect readmission risk.

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set visualization style.
# `sns.set_theme` is the preferred interface; `sns.set` is only an alias for it
# that the seaborn docs flag as a candidate for removal.
sns.set_theme(style='whitegrid')

In [2]:
# Read the preprocessed dataset from disk into a DataFrame
cleaned_data = pd.read_csv(
    filepath_or_buffer='../data/processed/cleaned_data.csv',
)

# Preview the leading rows (5 is the default for `head`)
cleaned_data.head(n=5)

In [3]:
# Compute descriptive statistics for the dataset, then display them
summary_statistics = cleaned_data.describe()
summary_statistics

In [4]:
# Bar chart of how many rows fall into each class of the target (`readmitted`)
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=cleaned_data, x='readmitted', ax=ax)
ax.set_title('Distribution of Readmission')
ax.set_xlabel('Readmitted (1 = Yes, 0 = No)')
ax.set_ylabel('Count')
plt.show()

In [5]:
# Correlation heatmap
# Restrict the frame to numeric/boolean columns before correlating: since
# pandas 2.0, DataFrame.corr() no longer silently drops non-numeric columns and
# raises a ValueError when it meets one. Selecting by dtype keeps this cell
# working on both old and new pandas versions.
numeric_features = cleaned_data.select_dtypes(include=['number', 'bool'])
correlation_matrix = numeric_features.corr()

# Create the figure only after the correlation succeeds, so a failure above
# does not leave an empty figure behind.
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

In [6]:
# Grid of pairwise feature plots, colored by the `readmitted` target
pair_grid = sns.pairplot(data=cleaned_data, hue='readmitted')
plt.show()