In [None]:
# ---------------------------
# Medical Charges – Quick EDA
# pandas, matplotlib, seaborn
# ---------------------------

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# 1. Load dataset
# The path is relative, so it resolves against the current working directory.
df = pd.read_csv("../data/insurance.csv")

# 2. Basic info
print("\n--- Dataset shape ---")
print(df.shape)

print("\n--- Data types & non-null counts ---")
# DataFrame.info() writes its report straight to stdout and returns None;
# wrapping it in print() would append a stray "None" line to the output.
df.info()

print("\n--- Missing values ---")
# Per-column count of missing entries.
print(df.isna().sum())

print("\n--- Descriptive statistics ---")
# Transposed so each column of the dataset becomes one row of the summary.
print(df.describe().T)

# 3. Target distribution
# Histogram (30 bins) of the "charges" column with a KDE curve overlaid.
charges_ax = sns.histplot(df["charges"], bins=30, kde=True)
charges_ax.set_title("Distribution of Medical Charges")
plt.tight_layout()
plt.show()

# 4. Categorical features vs charges
# One box plot of charges per categorical column, side by side in a single row.
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(16, 4))
for panel, column in zip(ax, ("sex", "smoker", "region")):
    sns.boxplot(data=df, x=column, y="charges", ax=panel)
fig.suptitle("Charges by Categorical Variables")
fig.tight_layout()
plt.show()

# 5. Correlation heatmap
# Pairwise correlations restricted to the numeric columns of the dataset.
corr = df.corr(numeric_only=True)
heatmap_style = {"annot": True, "fmt": ".2f", "cmap": "coolwarm"}
corr_ax = sns.heatmap(corr, **heatmap_style)
corr_ax.set_title("Correlation Matrix")
plt.tight_layout()
plt.show()

# 6-7. Age vs charges, then BMI vs charges (smoker overlay)
# Both scatter plots use the same settings; only the x column and the title
# differ, so they are drawn from one loop, each in its own figure.
for feature, heading in (
    ("age", "Age vs Charges (Smoking Status)"),
    ("bmi", "BMI vs Charges (Smoking Status)"),
):
    sns.scatterplot(data=df, x=feature, y="charges", hue="smoker", alpha=0.7)
    plt.title(heading)
    plt.tight_layout()
    plt.show()

# 8. Children count & charges
# Left panel: how many rows fall under each "children" value.
# Right panel: spread of charges for each "children" value.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
count_panel, box_panel = ax
sns.countplot(data=df, x="children", ax=count_panel)
count_panel.set_title("Number of Children")
sns.boxplot(data=df, x="children", y="charges", ax=box_panel)
box_panel.set_title("Charges by Children Count")
fig.tight_layout()
plt.show()