# Data Exploration - MedChain-FL

This notebook explores the synthetic thalassemia CBC data.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Apply seaborn's 'whitegrid' theme once so every figure below shares it.
sns.set_style(style='whitegrid')

In [None]:
# Load the CBC export of every hospital and stack them into one DataFrame,
# tagging each row with the hospital it came from.
hospitals = ['italy', 'pakistan', 'usa']
data_frames = []
missing_hospitals = []

for hospital in hospitals:
    path = Path(f'../data/hospital_{hospital}/cbc_data.csv')
    if not path.exists():
        # Keep track of skipped sites instead of dropping them silently, so
        # a partial dataset is visible in the cell output.
        missing_hospitals.append(hospital)
        continue
    df = pd.read_csv(path)
    df['hospital'] = hospital
    data_frames.append(df)

if missing_hospitals:
    print(f"Warning: no CBC file found for: {', '.join(missing_hospitals)}")

if not data_frames:
    # pd.concat raises an opaque "No objects to concatenate" ValueError on an
    # empty list; fail earlier with a message that names the expected files.
    raise FileNotFoundError(
        "No hospital data found; expected files matching "
        "../data/hospital_<name>/cbc_data.csv"
    )

data = pd.concat(data_frames, ignore_index=True)
print(f"Total samples: {len(data)}")
data.head()

In [None]:
# Class distribution: bar chart of how many samples carry each condition label.
plt.figure(figsize=(10, 6))
condition_counts = data['condition'].value_counts()
# pandas draws onto the current figure and hands back the Axes it used.
ax = condition_counts.plot(kind='bar')
ax.set_title('Condition Distribution')
ax.set_xlabel('Condition')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Feature distributions by condition: one panel per CBC feature, with the
# per-condition histograms overlaid on the same axes.
features = ['hb', 'rbc', 'mcv', 'mch', 'mchc', 'rdw']
conditions = ['normal', 'minor', 'major']

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()

for ax, feature in zip(axes, features):
    # Derive one set of bin edges from the pooled values so every condition is
    # binned identically; with per-subset bins the bar widths differ and the
    # overlaid frequencies cannot be compared directly.
    bin_edges = np.histogram_bin_edges(data[feature].dropna(), bins=30)

    for condition in conditions:
        values = data.loc[data['condition'] == condition, feature].dropna()
        ax.hist(values, alpha=0.5, label=condition, bins=bin_edges)

    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
    ax.legend()
    ax.set_title(f'{feature} Distribution')

plt.tight_layout()
plt.show()