# Exploratory Data Analysis for HealthTimeSeriesAI

This notebook performs exploratory data analysis on the time series dataset (`data/raw/sample_dataset.csv`) to understand its structure, visualize patterns, and identify characteristics like sparsity or class imbalance.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

## Load Dataset
Load the sample time series dataset with 187 time steps and a binary label.

In [None]:
# Load the raw CSV; the path is relative to the directory the notebook runs in.
df = pd.read_csv('../data/raw/sample_dataset.csv')

# Feature columns are the string names '1'..'187' (one per time step);
# the target is read from the 'Label' column.
feature_cols = [str(i) for i in range(1, 188)]
label_col = 'Label'

# Fail fast with a readable message if the file does not have the expected
# columns. Without this check the first df[feature_cols] lookup in a later
# cell raises a long KeyError far away from the real cause.
missing_cols = [col for col in feature_cols + [label_col] if col not in df.columns]
if missing_cols:
    raise ValueError(
        f'Dataset is missing {len(missing_cols)} expected column(s), '
        f'e.g. {missing_cols[:5]}'
    )

print('Dataset Shape:', df.shape)
print('First few rows:')
df.head()

## Basic Statistics
Compute summary statistics for the features and check for missing values.

In [None]:
# describe() summary restricted to the feature columns.
feature_summary = df[feature_cols].describe()
print('Summary Statistics:')
print(feature_summary)

# isnull() flags each missing cell; the first sum() counts per column and the
# second collapses those counts into a single total for the whole frame.
total_missing = df.isnull().sum().sum()
print('\nMissing Values:')
print(total_missing)

## Class Distribution
Analyze the distribution of the binary labels to check for class imbalance.

In [None]:
# Bar chart with one bar per distinct label value, showing its row count.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x=label_col, data=df, ax=ax)
ax.set_title('Class Distribution')
ax.set_xlabel('Label')
ax.set_ylabel('Count')
fig.savefig('class_distribution.png')  # saved relative to the working directory
plt.show()

# The same information as exact numbers.
class_counts = df[label_col].value_counts()
print('Class Distribution:')
print(class_counts)

## Visualize Time Series
Plot a few time series sequences to understand their patterns.

In [None]:
# Plot the first few rows as line charts against a numeric time axis.
# Passing the string column names as x makes matplotlib treat the axis as
# categorical and draw one tick label per column (187 overlapping labels),
# so integer positions 1..N are used instead.
time_steps = np.arange(1, len(feature_cols) + 1)
sequences = df[feature_cols]  # select the feature columns once, not per iteration
n_shown = min(3, len(df))  # Plot up to 3 sequences

fig, ax = plt.subplots(figsize=(12, 6))
for idx in range(n_shown):
    ax.plot(
        time_steps,
        sequences.iloc[idx].to_numpy(),
        label=f'Sample {idx+1} (Label: {df[label_col].iloc[idx]})',
    )
ax.set_title('Sample Time Series Sequences')
ax.set_xlabel('Time Step')
ax.set_ylabel('Value')
if n_shown:
    # Only build a legend when at least one line was drawn; an empty frame
    # would otherwise trigger a "no artists with labels" warning.
    ax.legend()
fig.savefig('time_series_samples.png')
plt.show()

## Sparsity Analysis
Check for sparsity in the time series by measuring the proportion of values that are exactly zero at each time step.

In [None]:
# Fraction of rows whose value is exactly 0, computed per feature column.
sparsity = (df[feature_cols] == 0).mean()

# Use integer positions 1..N on the x-axis. Passing the string column names
# would make matplotlib build a categorical axis with one tick label per
# column, which is unreadable for this many time steps.
time_steps = np.arange(1, len(feature_cols) + 1)

fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(time_steps, sparsity.to_numpy())
ax.set_title('Sparsity Across Time Steps')
ax.set_xlabel('Time Step')
ax.set_ylabel('Proportion of Zero Values')
fig.savefig('sparsity_analysis.png')
plt.show()

# Mean of the per-column zero fractions.
print('Average Sparsity:', sparsity.mean())

## Correlation Analysis
Examine correlations between time steps to identify temporal dependencies.

In [None]:
# Pairwise correlation between all feature columns (DataFrame.corr defaults).
correlation_matrix = df[feature_cols].corr()

fig, ax = plt.subplots(figsize=(10, 8))
# vmin/vmax pin the colour scale to the full [-1, 1] correlation range.
sns.heatmap(correlation_matrix, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title('Correlation Matrix of Time Steps')
fig.savefig('correlation_matrix.png')
plt.show()