# StepReaderCNN: Data Exploration

This notebook explores the electrochemical sensor signal dataset.

In [None]:
# Import libraries
import sys
sys.path.append('..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

from src.data.data_loader import SensorDataLoader, create_label_mapping

# Global plot styling for every figure in this notebook: select the
# 'seaborn-v0_8-darkgrid' matplotlib style sheet, then the seaborn 'husl' palette.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
# IPython magic (not plain Python): show matplotlib figures inline in the cell output.
%matplotlib inline

## 1. Load Dataset

In [None]:
# Point the loader at ../TestData and load the files matching '*.csv'.
# The result maps each label to the collection of samples loaded for it.
loader = SensorDataLoader('../TestData')
data_by_label = loader.load_dataset('*.csv')

# Overall file count first, then one line per label in sorted label order.
total_files = sum(map(len, data_by_label.values()))
print(f'Total files: {total_files}')
for label, samples in sorted(data_by_label.items()):
    print(f'{label}: {len(samples)} samples')

## 2. Dataset Summary

In [None]:
# Build the summary table from the loaded samples. Later cells read its
# 'Label', 'Num_Points', 'Current_Mean', 'Current_Std', 'Has_NaN' and
# 'Sampling_Rate_Hz' columns.
# NOTE(review): presumably one row per loaded file - confirm against
# SensorDataLoader.get_dataset_summary.
summary_df = loader.get_dataset_summary(data_by_label)
# Bare last expression so the notebook renders the table as the cell output.
summary_df

## 3. Class Distribution

In [None]:
# Number of summary rows per label, ordered by label.
class_counts = summary_df['Label'].value_counts().sort_index()

# Two panels side by side: absolute counts as bars, the same counts as shares.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
bar_ax, pie_ax = axes

class_counts.plot.bar(ax=bar_ax)
bar_ax.set(title='Class Distribution', xlabel='Particle Size', ylabel='Count')

pie_ax.pie(class_counts, labels=class_counts.index, autopct='%1.1f%%')
pie_ax.set_title('Class Distribution (%)')

plt.tight_layout()
plt.show()

## 4. Signal Visualization

In [None]:
# Plot a few example traces per label, one subplot row per label (labels in
# sorted order). Each sample unpacks as (time, current, filename).
MAX_SIGNALS_PER_LABEL = 3  # upper bound on traces drawn in each subplot
n_labels = len(data_by_label)

if n_labels == 0:
    # plt.subplots() rejects a grid with zero rows, so skip plotting instead
    # of failing when nothing was loaded.
    print('No labelled samples to plot')
else:
    # squeeze=False always yields a 2-D array of Axes, so the single-label
    # case needs no special wrapping.
    fig, axes = plt.subplots(
        n_labels, 1, figsize=(15, 4 * n_labels), squeeze=False
    )

    for ax, (label, samples) in zip(axes[:, 0], sorted(data_by_label.items())):
        for time, current, filename in samples[:MAX_SIGNALS_PER_LABEL]:
            ax.plot(time, current, alpha=0.7, label=filename)
        ax.set_title(f'Sample Signals - {label}')
        ax.set_xlabel('Time (ms)')
        ax.set_ylabel('Current')
        ax.legend()
        ax.grid(alpha=0.3)

    plt.tight_layout()
    plt.show()

## 5. Statistical Analysis

In [None]:
# Per-label statistics: the aggregations to compute for each summary column.
column_stats = {
    'Num_Points': ['mean', 'std', 'min', 'max'],
    'Current_Mean': ['mean', 'std'],
    'Current_Std': 'mean',
}
rows_by_label = summary_df.groupby('Label')
aggregated = rows_by_label.agg(column_stats)
print(aggregated)

## 6. Data Quality Check

In [None]:
# Data-quality report: count the summary rows flagged as containing NaN and
# print the mean of the 'Sampling_Rate_Hz' column over all rows.
print('Data Quality Report:')
print('='*50)
# .eq(True) keeps only rows whose flag equals True, exactly like `== True`.
nan_files = summary_df[summary_df['Has_NaN'].eq(True)]
nan_file_count = len(nan_files)
if nan_file_count > 0:
    print(f'WARNING: {nan_file_count} files with NaN')
else:
    print('No NaN values detected')

# The blank line before the sampling rate is written as an explicit \n escape:
# a single-quoted string literal cannot contain a raw line break.
print(f'\nSampling rate: {summary_df["Sampling_Rate_Hz"].mean():.2f} Hz')
print('='*50)

## 7. Save Report

In [None]:
# Persist the summary table as CSV, without the index column.
report_path = Path('../outputs/reports/data_summary.csv')
# to_csv() does not create missing directories, so make sure the target
# folder exists before writing (no-op when it is already there).
report_path.parent.mkdir(parents=True, exist_ok=True)
summary_df.to_csv(report_path, index=False)
print('Summary saved to outputs/reports/data_summary.csv')