# 01. Data Exploration

Initial exploration of the Gas Sensor Array Drift Dataset to understand:
- Dataset structure and dimensions
- Temporal distribution of batches
- Chemical compound distribution
- Sensor response patterns
- Missing values and data quality

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
# Silence all warnings so cell output stays uncluttered.
# NOTE(review): this blanket filter also hides library deprecation and
# runtime warnings; consider narrowing it once real data is loaded.
warnings.filterwarnings(action='ignore')

# Plot appearance: apply the seaborn grid style first, then override the
# Matplotlib defaults for figure size and base font size in one call.
sns.set_style(style='whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 11,
})

## Load Dataset

The Gas Sensor Array Drift Dataset contains:
- 16 metal-oxide chemical sensors, each contributing 8 features (128 features per sample)
- 6 different gas types
- 10 batches collected over 36 months

In [None]:
# TODO: Update with actual data loading code once you have the dataset
# The dataset can be downloaded from UCI Machine Learning Repository
# https://archive.ics.uci.edu/ml/datasets/Gas+Sensor+Array+Drift+Dataset

# Placeholder for data loading
# df = pd.read_csv('../data/raw/sensor_data.csv')
# print(f"Dataset shape: {df.shape}")
# df.head()

## Dataset Overview

In [None]:
# Basic dataset information
# print("\nDataset Info:")
# df.info()  # info() prints its report itself and returns None

# print("\nBasic Statistics:")
# print(df.describe())

## Batch Distribution Over Time

In [None]:
# Analyze temporal distribution of batches
# batch_counts = df.groupby('batch').size()

# fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# # Batch sizes
# batch_counts.plot(kind='bar', ax=ax1)
# ax1.set_title('Number of Samples per Batch')
# ax1.set_xlabel('Batch ID')
# ax1.set_ylabel('Number of Samples')

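# # Timeline
# # First collection month of each batch, per the dataset documentation
# # (batches 1-10 span months 1-36) -- verify against the downloaded README
# batch_timeline = pd.DataFrame({
#     'batch': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
#     'month': [1, 3, 11, 14, 16, 17, 21, 22, 24, 36]
# })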
# ax2.plot(batch_timeline['month'], batch_timeline['batch'], 'o-', markersize=10)
# ax2.set_title('Batch Collection Timeline')
# ax2.set_xlabel('Months from Start')
# ax2.set_ylabel('Batch Number')
# ax2.grid(True, alpha=0.3)

# plt.tight_layout()
# plt.show()

## Chemical Compound Distribution

In [None]:
# Analyze distribution of chemical compounds
# chemical_dist = df.groupby(['batch', 'chemical']).size().unstack(fill_value=0)

# fig, ax = plt.subplots(figsize=(12, 6))
# chemical_dist.plot(kind='bar', stacked=True, ax=ax)
# ax.set_title('Chemical Distribution Across Batches')
# ax.set_xlabel('Batch')
# ax.set_ylabel('Number of Samples')
# ax.legend(title='Chemical', bbox_to_anchor=(1.05, 1), loc='upper left')
# plt.tight_layout()
# plt.show()

## Sensor Response Patterns

In [None]:
# Visualize sensor response patterns
# sensor_cols = [col for col in df.columns if col.startswith('sensor_')]

# # Sample mean responses for first 10 sensors
# fig, axes = plt.subplots(2, 5, figsize=(15, 8))
# axes = axes.flatten()

# for i, sensor in enumerate(sensor_cols[:10]):
#     df.groupby('chemical')[sensor].mean().plot(kind='bar', ax=axes[i])
#     axes[i].set_title(f'{sensor}')
#     axes[i].set_xlabel('')
#     axes[i].tick_params(axis='x', rotation=45)

# plt.suptitle('Mean Sensor Responses by Chemical (First 10 Sensors)', y=1.02)
# plt.tight_layout()
# plt.show()

## Correlation Analysis

In [None]:
# Compute correlation matrix for sensors
# sensor_corr = df[sensor_cols].corr()

# fig, ax = plt.subplots(figsize=(12, 10))
# sns.heatmap(sensor_corr.iloc[:20, :20], cmap='coolwarm', center=0,
#             square=True, ax=ax, cbar_kws={'shrink': 0.8})
# ax.set_title('Sensor Correlation Matrix (First 20 Sensors)')
# plt.tight_layout()
# plt.show()

# # Use only off-diagonal entries; the diagonal is always 1.0 and would pin the maximum
# off_diag = sensor_corr.values[np.triu_indices_from(sensor_corr.values, k=1)]
# print(f"Mean correlation: {off_diag.mean():.3f}")
# print(f"Correlation range: [{off_diag.min():.3f}, {off_diag.max():.3f}]")

## Missing Values Analysis

In [None]:
# Check for missing values
# missing = df.isnull().sum()
# if missing.any():
#     print("Missing values per column:")
#     print(missing[missing > 0].sort_values(ascending=False))
#     
#     # Visualize missing pattern
#     fig, ax = plt.subplots(figsize=(12, 6))
#     sns.heatmap(df.isnull(), cbar=True, yticklabels=False, ax=ax)
#     ax.set_title('Missing Value Pattern')
#     plt.show()
# else:
#     print("No missing values found in the dataset!")

## Initial Drift Visualization

In [None]:
# Visualize drift for a specific sensor across batches
# fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# for i, sensor in enumerate(sensor_cols[:3]):
#     for batch in df['batch'].unique():
#         batch_data = df[df['batch'] == batch][sensor]
#         axes[i].hist(batch_data, alpha=0.5, label=f'Batch {batch}', bins=30)
#     
#     axes[i].set_title(f'{sensor} Distribution Across Batches')
#     axes[i].set_xlabel('Sensor Response')
#     axes[i].set_ylabel('Frequency')
#     axes[i].legend()

# plt.suptitle('Sensor Drift Visualization', y=1.02)
# plt.tight_layout()
# plt.show()

## Summary Statistics by Batch

In [None]:
# Compute summary statistics for each batch
# for batch in sorted(df['batch'].unique()):
#     batch_data = df[df['batch'] == batch]
#     print(f"\nBatch {batch}:")
#     print(f"  - Number of samples: {len(batch_data)}")
#     print(f"  - Number of chemicals: {batch_data['chemical'].nunique()}")
#     print(f"  - Mean sensor response: {batch_data[sensor_cols].mean().mean():.3f}")
#     print(f"  - Std sensor response: {batch_data[sensor_cols].std().mean():.3f}")

## Key Observations

Document your findings here:
- Dataset dimensions and structure
- Temporal distribution of batches
- Evidence of sensor drift
- Correlation patterns among sensors
- Data quality issues (if any)

In [None]:
# Save processed data for next notebooks
# df.to_csv('../data/processed/sensor_data_cleaned.csv', index=False)
# print("Cleaned data saved to data/processed/")