# 1. Data Exploration & Cleaning

This notebook demonstrates basic data ingestion, cleaning, and transformation for the MetalX Smelting project.

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's 'whitegrid' style to the plots drawn later in this notebook.
sns.set(style='whitegrid')

## 1.1 Read Raw Data

In [None]:
RAW_DATA_PATH = '../data/raw/'
PROCESSED_DATA_PATH = '../data/processed/'

filenames = ['sensor_data_day1.csv', 'sensor_data_day2.csv']

# Read every raw CSV (parsing `timestamp` as datetime) and stack the frames
# into one DataFrame with a fresh 0..n-1 index.
df_list = [
    pd.read_csv(os.path.join(RAW_DATA_PATH, fname), parse_dates=['timestamp'])
    for fname in filenames
]

raw_data = pd.concat(df_list, ignore_index=True)
raw_data.head()

## 1.2 Basic Exploration

- Check for missing values
- Describe statistical distribution

In [None]:
# Number of missing (NaN/NaT) entries in each column.
print('Missing values per column:')
missing_counts = raw_data.isna().sum()
print(missing_counts)

# Descriptive statistics from DataFrame.describe(), shown as a rich table.
print('\nStatistical summary:')
summary_stats = raw_data.describe()
display(summary_stats)

## 1.3 Detect Outliers (Naive Approach)
We'll flag a reading as an outlier if its voltage, current, or temperature deviates from that column's mean by more than 2.5 standard deviations.

> Note: In a real environment, you'd have domain-specific thresholds or advanced anomaly detection.

In [None]:
# Naive mean +/- k*std outlier detector
def mark_outliers(df, col, z_thresh=2.5):
    """Return a boolean mask of rows whose `col` value lies far from the column mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to inspect.
    col : column label
        Column of `df` to test.
    z_thresh : float, default 2.5
        Number of standard deviations from the mean beyond which a value
        is flagged.

    Returns
    -------
    pandas.Series
        Boolean Series indexed like `df[col]`: True where the value is strictly
        below ``mean - z_thresh * std`` or strictly above
        ``mean + z_thresh * std``.
    """
    values = df[col]
    center = values.mean()
    spread = z_thresh * values.std()
    too_low = values.lt(center - spread)
    too_high = values.gt(center + spread)
    return too_low | too_high

# Work on a copy so raw_data itself is left untouched
clean_data = raw_data.copy()
clean_data['is_outlier'] = False

# A row is flagged as soon as any one of the monitored columns is out of range.
monitored_cols = ['voltage', 'current', 'temperature']
for col_name in monitored_cols:
    flagged = mark_outliers(clean_data, col_name, z_thresh=2.5)
    clean_data['is_outlier'] = clean_data['is_outlier'] | flagged

n_flagged = clean_data['is_outlier'].sum()
print(f"Total outliers found: {n_flagged}")

### 1.3.1 Remove or Keep Outliers?
For demonstration, we'll keep the outliers but leave them flagged in the `is_outlier` column. In a real pipeline, you might remove them or handle them case-by-case.

In [None]:
# One histogram panel (with KDE overlay) per sensor column, coloured by the outlier flag
plot_cols = ['voltage', 'current', 'temperature']
fig, axs = plt.subplots(1, 3, figsize=(15, 4))
for ax, col_name in zip(axs, plot_cols):
    sns.histplot(data=clean_data, x=col_name, hue='is_outlier', ax=ax, kde=True)
    ax.set_title(f"Distribution of {col_name}")
plt.tight_layout()
plt.show()

## 1.4 Save Processed Data
We'll split the cleaned data back into one file per day to mimic a daily pipeline run.

In [None]:
# Split the flagged data by the day-of-month of each timestamp.
# NOTE(review): this assumes the day1/day2 raw files hold readings from the
# 1st and 2nd of the month — confirm against the actual data.
timestamp_day = clean_data['timestamp'].dt.day
day1 = clean_data[timestamp_day == 1]
day2 = clean_data[timestamp_day == 2]

# Rows with a missing timestamp, or one on any other day of the month, end up
# in neither file; report them instead of dropping them silently.
n_unassigned = len(clean_data) - len(day1) - len(day2)
if n_unassigned:
    print(f"Warning: {n_unassigned} rows do not fall on day 1 or 2 and were not saved.")

# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before writing (no-op when it is already there).
os.makedirs(PROCESSED_DATA_PATH, exist_ok=True)

day1_path = os.path.join(PROCESSED_DATA_PATH, 'sensor_data_day1_cleaned.csv')
day2_path = os.path.join(PROCESSED_DATA_PATH, 'sensor_data_day2_cleaned.csv')

# index=False: the positional index carries no information worth persisting.
day1.to_csv(day1_path, index=False)
day2.to_csv(day2_path, index=False)

print('Processed data saved successfully!')