# Dataset Preprocessing - Kaggle Environment

This notebook demonstrates the complete pipeline for:
1. Locating the Edge-IIoT dataset (already available in the Kaggle environment)
2. Loading and analyzing the datasets
3. Data cleaning and validation
4. Merging datasets and organizing by device
5. Exporting processed data for streaming

Dataset: [Edge-IIoT Set Dataset](https://www.kaggle.com/datasets/sibasispradhan/edge-iiotset-dataset)

The Edge-IIoT dataset contains sensor data from IoT edge devices for anomaly detection research.

## 1. Install required packages

In [None]:
!pip install --quiet pandas numpy python-dotenv

In [None]:
import os
import json
import time
from datetime import datetime
from pathlib import Path

import pandas as pd
import numpy as np

# Sentinel output: reaching this line means every import above resolved.
print("Libraries imported successfully")

## 2. Load Dataset from Kaggle Input

On Kaggle, a dataset that has been added to the notebook is automatically available under `/kaggle/input/`.

In [None]:
# Location of the Edge-IIoT dataset inside the Kaggle input area.
data_dir = Path('/kaggle/input/edge-iiotset-dataset')

# When the directory is present, list its CSV files with their sizes;
# otherwise tell the user how to make the dataset available.
if not data_dir.exists():
    print(f"ERROR: Dataset not found at {data_dir}")
    print("Make sure the Edge-IIoT dataset is added to your Kaggle notebook.")
else:
    print(f"Dataset directory found: {data_dir}")
    print(f"\nFiles in dataset:")
    bytes_per_mb = 1024 * 1024
    for file in sorted(data_dir.glob('*.csv')):
        size_mb = file.stat().st_size / bytes_per_mb
        print(f"  {file.name} ({size_mb:.2f} MB)")

## 3. Load and Explore Datasets

Load all CSV files and analyze their structure and content.

In [None]:
# Collect the CSV files in sorted order so every run loads them identically.
csv_files = sorted(data_dir.glob('*.csv'))
print(f"Found {len(csv_files)} CSV files\n")

# Fail fast: with nothing to load, the cleaning and merge cells below would
# only fail later with errors that do not point at the real cause.
if not csv_files:
    raise FileNotFoundError(
        f"No CSV files found in {data_dir}. "
        "Make sure the Edge-IIoT dataset is added to your Kaggle notebook."
    )

# Maps file stem (name without extension) -> raw DataFrame.
datasets = {}
for csv_file in csv_files:
    print(f"Loading {csv_file.name}...")
    # low_memory=False makes pandas infer each column's dtype from the whole
    # file instead of chunk by chunk, which avoids mixed-type object columns.
    df = pd.read_csv(csv_file, low_memory=False)
    datasets[csv_file.stem] = df
    print(f"  Shape: {df.shape}")
    print(f"  Columns: {len(df.columns)}")
    print(f"  Memory: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB\n")

print(f"Loaded {len(datasets)} datasets")

In [None]:
# Print a structural overview of each loaded dataset: dimensions, dtypes,
# a preview of the first rows, and the columns that contain nulls.
separator = '=' * 60
for name, df in datasets.items():
    n_rows, n_cols = df.shape
    print(f"\n{separator}")
    print(f"Dataset: {name}")
    print(f"{separator}")
    print(f"\nDimensions: {n_rows} rows x {n_cols} columns")
    print(f"\nData types:")
    print(df.dtypes)
    print(f"\nFirst few rows:")
    print(df.head())
    print(f"\nMissing values:")
    null_counts = df.isnull().sum()
    # Counts are non-negative, so "any non-zero" is the same as "sum != 0".
    print(null_counts[null_counts > 0] if null_counts.any() else "None")

## 4. Data Cleaning

**Remove duplicate rows, drop rows that contain missing values, and convert text columns to numeric types where possible.**

In [None]:
def clean_dataset(df, name):
    """Clean one raw dataset and tag it with its source name.

    Steps, in order: drop exact duplicate rows, drop rows containing any
    missing value, convert text columns to numeric when *every* value in the
    column parses as a number, and add a ``dataset_source`` column.

    Args:
        df: Raw DataFrame to clean. It is not modified.
        name: Label written to the ``dataset_source`` column and used in the
            progress messages.

    Returns:
        A new DataFrame with a fresh 0..n-1 index.
    """
    print(f"Cleaning {name}...")

    initial_rows = len(df)

    # Remove duplicates
    df_clean = df.drop_duplicates().reset_index(drop=True)
    duplicates = initial_rows - len(df_clean)
    if duplicates > 0:
        print(f"  Removed {duplicates} duplicate rows")

    # Drop rows with any missing values
    rows_before = len(df_clean)
    df_clean = df_clean.dropna().reset_index(drop=True)
    missing_removed = rows_before - len(df_clean)
    if missing_removed > 0:
        print(f"  Removed {missing_removed} rows with missing values")

    # Convert text columns to numeric, but only when nothing is lost.
    converted_success = 0
    for col in df_clean.columns:
        series = df_clean[col]
        # Accept both classic object columns and pandas string dtypes.
        is_text = (
            pd.api.types.is_object_dtype(series)
            or pd.api.types.is_string_dtype(series)
        )
        if series.empty or not is_text:
            continue
        try:
            converted = pd.to_numeric(series, errors='coerce')
        except (TypeError, ValueError):
            # Values that cannot be parsed at all (e.g. nested objects):
            # leave the column exactly as it was.
            continue
        # Rows with missing values were dropped above, so a NaN here marks a
        # value that could not be represented as a number. Converting such a
        # column would silently erase its non-numeric entries, so keep it.
        if converted.notna().all():
            df_clean[col] = converted
            converted_success += 1
    if converted_success > 0:
        print(f"  Converted {converted_success} object columns to numeric")

    # Add source dataset indicator
    df_clean['dataset_source'] = name

    # Guard the percentage against an empty input frame.
    retained_pct = 100 * len(df_clean) / initial_rows if initial_rows else 0.0
    print(f"  Final: {len(df_clean):,} rows ({retained_pct:.1f}% retained)\n")
    return df_clean

# Run the cleaning routine over every loaded dataset, keyed by dataset name.
cleaned_datasets = {
    source_name: clean_dataset(raw_frame, source_name)
    for source_name, raw_frame in datasets.items()
}

## 5. Merge Datasets

Combine all cleaned datasets into a single dataframe.

In [None]:
print("Merging datasets...")

# Both the column comparison and the concatenation need at least one frame;
# stop with a clear message instead of an IndexError further down.
if not cleaned_datasets:
    raise ValueError("No cleaned datasets to merge; load and clean the data first")

column_sets = [set(frame.columns) for frame in cleaned_datasets.values()]

# Analyze columns across datasets
all_columns = set().union(*column_sets)

print(f"Total unique columns across all datasets: {len(all_columns)}")

# Find common columns
common_cols = set.intersection(*column_sets)

print(f"Columns in all datasets: {len(common_cols)}")
print(f"Common columns: {sorted(common_cols)}\n")

# Merge all datasets; columns absent from a dataset are filled with NaN.
df_merged = pd.concat(list(cleaned_datasets.values()), ignore_index=True, sort=False)

print(f"Merged dataset shape: {df_merged.shape}")
print(f"Total rows: {len(df_merged):,}")
print(f"Total columns: {len(df_merged.columns)}")

print(f"\nDataset sources distribution:")
print(df_merged['dataset_source'].value_counts())

# Calculate data sparsity (share of cells that are null)
total_cells = df_merged.shape[0] * df_merged.shape[1]
non_null = df_merged.notna().sum().sum()
# An empty merged frame has no cells; report 0% rather than dividing by zero.
sparsity = (1 - non_null / total_cells) * 100 if total_cells else 0.0

print(f"\nData quality metrics:")
print(f"  Total cells: {total_cells:,}")
print(f"  Non-null cells: {non_null:,}")
print(f"  Sparsity: {sparsity:.1f}%")

print(f"\nMissing values per column (top 10):")
missing_per_col = df_merged.isnull().sum().sort_values(ascending=False)
print(missing_per_col.head(10))

## 6. Device Identification and Grouping

In [None]:
# Check for existing device/identifier columns
print("Looking for device identifier columns...")
device_col_candidates = ['device_id', 'Device_ID', 'DeviceID', 'device', 'Device', 'id', 'ID', 'src_ip', 'dst_ip', 'ip.src', 'ip.dst']

# Pick the first candidate that exists and holds at least one non-null value.
device_col = None
for col in device_col_candidates:
    if col in df_merged.columns and df_merged[col].notna().any():
        device_col = col
        print(f"Found column: {col}")
        break

if device_col is None:
    print("No device identifier column found")
    print(f"Using dataset source + random grouping instead\n")

    # Synthetic IDs: each row's position within its dataset source, taken
    # modulo num_devices, so rows are spread round-robin across the IDs.
    num_devices = max(5, len(df_merged) // 1000)
    df_merged['device_id'] = df_merged.groupby('dataset_source').cumcount() % num_devices
    print(f"Created {num_devices} synthetic device IDs per dataset")
else:
    print(f"Using existing device column: {device_col}")
    if device_col != 'device_id':
        # 'device_id' is the first candidate, so if it exists but was not
        # chosen it holds no values. Drop it first; renaming onto it would
        # leave two columns with the same name.
        if 'device_id' in df_merged.columns:
            df_merged = df_merged.drop(columns='device_id')
        df_merged = df_merged.rename(columns={device_col: 'device_id'})

print(f"\nDevices distribution:")
print(df_merged['device_id'].value_counts().sort_index())

print(f"\nDevice counts by dataset:")
print(df_merged.groupby(['dataset_source', 'device_id']).size().unstack(fill_value=0))

In [None]:
print("Grouping data by device...")

# dropna=False keeps rows whose device_id is missing (they form their own
# group, keyed 'nan'); by default groupby would silently leave them out of
# the per-device data. Keys are stringified so they can be used in file names.
device_groups = {
    str(device_id): group.copy(deep=False)
    for device_id, group in df_merged.groupby('device_id', sort=False, dropna=False)
}

print(f"Created {len(device_groups)} device groups\n")

print("Device group statistics:")
print("-" * 60)
# Sort by the (string) device key only.
for device_id in sorted(device_groups):
    group = device_groups[device_id]
    print(f"Device {device_id}: {len(group)} rows")

print(f"\nData grouped by device")

## 7. Export Results

Save preprocessed data and statistics to Kaggle output directory.

In [None]:
# In Kaggle, save outputs to /kaggle/working/
output_dir = Path('/kaggle/working/edge_iiot_processed')
# parents=True so the cell also works when the parent directory is absent.
output_dir.mkdir(parents=True, exist_ok=True)

print(f"Exporting processed data to {output_dir}\n")

# Export merged dataset
merged_file = output_dir / 'merged_data.csv'
df_merged.to_csv(merged_file, index=False)
print(f"Merged data: {merged_file}")
print(f"  Size: {merged_file.stat().st_size / 1024**2:.2f} MB")

# Export device-specific files (one CSV per entry in device_groups)
print(f"\nDevice files:")
for device_id, df_device in device_groups.items():
    device_file = output_dir / f'device_{device_id}.csv'
    df_device.to_csv(device_file, index=False)
    print(f"  device_{device_id}.csv ({len(df_device)} rows)")

# Export summary. Counts are converted to built-in ints so json.dump never
# receives NumPy scalar types.
source_counts = {
    str(source): int(count)
    for source, count in df_merged['dataset_source'].value_counts().items()
}
summary = {
    'total_records': len(df_merged),
    'total_devices': len(device_groups),
    'total_columns': len(df_merged.columns),
    'dataset_sources': source_counts,
    'processing_timestamp': datetime.now().isoformat()
}

summary_file = output_dir / 'processing_summary.json'
with open(summary_file, 'w', encoding='utf-8') as f:
    json.dump(summary, f, indent=2)

print(f"\nProcessing summary:")
for key, value in summary.items():
    print(f"  {key}: {value}")

print(f"\nExport complete!")

## Summary

This notebook completed the following steps:

1. Located the Edge-IIoT dataset in the Kaggle environment
2. Loaded and analyzed every CSV file found in the dataset directory
3. Cleaned data by removing duplicates and dropping rows with missing values
4. Merged datasets into a consolidated dataframe
5. Organized data by device identifier
6. Exported the processed data as CSV files plus a JSON summary

### Output files
- `merged_data.csv` - Complete merged dataset
- `device_*.csv` - Per-device data files  
- `processing_summary.json` - Processing statistics

All outputs are saved to `/kaggle/working/edge_iiot_processed/` and ready for download.