# Data Processing Template

**Purpose**: [Describe what this notebook does]

**Input**: [Source data files]

**Output**: [What this notebook produces]

**Date**: [YYYY-MM-DD]

## 1. Setup and Imports

In [None]:
import pandas as pd
import numpy as np
import json
from pathlib import Path

# For visualizations (optional)
import matplotlib.pyplot as plt
import seaborn as sns

# Plot defaults: seaborn 'whitegrid' theme and a (12, 6) default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Pandas display limits: at most 100 rows, no cap on columns
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)

## 2. Load Raw Data

In [None]:
# Folder holding the raw input files; the relative path is resolved against
# the current working directory.
raw_data_path = Path('../raw-data')

# Example: Load CSV
# df = pd.read_csv(raw_data_path / 'your_data.csv')

# Example: Load Excel (.xlsx reading needs an Excel engine such as openpyxl)
# df = pd.read_excel(raw_data_path / 'your_data.xlsx', sheet_name='Sheet1')

# Display basic info: (rows, columns) and the first rows
# print(f"Shape: {df.shape}")
# df.head()

## 3. Data Exploration and Quality Checks

In [None]:
# Check for missing values
# df.isnull().sum()

# Check data types
# df.dtypes

# Basic statistics
# df.describe()

# Check for duplicates
# df.duplicated().sum()

## 4. Data Cleaning and Transformation

In [None]:
# Example transformations:

# Remove duplicates
# df_clean = df.drop_duplicates()

# Handle missing values
# df_clean = df_clean.dropna(subset=['important_column'])
# df_clean['column'] = df_clean['column'].fillna(0)

# Convert data types
# df_clean['date'] = pd.to_datetime(df_clean['date'])
# df_clean['value'] = df_clean['value'].astype(float)

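# Create new columns (derive 'year' first so the filter below can use it)
# df_clean['year'] = df_clean['date'].dt.year

# Filter data (.copy() keeps later column assignments from warning about a slice)
# df_clean = df_clean[df_clean['year'] >= 2020].copy()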

## 5. Save Intermediate Results

In [None]:
# Save cleaned data to intermediate folder.
# mkdir(exist_ok=True) does nothing if the folder already exists; it does not
# create missing parent folders.
intermediate_path = Path('../intermediate')
intermediate_path.mkdir(exist_ok=True)

# index=False keeps the DataFrame index out of the CSV file
# df_clean.to_csv(intermediate_path / 'cleaned_data.csv', index=False)
# print(f"Saved cleaned data: {df_clean.shape}")

## 6. Aggregate for Visualization

In [None]:
# Example aggregations:

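# Group by category (named aggregation keeps flat column names:
# 'category', 'value', 'mean', 'count')
# summary = df_clean.groupby('category').agg(
#     value=('value', 'sum'),
#     mean=('value', 'mean'),
#     count=('value', 'count'),
# ).reset_index()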

# Time series aggregation
# time_series = df_clean.groupby('date').agg({
#     'value': 'sum'
# }).reset_index()

# Top N items
# top_10 = df_clean.nlargest(10, 'value')

## 7. Create Reduced Dataset for D3.js

In [None]:
# Prepare data for web visualization
# Keep only necessary columns
# viz_data = summary[['category', 'value', 'count']].copy()

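# Convert dates to ISO format strings (only if viz_data has a datetime column,
# e.g. when it is built from time_series rather than the category summary)
# viz_data['date'] = viz_data['date'].dt.strftime('%Y-%m-%d')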

# Round numbers to reduce file size
# viz_data['value'] = viz_data['value'].round(2)

# Check size
# print(f"Rows in viz dataset: {len(viz_data)}")
# print(f"Columns: {viz_data.columns.tolist()}")

## 8. Save Dataset for Website

In [None]:
# Save to viz-datasets (this will be committed to git)
viz_datasets_path = Path('../viz-datasets')
viz_datasets_path.mkdir(exist_ok=True)

# Convert to JSON: one object per row.
# NOTE: json.dump writes missing values as NaN, which browsers reject as
# invalid JSON -- fill or drop them in viz_data first.
# output_file = viz_datasets_path / 'your_dataset.json'
# viz_data_dict = viz_data.to_dict('records')

# with open(output_file, 'w', encoding='utf-8') as f:
#     json.dump(viz_data_dict, f, indent=2)

# Check file size
# file_size = output_file.stat().st_size / 1024  # KB
# print(f"Saved: {output_file.name}")
# print(f"File size: {file_size:.2f} KB")
# if file_size > 1024:  # 1024 KB = 1 MB
#     print("⚠️ Warning: File is larger than 1MB. Consider further reduction.")

## 9. Create Static Visualizations (Optional)

In [None]:
# Create charts for static images
# fig, ax = plt.subplots(figsize=(12, 6))
# viz_data.plot(x='category', y='value', kind='bar', ax=ax)
# plt.title('Your Chart Title')
# plt.xlabel('Category')
# plt.ylabel('Value')
# plt.tight_layout()
# plt.show()

# Save to output folder
# output_path = Path('../output')
# output_path.mkdir(exist_ok=True)
# fig.savefig(output_path / 'chart.png', dpi=300, bbox_inches='tight')
# print("Chart saved to output folder")

## 10. Summary and Next Steps

In [None]:
# Print summary of what was created
# print("="*50)
# print("PROCESSING COMPLETE")
# print("="*50)
# print(f"Input rows: {len(df)}")
# print(f"Output rows: {len(viz_data)}")
# print(f"Reduction: {(1 - len(viz_data)/len(df))*100:.1f}%")
# print("\nNext steps:")
# print("1. Copy JSON from viz-datasets/ to public/src/assets/data/")
# print("2. Import in Vue component and use with D3.js")
# print("3. (Optional) Copy PNG from output/ to public/src/assets/")