# Create Weather DataFrame 2020 with Dask

Creates a wide-format dataframe with weather data from 2020 including TOBS, TMAX, TMIN, PRCP, SNOW, SNWD measurements.
Each row represents a station-year-measurement combination with 365 daily value columns.


In [None]:
import s3fs
import dask.dataframe as dd
import pandas as pd
import numpy as np
from datetime import datetime
from weather_transformation import transform_long_to_wide, display_dataframe_info, save_wide_dataframe

# Import cluster management utilities
from cluster_utils import setup_dask_cluster

# Setup Dask cluster with 20 workers (4GB of memory requested per worker).
# The dashboard and worker ports are pinned to fixed values (8790 / 8791).
# NOTE(review): cleanup_summary is not referenced by any later cell visible
# in this notebook; cluster and client are passed to close_dask_cluster at
# the end.
cluster, client, cleanup_summary = setup_dask_cluster(
    n_workers=20,
    memory_per_worker='4GB',
    dashboard_port=8790,
    worker_port=8791
)

# Configure dask to use the cluster for all operations
import dask
dask.config.set({'scheduler': 'distributed'})

# Note: the loading cell iterates range(2020, 2021) over 6 measurements,
# i.e. 1 year x 6 measurements = 6 data collections.
# NOTE(review): the 10-30 minute estimate below was presumably written for a
# multi-year run; re-measure for the single-year case.
# Estimated processing time: 10-30 minutes depending on data availability
print(f"Starting weather data processing at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")


In [None]:
# Load weather data for year 2020 using Dask.
# anon=True: connect to S3 without credentials.
s3 = s3fs.S3FileSystem(anon=True)
bucket_path = 's3://noaa-ghcn-pds/parquet/by_year/'
measurements = ['TOBS', 'TMAX', 'TMIN', 'PRCP', 'SNOW', 'SNWD']

# Collect the parquet files of every YEAR=<year>/ELEMENT=<measurement>/
# partition directory (this may take a few minutes).
all_files = []
print("Collecting file paths...")
for year in range(2020, 2021):
    for measurement in measurements:
        file_path = f"{bucket_path}YEAR={year}/ELEMENT={measurement}/"
        try:
            files = s3.glob(f"{file_path}*.parquet")
        except Exception as exc:
            # Listing stays best-effort (one unreadable partition must not
            # abort the whole run), but the failure is reported instead of
            # being silently dropped. Catching Exception rather than using a
            # bare except lets KeyboardInterrupt / SystemExit propagate.
            print(f"  Warning: could not list {file_path}: {exc!r}")
            continue
        if not files:
            # Make missing partitions visible; otherwise the measurement
            # would simply be absent from the result without any hint.
            print(f"  Warning: no parquet files found under {file_path}")
        # Each path returned by glob is given the s3:// scheme prefix
        # before being handed to Dask.
        all_files.extend(f"s3://{f}" for f in files)
    if year % 10 == 0:  # Progress indicator
        print(f"  Processed years up to {year}...")

print(f"Found {len(all_files)} parquet files to process")

# Fail early with a clear message rather than letting read_parquet choke on
# an empty path list.
if not all_files:
    raise FileNotFoundError(
        f"No parquet files found under {bucket_path} for measurements {measurements}"
    )

# Build the Dask dataframe. read_parquet itself is lazy; the two statistics
# below each trigger a computation on the cluster.
df_long = dd.read_parquet(all_files, storage_options={'anon': True})
n_rows = len(df_long)
n_stations = df_long['ID'].nunique().compute()
print(f"Loaded {n_rows:,} rows from {n_stations:,} stations")


In [None]:
# Preview the first rows of the loaded dataframe; as the last expression of
# the cell, the result is rendered by the notebook.
df_long.head()


In [None]:
# Transform to wide format using external function
# This replaces the manual transformation code with a reusable function
# NOTE(review): aggfunc='mean' and fill_missing_days=True are passed straight
# to weather_transformation.transform_long_to_wide; presumably duplicate
# observations are averaged and absent days are filled in -- confirm against
# the helper, which is not visible here.
df_wide = transform_long_to_wide(df_long, aggfunc='mean', fill_missing_days=True)


In [None]:

# Display information about the transformed dataframe
# (what exactly is reported is defined by display_dataframe_info in
# weather_transformation, which is not visible here).
display_dataframe_info(df_wide)


In [None]:

# Display first few rows of the processed dataframe; as the last expression
# of the cell, the result is rendered by the notebook.
df_wide.head()


In [None]:
# Save to file using external function
# NOTE(review): output_file is a relative path, so the location depends on
# the notebook's working directory; the on-disk layout is decided by
# save_wide_dataframe (not visible here).
output_file = 'weather_2020_wide.parquet'
save_wide_dataframe(df_wide, output_file)

# Closing timestamp, matching the "Starting ..." message printed by the
# setup cell so the total run time can be read off the output.
print(f"\nProcessing completed at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")


## Summary

Dataframe saved as `weather_2020_wide.parquet` with:
- **Index**: station `ID`, year, and `ELEMENT` (where `ELEMENT` is TOBS, TMAX, TMIN, PRCP, SNOW, or SNWD)
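- **Columns**: `day_1` through `day_365`, one per day of the year (note: 2020 is a leap year with 366 days, so verify how `transform_long_to_wide` handles the final day)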
- **Values**: Weather observations (units vary by measurement type)
- **Time Range**: 2020 (1 year)
- **Processed using Dask DataFrames** for efficient handling of large datasets

### Measurement Types:
- **TOBS**: Temperature at observation time (tenths of degrees C)
- **TMAX**: Maximum temperature (tenths of degrees C)  
- **TMIN**: Minimum temperature (tenths of degrees C)
- **PRCP**: Precipitation (tenths of mm)
- **SNOW**: Snowfall (mm)
- **SNWD**: Snow depth (mm)


In [None]:
# Clean up cluster
# Hands the cluster and client objects created by setup_dask_cluster in the
# first cell to the project's shutdown helper. Run this last: later Dask
# computations presumably fail once the cluster is closed.
from cluster_utils import close_dask_cluster
close_dask_cluster(cluster, client)
