# Concatenate Weather Data (1950-1954)

Simple notebook to concatenate weather parquet files for years 1950-1954.


In [1]:
import dask.dataframe as dd
import os
from datetime import datetime

print(f"Starting at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")


Starting at 2025-10-17 14:50:27


In [2]:
# Define years and collect file paths
years = [1950, 1951, 1952, 1953, 1954]
data_dir = "data"

file_paths = []
for year in years:
    file_path = os.path.join(data_dir, f"weather_{year}_wide.parquet")
    if os.path.exists(file_path):
        file_paths.append(file_path)
        print(f"✓ Found: {year}")
    else:
        print(f"✗ Missing: {year}")

print(f"\nFound {len(file_paths)} files")


✓ Found: 1950
✓ Found: 1951
✓ Found: 1952
✓ Found: 1953
✓ Found: 1954

Found 5 files


In [3]:
# Load and concatenate all files
print("Loading and concatenating files...")

dataframes = []
for file_path in file_paths:
    df = dd.read_parquet(file_path)
    dataframes.append(df)
    print(f"  Loaded: {file_path}")

# Concatenate all dataframes
df_combined = dd.concat(dataframes, ignore_index=True)
print(f"\nConcatenated: {df_combined.shape[0]:,} rows × {len(df_combined.columns)} columns")


Loading and concatenating files...
  Loaded: data/weather_1950_wide.parquet
  Loaded: data/weather_1951_wide.parquet
  Loaded: data/weather_1952_wide.parquet
  Loaded: data/weather_1953_wide.parquet
  Loaded: data/weather_1954_wide.parquet


TypeError: unsupported format string passed to Delayed.__format__

In [None]:
# Save the combined data
output_file = os.path.join(data_dir, "weather_1950_1954_combined.parquet")
print(f"Saving to: {output_file}")

df_combined.to_parquet(output_file)
print("✓ Saved successfully")

# Check file size
if os.path.exists(output_file):
    file_size = os.path.getsize(output_file) / (1024 * 1024)
    print(f"File size: {file_size:.1f} MB")

print(f"\nCompleted at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")


Reference column structure (368 columns):
  ['ID', 'year', 'ELEMENT', 'day_1', 'day_2', 'day_3', 'day_4', 'day_5', 'day_6', 'day_7']...
✓ Dataframe 2 columns match reference
✓ Dataframe 3 columns match reference
✓ Dataframe 4 columns match reference
✓ Dataframe 5 columns match reference

✅ All dataframes have consistent column structure


In [None]:
# Quick verification
print("Verifying saved file...")
verification_df = dd.read_parquet(output_file)
print(f"✓ Verification: {verification_df.shape[0]:,} rows × {len(verification_df.columns)} columns")
print("✓ All done!")


Concatenating 5 dataframes...
Combined dataframe: 365,437 rows × 368 columns
Columns: ['ID', 'year', 'ELEMENT', 'day_1', 'day_2', 'day_3', 'day_4', 'day_5', 'day_6', 'day_7']...
Concatenation completed in 2.0 seconds
