# NOAA GHCN Metadata Exploration

This notebook explores the metadata files from the NOAA Global Historical Climatology Network (GHCN) dataset stored in S3.

**Data Source:** `s3://noaa-ghcn-pds/`

**Metadata Files:**
- `ghcnd-stations.txt` - Weather station locations and info
- `ghcnd-countries.txt` - Country code mappings
- `ghcnd-states.txt` - US state/Canadian province codes
- `ghcnd-inventory.txt` - Data availability per station

In [None]:
# Install required packages if needed (%pip installs into this kernel's environment)
# %pip install pandas s3fs fsspec

In [None]:
import pandas as pd
import s3fs
from io import StringIO

# S3 access configuration: the bucket is public, so the filesystem is opened
# anonymously (anon=True) and no credentials are needed.
# NOTE(review): the read cells visible in this notebook spell out
# 'noaa-ghcn-pds/...' in their paths instead of referencing S3_BUCKET.
S3_BUCKET = 's3://noaa-ghcn-pds'
fs = s3fs.S3FileSystem(anon=True)

## 1. Countries Metadata

In [None]:
# Countries file layout: 2-character code, one separator character, then the country name.
with fs.open('noaa-ghcn-pds/ghcnd-countries.txt', 'r') as countries_file:
    countries_raw = countries_file.read()

country_rows = []
for record in countries_raw.strip().split('\n'):
    # Indices 0-1 hold the code; the name starts at index 3 (index 2 is the separator).
    country_rows.append({
        'country_code': record[:2],
        'country_name': record[3:].strip(),
    })
countries = pd.DataFrame(country_rows)

print(f"Total countries: {len(countries)}")
countries.head(10)

In [None]:
# Show the countries lookup table as the cell's output (bare last expression).
# NOTE(review): depending on pandas display options a long frame may be shown
# truncated, so "all" rows may not actually be visible - confirm.
countries

## 2. States/Provinces Metadata

In [None]:
# States file layout: 2-character code, one separator character, then the state/province name.
with fs.open('noaa-ghcn-pds/ghcnd-states.txt', 'r') as states_file:
    states_raw = states_file.read()

state_rows = []
for record in states_raw.strip().split('\n'):
    # Indices 0-1 hold the code; the name starts at index 3 (index 2 is the separator).
    state_rows.append({
        'state_code': record[:2],
        'state_name': record[3:].strip(),
    })
states = pd.DataFrame(state_rows)

print(f"Total states/provinces: {len(states)}")
states

## 3. Stations Metadata

Fixed-width format:
- Columns 1-11: Station ID
- Columns 13-20: Latitude
- Columns 22-30: Longitude
- Columns 32-37: Elevation (meters; `-999.9` means missing)
- Columns 39-40: State code
- Columns 42-71: Station name
- Columns 73-75: GSN flag
- Columns 77-79: HCN/CRN flag
- Columns 81-85: WMO ID

In [None]:
# Read stations file (fixed-width format).
# The whole file is read into memory first, then parsed from the string.
with fs.open('noaa-ghcn-pds/ghcnd-stations.txt', 'r') as f:
    stations_raw = f.read()

stations = pd.read_fwf(
    StringIO(stations_raw),
    # 0-based, end-exclusive offsets matching the 1-based column ranges
    # listed in the markdown above (e.g. columns 1-11 -> (0, 11)).
    colspecs=[
        (0, 11),    # ID
        (12, 20),   # LATITUDE
        (21, 30),   # LONGITUDE
        (31, 37),   # ELEVATION
        (38, 40),   # STATE
        (41, 71),   # NAME
        (72, 75),   # GSN_FLAG
        (76, 79),   # HCN_CRN_FLAG
        (80, 85)    # WMO_ID
    ],
    names=['station_id', 'latitude', 'longitude', 'elevation', 'state', 'name', 'gsn_flag', 'hcn_crn_flag', 'wmo_id'],
    # WMO IDs are identifiers, not quantities. Left to type inference the
    # column becomes float (it is blank for many stations), which drops
    # leading zeros and renders IDs like 3772.0. Keep it as text; blanks
    # still load as NaN.
    dtype={'wmo_id': str},
    # GHCN writes -999.9 for an unknown elevation. Load it as NaN so the
    # sentinel cannot leak into min/mean statistics or downstream tables.
    na_values={'elevation': ['-999.9']},
)

print(f"Total stations: {len(stations):,}")
stations.head(10)

In [None]:
# Station statistics
# GHCN encodes an unknown elevation as -999.9. Exclude that sentinel (and any
# NaN) from the range; otherwise -999.9 is reported as the lowest elevation.
ELEVATION_MISSING = -999.9
elevation_is_missing = stations['elevation'].isna() | stations['elevation'].eq(ELEVATION_MISSING)
valid_elevation = stations.loc[~elevation_is_missing, 'elevation']

print("Station Statistics:")
print(f"  Total stations: {len(stations):,}")
print(f"  Elevation range: {valid_elevation.min():.1f}m to {valid_elevation.max():.1f}m")
print(f"  Stations with missing elevation: {int(elevation_is_missing.sum()):,}")
print(f"  Latitude range: {stations['latitude'].min():.2f} to {stations['latitude'].max():.2f}")
print(f"  Longitude range: {stations['longitude'].min():.2f} to {stations['longitude'].max():.2f}")

In [None]:
# The country code is the first two characters of the station ID.
# This adds a column to `stations` that later cells filter on.
stations['country_code'] = stations['station_id'].str.slice(0, 2)

# Count stations per country, attach the country name, largest first.
stations_by_country = (
    stations
    .groupby('country_code')
    .size()
    .reset_index(name='station_count')
    .merge(countries, on='country_code', how='left')
    .sort_values('station_count', ascending=False)
)

print("Top 20 countries by number of stations:")
stations_by_country.head(20)

In [None]:
# Restrict to stations whose country code is 'US' (copy so later edits
# to us_stations cannot touch the full stations frame).
is_us_station = stations['country_code'] == 'US'
us_stations = stations[is_us_station].copy()

# Count stations per state code, attach the state name, largest first.
us_by_state = (
    us_stations
    .groupby('state')
    .size()
    .reset_index(name='station_count')
    .merge(states, left_on='state', right_on='state_code', how='left')
    .sort_values('station_count', ascending=False)
)

print(f"Total US stations: {len(us_stations):,}")
us_by_state.head(20)

## 4. Inventory Metadata

Lists which data elements each station reports, together with the first and last year of each element's record.

Fixed-width format:
- Columns 1-11: Station ID
- Columns 13-20: Latitude
- Columns 22-30: Longitude
- Columns 32-35: Element (TMAX, TMIN, PRCP, etc.)
- Columns 37-40: First year
- Columns 42-45: Last year

In [None]:
# Pull the raw inventory text from S3, then parse it as a fixed-width table.
with fs.open('noaa-ghcn-pds/ghcnd-inventory.txt', 'r') as inventory_file:
    inventory_raw = inventory_file.read()

# (column name, (start, end)) pairs; offsets are 0-based and end-exclusive,
# matching the 1-based column ranges listed in the markdown above.
inventory_layout = [
    ('station_id', (0, 11)),    # ID
    ('latitude', (12, 20)),     # LATITUDE
    ('longitude', (21, 30)),    # LONGITUDE
    ('element', (31, 35)),      # ELEMENT
    ('first_year', (36, 40)),   # FIRST_YEAR
    ('last_year', (41, 45)),    # LAST_YEAR
]

inventory = pd.read_fwf(
    StringIO(inventory_raw),
    colspecs=[span for _, span in inventory_layout],
    names=[column for column, _ in inventory_layout],
)

print(f"Total inventory records: {len(inventory):,}")
inventory.head(10)

In [None]:
# Per element: how many distinct stations report it, and the overall
# first/last year seen across those stations. Most widely reported first.
elements = (
    inventory
    .groupby('element')
    .agg(
        station_count=('station_id', 'nunique'),
        earliest_year=('first_year', 'min'),
        latest_year=('last_year', 'max'),
    )
    .reset_index()
    .sort_values('station_count', ascending=False)
)

print("Data elements available:")
print("\nCommon elements:")
# Legend for the element codes readers are most likely to recognise.
for legend_line in (
    "  TMAX = Maximum temperature",
    "  TMIN = Minimum temperature",
    "  PRCP = Precipitation",
    "  SNOW = Snowfall",
    "  SNWD = Snow depth",
):
    print(legend_line)
print()
elements.head(20)

In [None]:
# Inclusive span of years (first_year..last_year) for each station-element row.
# Stored on `inventory` because a later cell filters on this column.
first_year = inventory['first_year']
last_year = inventory['last_year']
inventory['years_of_data'] = last_year - first_year + 1

print("Data coverage statistics:")
print(f"  Earliest data: {first_year.min()}")
print(f"  Latest data: {last_year.max()}")
print(f"  Average years of data per station-element: {inventory['years_of_data'].mean():.1f}")

In [None]:
# Stations with longest records for core elements.
# Relies on inventory['years_of_data'] (last_year - first_year + 1), which an
# earlier cell adds to the frame.
core_elements = ['TMAX', 'TMIN', 'PRCP']
MIN_RECORD_YEARS = 100

# ">=" rather than ">": the report below is labelled "100+ years", so a record
# spanning exactly 100 years must be included.
long_records = inventory[
    inventory['element'].isin(core_elements)
    & (inventory['years_of_data'] >= MIN_RECORD_YEARS)
]

print(f"Stations with {MIN_RECORD_YEARS}+ years of core data: {long_records['station_id'].nunique():,}")
long_records.sort_values('years_of_data', ascending=False).head(20)

## 5. Summary Statistics for Snowflake Loading

Key info to consider when loading into Snowflake:

In [None]:
# Assemble the report line by line, then emit it with a single print call.
divider = "=" * 50
summary_lines = [
    divider,
    "SUMMARY FOR SNOWFLAKE LOADING",
    divider,
    # Row counts of the four metadata tables loaded above.
    "\nMetadata Tables:",
    f"  countries:  {len(countries):,} rows",
    f"  states:     {len(states):,} rows",
    f"  stations:   {len(stations):,} rows",
    f"  inventory:  {len(inventory):,} rows",
    # Cardinalities and overall year range.
    f"\nUnique stations: {stations['station_id'].nunique():,}",
    f"Unique elements: {inventory['element'].nunique()}",
    f"Date range: {inventory['first_year'].min()} - {inventory['last_year'].max()}",
]
print("\n".join(summary_lines))