# Setup
```bash
pip install s3fs pandas dask
```

In [1]:
import s3fs, pandas as pd, dask.dataframe as dd

# S3 Connection

In [2]:
s3 = s3fs.S3FileSystem(anon=True)
bucket = 'noaa-ghcn-pds'
print(f"Testing bucket: {bucket}")

Testing bucket: noaa-ghcn-pds


# Bucket Structure

In [3]:
print(s3.ls(bucket))

['noaa-ghcn-pds/csv', 'noaa-ghcn-pds/csv.gz', 'noaa-ghcn-pds/ghcnd-countries.txt', 'noaa-ghcn-pds/ghcnd-inventory.txt', 'noaa-ghcn-pds/ghcnd-states.txt', 'noaa-ghcn-pds/ghcnd-stations.txt', 'noaa-ghcn-pds/ghcnd-version.txt', 'noaa-ghcn-pds/index.html', 'noaa-ghcn-pds/mingle-list.txt', 'noaa-ghcn-pds/parquet', 'noaa-ghcn-pds/readme-by_station.txt', 'noaa-ghcn-pds/readme-by_year.txt', 'noaa-ghcn-pds/readme.txt', 'noaa-ghcn-pds/status-by_station.txt', 'noaa-ghcn-pds/status-by_year.txt', 'noaa-ghcn-pds/status.txt', 'noaa-ghcn-pds/test.txt']


In [13]:
print(s3.ls(f'{bucket}/parquet/by_year/YEAR=2020/'))

['noaa-ghcn-pds/parquet/by_year/YEAR=2020/ELEMENT=ADPT', 'noaa-ghcn-pds/parquet/by_year/YEAR=2020/ELEMENT=ASLP', 'noaa-ghcn-pds/parquet/by_year/YEAR=2020/ELEMENT=ASTP', 'noaa-ghcn-pds/parquet/by_year/YEAR=2020/ELEMENT=AWBT', 'noaa-ghcn-pds/parquet/by_year/YEAR=2020/ELEMENT=AWDR', 'noaa-ghcn-pds/parquet/by_year/YEAR=2020/ELEMENT=AWND', 'noaa-ghcn-pds/parquet/by_year/YEAR=2020/ELEMENT=DAPR', 'noaa-ghcn-pds/parquet/by_year/YEAR=2020/ELEMENT=DASF', 'noaa-ghcn-pds/parquet/by_year/YEAR=2020/ELEMENT=DATN', 'noaa-ghcn-pds/parquet/by_year/YEAR=2020/ELEMENT=DATX', 'noaa-ghcn-pds/parquet/by_year/YEAR=2020/ELEMENT=DWPR', 'noaa-ghcn-pds/parquet/by_year/YEAR=2020/ELEMENT=EVAP', 'noaa-ghcn-pds/parquet/by_year/YEAR=2020/ELEMENT=MDPR', 'noaa-ghcn-pds/parquet/by_year/YEAR=2020/ELEMENT=MDSF', 'noaa-ghcn-pds/parquet/by_year/YEAR=2020/ELEMENT=MDTN', 'noaa-ghcn-pds/parquet/by_year/YEAR=2020/ELEMENT=MDTX', 'noaa-ghcn-pds/parquet/by_year/YEAR=2020/ELEMENT=MNPN', 'noaa-ghcn-pds/parquet/by_year/YEAR=2020/ELEMEN

# Parquet Files


In [5]:
years = s3.ls(f'{bucket}/parquet/by_year')
print(f"Years: {len(years)}")
print(f"First: {years[0].split('/')[-1]}")
print(f"Last: {years[-1].split('/')[-1]}")


Years: 263
First: YEAR=1750
Last: YEAR=2025


# Column Meanings (from README)

**Core Elements:**
- **PRCP**: Precipitation (tenths of mm)
- **SNOW**: Snowfall (mm) 
- **SNWD**: Snow depth (mm)
- **TMAX**: Maximum temperature (tenths of degrees C)
- **TMIN**: Minimum temperature (tenths of degrees C)

**Temperature Elements:**
- **TAVG**: Average daily temperature (tenths of degrees C)
- **TOBS**: Temperature at time of observation (tenths of degrees C)
- **TAXN**: Average daily temperature computed as (TMAX+TMIN)/2.0 (tenths of degrees C)

**Wind Elements:**
- **AWND**: Average daily wind speed (tenths of meters per second)
- **WSF1**: Fastest 1-minute wind speed (tenths of meters per second)
- **WSF2**: Fastest 2-minute wind speed (tenths of meters per second)
- **WSFG**: Peak gust wind speed (tenths of meters per second)

**Pressure Elements:**
- **ASLP**: Average Sea Level Pressure (hPa * 10)
- **ASTP**: Average Station Level Pressure (hPa * 10)

**Other Elements:**
- **AWDR**: Average daily wind direction (degrees)
- **EVAP**: Evaporation of water from evaporation pan (tenths of mm)
- **TSUN**: Daily total sunshine (minutes)
- **PSUN**: Daily percent of possible sunshine (percent)

**Flags:**
- **MFLAG**: Measurement flag (B=12hr totals, D=6hr totals, H=hourly, etc.)
- **QFLAG**: Quality flag (D=duplicate, G=gap, I=internal consistency, etc.)
- **SFLAG**: Source flag (0=US Cooperative, A=ASOS, S=Global Summary, etc.)


In [7]:
# Test S3 connection
print("Testing S3 connection...")
test_files = s3.ls(f'{bucket}/parquet/by_year/')[:5]
print(f"S3 working. Years: {[f.split('/')[-1] for f in test_files]}")


Testing S3 connection...
S3 working. Years: ['YEAR=1750', 'YEAR=1763', 'YEAR=1764', 'YEAR=1765', 'YEAR=1766']


In [14]:
# Explore S3 parquet by_year directory structure
print("Exploring S3 parquet by_year structure...")
print("=" * 50)

# Get all year directories
years = s3.ls(f'{bucket}/parquet/by_year/')
print(f"Total years available: {len(years)}")
print(f"Year range: {years[0].split('/')[-1]} to {years[-1].split('/')[-1]}")

# Explore a recent year (2020) structure
print(f"\nExploring 2020 structure:")
year_2020 = f'{bucket}/parquet/by_year/YEAR=2020/'
elements_2020 = s3.ls(year_2020)
print(f"Elements in 2020: {len(elements_2020)}")

# Show first 10 elements
print(f"\nFirst 10 elements in 2020:")
for i, elem in enumerate(elements_2020[:10]):
    element_name = elem.split('/')[-1].replace('ELEMENT=', '')
    print(f"  {i+1:2d}. {element_name}")

# Explore one element in detail
if elements_2020:
    sample_element = elements_2020[0]  # First element
    element_name = sample_element.split('/')[-1].replace('ELEMENT=', '')
    print(f"\nExploring {element_name} in detail:")
    
    # Get files in this element
    element_files = s3.ls(sample_element)
    print(f"Files in {element_name}: {len(element_files)}")
    
    # Show file sizes
    print(f"File sizes:")
    for i, file_path in enumerate(element_files[:5]):  # Show first 5 files
        try:
            file_info = s3.info(file_path)
            size_mb = file_info['size'] / (1024 * 1024)
            filename = file_path.split('/')[-1]
            print(f"  {i+1}. {filename[:30]:30s} ({size_mb:6.2f} MB)")
        except:
            print(f"  {i+1}. {file_path.split('/')[-1][:30]:30s} (size unknown)")

print(f"\nDirectory structure summary:")
print(f"- Years: {len(years)}")
print(f"- Elements per year: ~{len(elements_2020)}")
print(f"- Files per element: ~{len(element_files) if elements_2020 else 'unknown'}")


Exploring S3 parquet by_year structure...
Total years available: 263
Year range: YEAR=1750 to YEAR=2025

Exploring 2020 structure:
Elements in 2020: 74

First 10 elements in 2020:
   1. ADPT
   2. ASLP
   3. ASTP
   4. AWBT
   5. AWDR
   6. AWND
   7. DAPR
   8. DASF
   9. DATN
  10. DATX

Exploring ADPT in detail:
Files in ADPT: 1
File sizes:
  1. 6f6332297eb3486fae9c856b0d7fab (  0.18 MB)

Directory structure summary:
- Years: 263
- Elements per year: ~74
- Files per element: ~1


In [18]:
# Explore parquet file schema
print("Exploring parquet file schema...")
print("=" * 40)

# Configure Dask to use the correct backend
import dask
import dask.dataframe as dd

# Try different backend configurations
try:
    # Option 1: Use dask.dataframe directly (default backend)
    print("Using default Dask DataFrame backend...")
except:
    pass

# Check available backends
print(f"Available backends: {dask.dataframe._backends}")
print(f"Current backend: {dask.dataframe._backend}")

try:
    # First, check what files are actually available in 2020
    year_2020 = f'{bucket}/parquet/by_year/YEAR=2020/'
    elements_2020 = s3.ls(year_2020)
    print(f"Available elements in 2020: {len(elements_2020)}")
    print(f"First 5 elements: {[elem.split('/')[-1] for elem in elements_2020[:5]]}")
    
    # Try to load from a specific element directory
    if elements_2020:
        # Use the first available element
        sample_element = elements_2020[0]
        element_name = sample_element.split('/')[-1].replace('ELEMENT=', '')
        print(f"\nTrying to load {element_name} data...")
        
        # Get files in this element
        element_files = s3.ls(sample_element)
        print(f"Files in {element_name}: {len(element_files)}")
        
        if element_files:
            # Try loading the first file directly with explicit Dask configuration
            first_file = element_files[0]
            print(f"Loading file: {first_file}")
            
            # Load using Dask with simplified approach
            print("Attempting to load with Dask...")
            try:
                # Try with s3fs filesystem
                df_sample = dd.read_parquet(first_file, storage_options={'anon': True})
            except Exception as e1:
                print(f"Dask with s3fs failed: {e1}")
                # Try with direct S3 URL
                s3_url = f"s3://{first_file}"
                print(f"Trying direct S3 URL: {s3_url}")
                df_sample = dd.read_parquet(s3_url, storage_options={'anon': True})
            print(f"✓ Loaded sample: {len(df_sample):,} records, {df_sample.npartitions} partitions")
            
            # Show schema
            print(f"\nSchema:")
            print(f"Columns: {list(df_sample.columns)}")
            print(f"Data types:")
            print(df_sample.dtypes)
            
            # Show sample data
            print(f"\nSample data (first 3 rows):")
            sample_data = df_sample.head(3).compute()
            print(sample_data)
            
        else:
            print(f"No files found in {element_name}")
    else:
        print("No elements found in 2020 directory")
        
except Exception as e:
    print(f"✗ Error loading parquet sample: {e}")
    print("Trying alternative approach with different engine...")
    
    # Try with different approaches
    try:
        if elements_2020 and element_files:
            print("Trying alternative approaches...")
            
            # Try 1: Use pandas directly with s3fs
            try:
                print("Trying pandas with s3fs...")
                import pandas as pd
                with s3.open(first_file, 'rb') as f:
                    df_pandas = pd.read_parquet(f)
                print(f"✓ Pandas load successful: {len(df_pandas):,} records")
                print(f"Columns: {list(df_pandas.columns)}")
                print(f"Sample:")
                print(df_pandas.head(3))
                
                # Convert to Dask for consistency
                df_alt = dd.from_pandas(df_pandas, npartitions=1)
                print(f"✓ Converted to Dask: {len(df_alt):,} records")
                
            except Exception as e_pandas:
                print(f"Pandas approach failed: {e_pandas}")
                
                # Try 2: Different Dask engine
                print("Trying Dask with fastparquet...")
                df_alt = dd.read_parquet(
                    first_file,
                    storage_options={'anon': True},
                    engine='fastparquet'
                )
                print(f"✓ Fastparquet load successful: {len(df_alt):,} records")
                print(f"Columns: {list(df_alt.columns)}")
                print(f"Sample: {df_alt.head(3).compute()}")
                
    except Exception as e2:
        print(f"✗ All alternative approaches failed: {e2}")
        print("Trying to explore directory structure instead...")
        
        # Fallback: just explore the directory structure
        try:
            print(f"Exploring {bucket}/parquet/by_year/ structure...")
            years = s3.ls(f'{bucket}/parquet/by_year/')
            print(f"Available years: {len(years)}")
            
            if years:
                # Try to list files in a recent year
                recent_year = years[-1]  # Last year
                print(f"Exploring {recent_year}...")
                year_elements = s3.ls(recent_year)
                print(f"Elements in {recent_year.split('/')[-1]}: {len(year_elements)}")
                
                if year_elements:
                    first_element = year_elements[0]
                    print(f"First element: {first_element}")
                    element_files = s3.ls(first_element)
                    print(f"Files in element: {len(element_files)}")
                    if element_files:
                        print(f"First file: {element_files[0]}")
        except Exception as e3:
            print(f"✗ Directory exploration failed: {e3}")


Exploring parquet file schema...
Using default Dask DataFrame backend...


AttributeError: module 'dask.dataframe' has no attribute '_backends'

In [19]:
# CORRECTED: Explore parquet file schema using working approach
print("Exploring parquet file schema (corrected approach)...")
print("=" * 50)

try:
    # Use the EXACT same approach as the working process_multiple_years.py
    # Setup S3 filesystem
    s3 = s3fs.S3FileSystem(anon=True)
    bucket_path = 's3://noaa-ghcn-pds/parquet/by_year/'
    
    # Get files for 2020 TOBS (same as working code)
    year = 2020
    measurement = 'TOBS'
    file_path = f"{bucket_path}YEAR={year}/ELEMENT={measurement}/"
    
    print(f"Looking for files in: {file_path}")
    
    # Use s3.glob to find parquet files (same as working code)
    files = s3.glob(f"{file_path}*.parquet")
    print(f"Found {len(files)} parquet files")
    
    if files:
        # Convert to s3:// URLs (same as working code)
        all_files = [f"s3://{f}" for f in files]
        print(f"File URLs: {all_files[:3]}...")  # Show first 3
        
        # Load using the same approach as working code
        print("Loading with Dask (same approach as process_multiple_years.py)...")
        df_sample = dd.read_parquet(all_files, storage_options={'anon': True})
        
        print(f"✓ Loaded sample: {len(df_sample):,} records, {df_sample.npartitions} partitions")
        
        # Show schema
        print(f"\nSchema:")
        print(f"Columns: {list(df_sample.columns)}")
        print(f"Data types:")
        print(df_sample.dtypes)
        
        # Show sample data
        print(f"\nSample data (first 3 rows):")
        sample_data = df_sample.head(3).compute()
        print(sample_data)
        
        # Show data statistics
        print(f"\nData statistics:")
        print(f"Date range: {df_sample['DATE'].min().compute()} to {df_sample['DATE'].max().compute()}")
        print(f"Unique stations: {df_sample['ID'].nunique().compute():,}")
        print(f"Value range: {df_sample['DATA_VALUE'].min().compute():.1f} to {df_sample['DATA_VALUE'].max().compute():.1f}")
        
    else:
        print("No parquet files found for 2020 TOBS")
        
except Exception as e:
    print(f"✗ Error loading parquet sample: {e}")
    print("Trying alternative measurement...")
    
    # Try with a different measurement
    try:
        measurement = 'PRCP'
        file_path = f"{bucket_path}YEAR={year}/ELEMENT={measurement}/"
        files = s3.glob(f"{file_path}*.parquet")
        
        if files:
            all_files = [f"s3://{f}" for f in files]
            print(f"Trying {measurement} data: {len(files)} files")
            df_alt = dd.read_parquet(all_files, storage_options={'anon': True})
            print(f"✓ Alternative load successful: {len(df_alt):,} records")
            print(f"Columns: {list(df_alt.columns)}")
            print(f"Sample: {df_alt.head(3).compute()}")
        else:
            print(f"No files found for {measurement}")
            
    except Exception as e2:
        print(f"✗ Alternative measurement failed: {e2}")


Exploring parquet file schema (corrected approach)...
Looking for files in: s3://noaa-ghcn-pds/parquet/by_year/YEAR=2020/ELEMENT=TOBS/
Found 4 parquet files
File URLs: ['s3://noaa-ghcn-pds/parquet/by_year/YEAR=2020/ELEMENT=TOBS/6f6332297eb3486fae9c856b0d7fab49_0.snappy.parquet', 's3://noaa-ghcn-pds/parquet/by_year/YEAR=2020/ELEMENT=TOBS/6f6332297eb3486fae9c856b0d7fab49_1.snappy.parquet', 's3://noaa-ghcn-pds/parquet/by_year/YEAR=2020/ELEMENT=TOBS/6f6332297eb3486fae9c856b0d7fab49_2.snappy.parquet']...
Loading with Dask (same approach as process_multiple_years.py)...
✗ Error loading parquet sample: No backend dispatch registered for dask
Trying alternative measurement...
Trying PRCP data: 23 files
✗ Alternative measurement failed: No backend dispatch registered for dask
