In [22]:
token = "YAvQVopzQifTCqEDAYkIsFexHiCisBIB"

In [24]:
import pandas as pd
import requests
import time
import logging
from datetime import datetime
from typing import Dict, List, Optional

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

class NOAAWeatherCollector:
    def __init__(self, token: str, start_year: int = 2020, end_year: int = 2022):
        self.base_url = "https://www.ncdc.noaa.gov/cdo-web/api/v2"
        self.headers = {"token": token}
        self.start_year = start_year
        self.end_year = end_year

    def get_weather_data(self, station_id: str, state: str) -> List[Dict]:
        """Get weather data for a station"""
        all_data = []
        
        for year in range(self.start_year, self.end_year + 1):
            logging.info(f"Fetching {year} data for station {station_id}")
            
            params = {
                "datasetid": "GHCND",
                "stationid": station_id,
                "startdate": f"{year}-01-01",
                "enddate": f"{year}-12-31",
                "limit": 1000,
                "datatypeid": "TMAX,TMIN,PRCP"  # Temperature max/min and precipitation
            }
            
            try:
                response = requests.get(
                    f"{self.base_url}/data",
                    headers=self.headers,
                    params=params
                )
                
                if response.status_code == 200:
                    data = response.json()
                    if 'results' in data:
                        for record in data['results']:
                            all_data.append({
                                'station_id': station_id,
                                'state': state,
                                'date': record['date'],
                                'datatype': record['datatype'],
                                'value': record['value']
                            })
                elif response.status_code == 429:  # Rate limit
                    wait_time = int(response.headers.get('Retry-After', 60))
                    logging.warning(f"Rate limit hit. Waiting {wait_time} seconds...")
                    time.sleep(wait_time)
                else:
                    logging.error(f"Error fetching data: {response.status_code}")
                    
            except Exception as e:
                logging.error(f"Error processing station {station_id}: {str(e)}")
            
            time.sleep(1)  # Prevent rate limiting
            
        return all_data

def process_weather_data(df: pd.DataFrame) -> pd.DataFrame:
    """Process weather data into yearly averages by state"""
    # Convert date to datetime
    df['date'] = pd.to_datetime(df['date'])
    df['year'] = df['date'].dt.year
    
    # Convert values (temperature in tenths of degrees C, precipitation in tenths of mm)
    df['value'] = df['value'].astype(float)
    df.loc[df['datatype'].isin(['TMAX', 'TMIN']), 'value'] = df.loc[df['datatype'].isin(['TMAX', 'TMIN']), 'value'] / 10
    df.loc[df['datatype'] == 'PRCP', 'value'] = df.loc[df['datatype'] == 'PRCP', 'value'] / 10
    
    # Calculate yearly averages by state
    grouped = df.pivot_table(
        index=['state', 'year'],
        columns='datatype',
        values='value',
        aggfunc={
            'value': {
                'TMAX': 'mean',
                'TMIN': 'mean',
                'PRCP': 'sum'
            }
        }
    ).reset_index()
    
    # Rename columns
    grouped.columns = ['state', 'year', 'avg_max_temp_c', 'avg_min_temp_c', 'total_precip_mm']
    
    return grouped

def main():
    # Read stations from CSV
    stations_df = pd.read_csv('us_weather_stations.csv')
    
    # Initialize collector
    token = "YAvQVopzQifTCqEDAYkIsFexHiCisBIB"
    collector = NOAAWeatherCollector(token=token)
    
    all_weather_data = []
    
    # Process a sample of stations first (2 per state)
    sample_stations = stations_df.groupby('state').head(2)
    
    for _, station in sample_stations.iterrows():
        try:
            station_data = collector.get_weather_data(
                station_id=station['id'],
                state=station['state']
            )
            all_weather_data.extend(station_data)
            logging.info(f"Collected {len(station_data)} records for {station['id']}")
            
        except Exception as e:
            logging.error(f"Error processing station {station['id']}: {str(e)}")
            continue
    
    if all_weather_data:
        # Convert to DataFrame
        weather_df = pd.DataFrame(all_weather_data)
        
        # Save raw data
        weather_df.to_csv('weather_data_raw.csv', index=False)
        logging.info("Saved raw weather data")
        
        # Process and save aggregated data
        state_yearly_df = process_weather_data(weather_df)
        state_yearly_df.to_csv('weather_data_by_state.csv', index=False)
        logging.info("Saved processed weather data")
        
        # Display sample of results
        print("\nSample of processed weather data:")
        print(state_yearly_df.head())
    else:
        logging.error("No weather data collected")

if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        logging.error(f"An error occurred: {str(e)}")
        raise

2024-11-27 16:47:47,984 - INFO - Fetching 2020 data for station GHCND:USC00010063
2024-11-27 16:47:50,568 - INFO - Fetching 2021 data for station GHCND:USC00010063
2024-11-27 16:47:52,105 - INFO - Fetching 2022 data for station GHCND:USC00010063
2024-11-27 16:48:22,255 - ERROR - Error fetching data: 503
2024-11-27 16:48:23,262 - INFO - Collected 1499 records for GHCND:USC00010063
2024-11-27 16:48:23,275 - INFO - Fetching 2020 data for station GHCND:USC00010148
2024-11-27 16:48:24,832 - INFO - Fetching 2021 data for station GHCND:USC00010148
2024-11-27 16:48:26,264 - INFO - Fetching 2022 data for station GHCND:USC00010148
2024-11-27 16:48:27,722 - INFO - Collected 0 records for GHCND:USC00010148
2024-11-27 16:48:27,725 - INFO - Fetching 2020 data for station GHCND:CA001206197
2024-11-27 16:48:29,387 - INFO - Fetching 2021 data for station GHCND:CA001206197
2024-11-27 16:48:31,433 - INFO - Fetching 2022 data for station GHCND:CA001206197
2024-11-27 16:48:32,764 - INFO - Collected 1957 re

AttributeError: module 'numpy' has no attribute 'matrix'