# Qs: Create a daily-updated data archive of observed meteorology:
Stakeholders are Salient's Machine Learning team and our customers
Duration limit to complete the task is a 2 hour timeframe, enforced on the honor system
Deadline to submit an answer is 2 weeks after receipt of this email
For now, the archive will contain 3 different observed met station WBAN codes:
14739 (Boston), 23169 (Las Vegas), 94846 (Chicago)
Eventually, this system must scale to handle all >100k GHCNd stations
Get data from NCEI, example for Boston:
https://www.ncei.noaa.gov/data/global-historical-climatology-network-daily/access/USW00014739.csv

# GET ALL THE LIBRARIES

In [1]:
import requests
import pandas as pd
from datetime import datetime, timedelta
from io import StringIO
import io


  from pandas.core import (


# GET THE URL

In [2]:
BASE_URL = "https://www.ncei.noaa.gov/data/global-historical-climatology-network-daily/access/"

# HELPER FUNCTIONS TO BUILD THE ARCHIVE AND TO APPEND NEWLY PUBLISHED DATA

In [3]:
def build_ghcnd_archive(station_ids):
    """
    Establishes a fresh GHCND archive from scratch for a list of stations,
    including only specific columns: ghcn_id (STATION), DATE, precip, TMAX, and TMIN.

    For each station the per-station CSV is downloaded from BASE_URL and the
    selected columns are written to "<station_id>_ghcnd_archive.csv" (relative
    to the current working directory), overwriting any existing file.  The
    source column PRCP is stored under the name "precip".  Stations that do not
    report precip, TMAX or TMIN still get an archive; the absent column is
    written empty.

    Failures (network/HTTP errors, empty or unparsable responses, missing
    station-id or DATE column) are reported with print() and the loop moves on
    to the next station; nothing is raised for them.

    Args:
        station_ids: Iterable of GHCNd station identifiers, e.g. "USW00014739".

    Returns:
        None.
    """
    # Only these source columns are ever used.  Restricting the parse to them
    # keeps memory down and avoids type inference on columns that are discarded.
    source_columns = {'STATION', 'ghcn_id', 'DATE', 'PRCP', 'precip', 'TMAX', 'TMIN'}

    for station_id in station_ids:
        url = f"{BASE_URL}{station_id}.csv"
        try:
            # Without a timeout a single stalled connection would block the
            # whole run; requests.Timeout is a RequestException, handled below.
            response = requests.get(url, timeout=60)
            response.raise_for_status()  # Raise an HTTPError for bad responses

            # Use StringIO to read CSV data from the response text
            df = pd.read_csv(
                StringIO(response.text),
                usecols=lambda name: name in source_columns,
            )

            # Rename 'PRCP' to 'precip' if it exists
            if 'PRCP' in df.columns:
                df = df.rename(columns={'PRCP': 'precip'})

            # The station id and the date are mandatory; without them the
            # archive rows would be meaningless, so skip the station.
            id_column = 'STATION' if 'STATION' in df.columns else 'ghcn_id'
            missing = [name for name in (id_column, 'DATE') if name not in df.columns]
            if missing:
                print(f"Error: Missing required column(s) {missing} for station {station_id}.")
                continue

            # reindex (rather than df[[...]]) so that a station lacking one of
            # the measurement columns gets an empty column instead of a KeyError.
            df_filtered = df.reindex(columns=[id_column, 'DATE', 'precip', 'TMAX', 'TMIN'])

            # Save the filtered DataFrame to a CSV file
            filename = f"{station_id}_ghcnd_archive.csv"
            df_filtered.to_csv(filename, index=False)
            print(f"Archive created successfully: {filename}")

        except requests.RequestException as e:
            print(f"Error: {e} - Unable to create archive for station {station_id}.")
        except pd.errors.EmptyDataError:
            print(f"Error: No data found in the response for station {station_id}.")
        except pd.errors.ParserError:
            print(f"Error: Failed to parse the downloaded file for station {station_id}.")

def update_ghcnd_archive(station_ids):
    """
    Updates the GHCND archive with the latest data for a list of stations.

    For each station the existing "<station_id>_ghcnd_archive.csv" is read and
    its newest DATE determined.  If the day after that date is still in the
    future the station is skipped without any download.  Otherwise the
    station's CSV is downloaded from BASE_URL, reduced to the archive columns
    (station id, DATE, precip, TMAX, TMIN) and every row dated after the newest
    archived DATE is appended to the archive file.  An archive that contains a
    header but no dated rows is filled with all downloaded rows.

    Problems (missing/empty/corrupted archive, network/HTTP errors, empty or
    unparsable download, missing station-id or DATE column) are reported with
    print() and the loop moves on to the next station.

    Args:
        station_ids: Iterable of GHCNd station identifiers, e.g. "USW00014739".

    Returns:
        None.
    """
    # Only these source columns are ever used; skip the rest at parse time.
    source_columns = {'STATION', 'ghcn_id', 'DATE', 'PRCP', 'precip', 'TMAX', 'TMIN'}
    chunksize = 10**6  # Rows per parsed chunk; adjust as needed

    for station_id in station_ids:
        filename = f"{station_id}_ghcnd_archive.csv"
        try:
            df_existing = pd.read_csv(filename)
        except FileNotFoundError:
            print(f"Archive not found for station {station_id}. Run build_ghcnd_archive() first.")
            continue
        except pd.errors.EmptyDataError:
            print(f"Error: Archive file for station {station_id} is empty.")
            continue
        except pd.errors.ParserError:
            print(f"Error: Archive file for station {station_id} is corrupted.")
            continue

        # Convert 'DATE' column to datetime
        df_existing['DATE'] = pd.to_datetime(df_existing['DATE'])
        latest_date = df_existing['DATE'].max()
        # With no dated rows max() is NaT; comparisons against NaT are always
        # False, so that case has to be handled explicitly further down.
        archive_has_dates = not pd.isna(latest_date)
        next_date = latest_date + timedelta(days=1)

        if archive_has_dates and next_date > datetime.today():
            print(f"Archive is already up-to-date for station {station_id}.")
            continue

        url = f"{BASE_URL}{station_id}.csv"
        try:
            # Without a timeout a single stalled connection would block the
            # whole run; requests.Timeout is a RequestException, handled below.
            response = requests.get(url, timeout=60)
            response.raise_for_status()

            df_list = []
            schema_ok = True

            # Use io.StringIO to read CSV data from the response text
            reader = pd.read_csv(
                io.StringIO(response.text),
                chunksize=chunksize,
                usecols=lambda name: name in source_columns,
            )
            for chunk in reader:
                # Rename 'PRCP' to 'precip' (no-op when the column is absent)
                chunk = chunk.rename(columns={'PRCP': 'precip'})

                # The station id and the date are mandatory
                id_column = 'STATION' if 'STATION' in chunk.columns else 'ghcn_id'
                missing = [name for name in (id_column, 'DATE') if name not in chunk.columns]
                if missing:
                    print(f"Error: Missing required column(s) {missing} for station {station_id}.")
                    schema_ok = False
                    break

                # reindex returns a new frame, so the assignment below is safe,
                # and a measurement column the station lacks comes back empty
                # instead of raising KeyError.
                chunk = chunk.reindex(columns=[id_column, 'DATE', 'precip', 'TMAX', 'TMIN'])

                # Convert 'DATE' column to datetime
                chunk['DATE'] = pd.to_datetime(chunk['DATE'])

                # Filter for new data only
                if archive_has_dates:
                    df_new_filtered = chunk[chunk['DATE'] >= next_date]
                else:
                    df_new_filtered = chunk
                if not df_new_filtered.empty:
                    df_list.append(df_new_filtered)

            if not schema_ok:
                continue

            if df_list:
                df_new_combined = pd.concat(df_list)
                df_updated = pd.concat([df_existing, df_new_combined])
                df_updated.to_csv(filename, index=False)
                print(f"Archive updated successfully: {filename}")
            else:
                print(f"No new data available for station {station_id}.")

        except requests.RequestException as e:
            print(f"Error: {e} - Unable to update archive for station {station_id}.")
        except pd.errors.EmptyDataError:
            print(f"Error: No data found in the downloaded file for station {station_id}.")
        except pd.errors.ParserError:
            print(f"Error: Failed to parse the downloaded file for station {station_id}.")

# CALL build_ghcnd_archive FUNCTION TO DOWNLOAD DATA

In [4]:
# List of station IDs (example: Boston, New York Central Park, Chicago)
# NOTE(review): the task statement at the top of this file asks for WBAN codes
# 14739 (Boston), 23169 (Las Vegas) and 94846 (Chicago); the second and third
# IDs below end in 94728 and 14819 instead - confirm which stations the
# archive is meant to track.
station_ids = ["USW00014739", "USW00094728", "USW00014819"]
# Call function to download data
build_ghcnd_archive(station_ids)  # Run this once to create the archives


  df = pd.read_csv(StringIO(response.text))


Archive created successfully: USW00014739_ghcnd_archive.csv


  df = pd.read_csv(StringIO(response.text))


Archive created successfully: USW00094728_ghcnd_archive.csv
Archive created successfully: USW00014819_ghcnd_archive.csv


# SCHEDULE TASK EVERY HOUR TO CHECK IF DATA IS AVAILABLE

In [5]:
import schedule
import time

def job():
    """Run one update pass over the module-level station_ids list."""
    update_ghcnd_archive(station_ids)

# Schedule the job to run once every hour
schedule.every().hour.do(job)

# Poll the scheduler forever; this loop has no exit condition of its own.
while True:
    schedule.run_pending()
    time.sleep(30)  # Wait 30 seconds before checking again