In [1]:
import requests

In [2]:
# https://www.ncei.noaa.gov/support/access-data-service-api-user-documentation
# Fetch daily summaries for station AGE00147708 between 2009-12-01 and
# 2010-01-01 and display the decoded JSON records.
sample_response = requests.get(
    'https://www.ncei.noaa.gov/access/services/data/v1',
    # Passing the query as a dict keeps each parameter readable and editable;
    # requests encodes it into the same query string as before.
    params={
        'dataset': 'daily-summaries',
        'stations': 'AGE00147708',
        'startDate': '2009-12-01',
        'endDate': '2010-01-01',
        'includeAttributes': 'true',
        'format': 'json',
    },
    timeout=60,  # without a timeout a stalled connection blocks the kernel forever
)
# Fail loudly on a 4xx/5xx instead of trying to decode an error page as JSON.
sample_response.raise_for_status()
sample_response.json()

[{'DATE': '2009-12-01',
  'STATION': 'AGE00147708',
  'TAVG_ATTRIBUTES': 'H,,S',
  'PRCP_ATTRIBUTES': ',,S',
  'TMAX': '  145',
  'TAVG': '   99',
  'TMAX_ATTRIBUTES': ',,S',
  'TMIN': '   70',
  'PRCP': '  269',
  'TMIN_ATTRIBUTES': ',,S'},
 {'DATE': '2009-12-02',
  'STATION': 'AGE00147708',
  'TAVG_ATTRIBUTES': 'H,,S',
  'PRCP_ATTRIBUTES': ',,S',
  'TMAX': '  130',
  'TAVG': '   96',
  'TMAX_ATTRIBUTES': ',,S',
  'TMIN': '   75',
  'PRCP': '   41',
  'TMIN_ATTRIBUTES': ',,S'},
 {'DATE': '2009-12-03',
  'STATION': 'AGE00147708',
  'TAVG_ATTRIBUTES': 'H,,S',
  'PRCP_ATTRIBUTES': ',,S',
  'TMAX': '  185',
  'TAVG': '  111',
  'TMAX_ATTRIBUTES': ',,S',
  'TMIN': '   73',
  'PRCP': '    5',
  'TMIN_ATTRIBUTES': ',,S'},
 {'DATE': '2009-12-04',
  'STATION': 'AGE00147708',
  'TAVG_ATTRIBUTES': 'H,,S',
  'PRCP_ATTRIBUTES': ',,S',
  'TMAX': '  162',
  'TAVG': '  124',
  'TMAX_ATTRIBUTES': ',,S',
  'PRCP': '  330'},
 {'DATE': '2009-12-05',
  'STATION': 'AGE00147708',
  'TAVG_ATTRIBUTES': 'H,,S'

In [32]:
import pandas as pd

# https://www.ncdc.noaa.gov/homr/
# Location of the GHCN-Daily station list ("ghcnd-stations.txt"); it is
# downloaded and parsed line by line further down in this cell.
STATIONS_URL = 'https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-stations.txt'
def parse_ghcnd_stations_line(l: str):
    """
    Turn one fixed-width record of "ghcnd-stations.txt" into a dict.

    The column layout is the one given in
    [IV. FORMAT OF "ghcnd-stations.txt"] of
    https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt

    The record must be exactly 85 characters long; any other length raises
    ValueError carrying the offending line. LATITUDE, LONGITUDE and ELEVATION
    are converted to float; all other fields are returned as the raw column
    slice, so any padding inside the column is kept.
    """
    # Guard first: anything that is not a full-width record is rejected.
    if len(l) != 85:
        raise ValueError(l)

    # (field name, slice start, slice stop, converter), in output order.
    layout = (
        ('ID', 0, 11, str),
        ('LATITUDE', 12, 20, float),
        ('LONGITUDE', 21, 30, float),
        ('ELEVATION', 31, 37, float),
        ('STATE', 38, 40, str),
        ('NAME', 41, 71, str),
        ('GSN FLAG', 72, 75, str),
        ('HCN/CRN FLAG', 76, 79, str),
        ('WMO ID', 80, 85, str),
    )
    return {
        field: convert(l[start:stop])
        for field, start, stop, convert in layout
    }

# Download the station list and parse every non-empty line into one record.
stations_response = requests.get(
    STATIONS_URL,
    timeout=60,  # without a timeout a stalled connection blocks the kernel forever
)
# An HTTP error page would otherwise reach the parser and surface as a
# confusing ValueError about line length; fail on the status code instead.
stations_response.raise_for_status()
stations_resl = [
    parse_ghcnd_stations_line(line)
    for line in stations_response.text.split('\n')
    if len(line) > 0  # skips the empty string left after the final newline
]
STATIONS_DF = pd.DataFrame(stations_resl)

In [33]:
# Preview the first rows of the parsed station table.
STATIONS_DF.head()

Unnamed: 0,ID,LATITUDE,LONGITUDE,ELEVATION,STATE,NAME,GSN FLAG,HCN/CRN FLAG,WMO ID
0,ACW00011604,17.1167,-61.7833,10.1,,ST JOHNS COOLIDGE FLD,,,
1,ACW00011647,17.1333,-61.7833,19.2,,ST JOHNS,,,
2,AE000041196,25.333,55.517,34.0,,SHARJAH INTER. AIRP,GSN,,41196.0
3,AEM00041194,25.255,55.364,10.4,,DUBAI INTL,,,41194.0
4,AEM00041217,24.433,54.651,26.8,,ABU DHABI INTL,,,41217.0


In [40]:
# Location of the GHCN-Daily inventory file ("ghcnd-inventory.txt"); it is
# downloaded and parsed line by line further down in this cell.
INVENTORY_URL = 'https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-inventory.txt'
def parse_ghcnd_inventory_line(l: str):
    """
    Turn one fixed-width record of "ghcnd-inventory.txt" into a dict.

    The column layout is the one given in
    [VII. FORMAT OF "ghcnd-inventory.txt"] of
    https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt

    The record must be exactly 45 characters long; any other length raises
    ValueError carrying the offending line. LATITUDE and LONGITUDE are
    converted to float, FIRSTYEAR and LASTYEAR to int; ID and ELEMENT are
    returned as the raw column slice.
    """
    # Guard first: anything that is not a full-width record is rejected.
    if len(l) != 45:
        raise ValueError(l)

    # (field name, slice start, slice stop, converter), in output order.
    layout = (
        ('ID', 0, 11, str),
        ('LATITUDE', 12, 20, float),
        ('LONGITUDE', 21, 30, float),
        ('ELEMENT', 31, 35, str),
        ('FIRSTYEAR', 36, 40, int),
        ('LASTYEAR', 41, 45, int),
    )
    return {
        field: convert(l[start:stop])
        for field, start, stop, convert in layout
    }

# Download the inventory file and parse every non-empty line into one record.
inventory_response = requests.get(
    INVENTORY_URL,
    timeout=60,  # without a timeout a stalled connection blocks the kernel forever
)
# An HTTP error page would otherwise reach the parser and surface as a
# confusing ValueError about line length; fail on the status code instead.
inventory_response.raise_for_status()
inventory_resl = [
    parse_ghcnd_inventory_line(line)
    for line in inventory_response.text.split('\n')
    if len(line) > 0  # skips the empty string left after the final newline
]
INVENTORY_DF = pd.DataFrame(inventory_resl)

In [41]:
# Preview the first rows of the parsed inventory table
# (one row per station/element pair, as the output below shows).
INVENTORY_DF.head()

Unnamed: 0,ID,LATITUDE,LONGITUDE,ELEMENT,FIRSTYEAR,LASTYEAR
0,ACW00011604,17.1167,-61.7833,TMAX,1949,1949
1,ACW00011604,17.1167,-61.7833,TMIN,1949,1949
2,ACW00011604,17.1167,-61.7833,PRCP,1949,1949
3,ACW00011604,17.1167,-61.7833,SNOW,1949,1949
4,ACW00011604,17.1167,-61.7833,SNWD,1949,1949


In [42]:
# Query point used by the cells below: Toronto, as (latitude, longitude).
lat, lon = 43.700111, -79.416298

In [43]:
from datetime import date
from typing import List, Optional
import numpy as np

def get_closest_valid_station_id(
    lat: float,
    lon: float,
    dt: date,
    inventory: Optional[pd.DataFrame] = None,
) -> str:
    """
    Return the ID of the station nearest to (lat, lon) with data in dt's year.

    Parameters
    ----------
    lat, lon : float
        Query point, in decimal degrees.
    dt : date
        Only ``dt.year`` is used: an inventory row qualifies when
        ``FIRSTYEAR <= dt.year <= LASTYEAR``.
    inventory : pandas.DataFrame, optional
        Table with ID, LATITUDE, LONGITUDE, FIRSTYEAR and LASTYEAR columns.
        Defaults to the module-level ``INVENTORY_DF``. The frame is only
        read, never modified.

    Returns
    -------
    str
        ID of the qualifying row closest to the query point.

    Raises
    ------
    ValueError
        If no inventory row covers ``dt.year``.

    Notes
    -----
    Closeness is measured by the great-circle (haversine) angle. A plain
    Euclidean norm on raw degrees over-weights longitude differences away
    from the equator and breaks across the +/-180 degree meridian, so it can
    select a station that is not actually the nearest one.
    """
    if inventory is None:
        inventory = INVENTORY_DF

    year = dt.year
    covers_year = (inventory['FIRSTYEAR'] <= year) & (year <= inventory['LASTYEAR'])
    candidates = inventory.loc[covers_year]
    if candidates.empty:
        raise ValueError(f'no station in the inventory has data for {year}')

    # Vectorised haversine: one numpy pass instead of a Python call per row.
    query_lat, query_lon = np.radians(lat), np.radians(lon)
    station_lat = np.radians(candidates['LATITUDE'].to_numpy(dtype=float))
    station_lon = np.radians(candidates['LONGITUDE'].to_numpy(dtype=float))
    haversine = (
        np.sin((station_lat - query_lat) / 2.0) ** 2
        + np.cos(query_lat) * np.cos(station_lat)
        * np.sin((station_lon - query_lon) / 2.0) ** 2
    )
    # Clip guards arcsin against rounding slightly outside [0, 1].
    # The Earth radius is a constant factor and does not affect the ranking.
    central_angle = 2.0 * np.arcsin(np.sqrt(np.clip(haversine, 0.0, 1.0)))

    return candidates['ID'].iloc[int(np.argmin(central_angle))]

In [44]:
# Nearest station to the query point with inventory coverage for the year 2000.
get_closest_valid_station_id(lat=lat, lon=lon, dt=date(2000, 1, 1))

'CA006158350'

In [45]:
# Show the station-table row(s) for the ID returned by the previous cell.
STATIONS_DF.loc[STATIONS_DF['ID'] == 'CA006158350']

Unnamed: 0,ID,LATITUDE,LONGITUDE,ELEVATION,STATE,NAME,GSN FLAG,HCN/CRN FLAG,WMO ID
29662,CA006158350,43.6667,-79.4,113.0,ON,TORONTO,,,71266


In [46]:
# https://www.ncei.noaa.gov/support/access-data-service-api-user-documentation
# Fetch daily summaries for station CA006158350 between 2009-12-01 and
# 2010-01-01 and display the decoded JSON records.
toronto_response = requests.get(
    'https://www.ncei.noaa.gov/access/services/data/v1',
    # The original URL was an f-string with no placeholders; a params dict
    # yields the same query string and makes the station and dates easy to change.
    params={
        'dataset': 'daily-summaries',
        'stations': 'CA006158350',
        'startDate': '2009-12-01',
        'endDate': '2010-01-01',
        'includeAttributes': 'true',
        'format': 'json',
    },
    timeout=60,  # without a timeout a stalled connection blocks the kernel forever
)
# Fail loudly on a 4xx/5xx instead of trying to decode an error page as JSON.
toronto_response.raise_for_status()
toronto_response.json()

[{'DATE': '2009-12-01',
  'STATION': 'CA006158350',
  'SNOW': '    0',
  'PRCP_ATTRIBUTES': ',,C',
  'SNOW_ATTRIBUTES': ',,C',
  'PRCP': '    0',
  'SNWD_ATTRIBUTES': ',,C',
  'SNWD': '    0'},
 {'DATE': '2009-12-02',
  'STATION': 'CA006158350',
  'SNOW': '    0',
  'PRCP_ATTRIBUTES': ',,C',
  'SNOW_ATTRIBUTES': ',,C',
  'PRCP': '  214',
  'SNWD_ATTRIBUTES': ',,C',
  'SNWD': '    0'},
 {'DATE': '2009-12-03',
  'STATION': 'CA006158350',
  'SNOW': '    0',
  'PRCP_ATTRIBUTES': ',,C',
  'SNOW_ATTRIBUTES': ',,C',
  'PRCP': '   14',
  'SNWD_ATTRIBUTES': ',,C',
  'SNWD': '    0'},
 {'DATE': '2009-12-04',
  'STATION': 'CA006158350',
  'SNOW': '    0',
  'PRCP_ATTRIBUTES': ',,C',
  'SNOW_ATTRIBUTES': ',,C',
  'PRCP': '    0',
  'SNWD_ATTRIBUTES': ',,C',
  'SNWD': '    0'},
 {'DATE': '2009-12-05',
  'STATION': 'CA006158350',
  'SNOW': '    0',
  'PRCP_ATTRIBUTES': ',,C',
  'SNOW_ATTRIBUTES': ',,C',
  'PRCP': '    0',
  'SNWD_ATTRIBUTES': ',,C',
  'SNWD': '    0'},
 {'DATE': '2009-12-06',
  'STA