In [1]:
import requests

In [2]:
# https://www.ncei.noaa.gov/support/access-data-service-api-user-documentation
# Query parameters are passed as a dict so requests builds and encodes the
# query string; the timeout stops the cell from hanging on a stalled
# connection and raise_for_status() reports HTTP errors before .json()
# is attempted on an error page.
daily_summaries_response = requests.get(
    'https://www.ncei.noaa.gov/access/services/data/v1',
    params={
        'dataset': 'daily-summaries',
        'stations': 'AGE00147708',
        'startDate': '2009-12-01',
        'endDate': '2010-01-01',
        'includeAttributes': 'true',
        'format': 'json',
    },
    timeout=60,
)
daily_summaries_response.raise_for_status()
daily_summaries_response.json()

[{'DATE': '2009-12-01',
  'STATION': 'AGE00147708',
  'TAVG_ATTRIBUTES': 'H,,S',
  'PRCP_ATTRIBUTES': ',,S',
  'TMAX': '  145',
  'TAVG': '   99',
  'TMAX_ATTRIBUTES': ',,S',
  'TMIN': '   70',
  'PRCP': '  269',
  'TMIN_ATTRIBUTES': ',,S'},
 {'DATE': '2009-12-02',
  'STATION': 'AGE00147708',
  'TAVG_ATTRIBUTES': 'H,,S',
  'PRCP_ATTRIBUTES': ',,S',
  'TMAX': '  130',
  'TAVG': '   96',
  'TMAX_ATTRIBUTES': ',,S',
  'TMIN': '   75',
  'PRCP': '   41',
  'TMIN_ATTRIBUTES': ',,S'},
 {'DATE': '2009-12-03',
  'STATION': 'AGE00147708',
  'TAVG_ATTRIBUTES': 'H,,S',
  'PRCP_ATTRIBUTES': ',,S',
  'TMAX': '  185',
  'TAVG': '  111',
  'TMAX_ATTRIBUTES': ',,S',
  'TMIN': '   73',
  'PRCP': '    5',
  'TMIN_ATTRIBUTES': ',,S'},
 {'DATE': '2009-12-04',
  'STATION': 'AGE00147708',
  'TAVG_ATTRIBUTES': 'H,,S',
  'PRCP_ATTRIBUTES': ',,S',
  'TMAX': '  162',
  'TAVG': '  124',
  'TMAX_ATTRIBUTES': ',,S',
  'PRCP': '  330'},
 {'DATE': '2009-12-05',
  'STATION': 'AGE00147708',
  'TAVG_ATTRIBUTES': 'H,,S'

In [32]:
import pandas as pd

# https://www.ncdc.noaa.gov/homr/
STATIONS_URL = 'https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-stations.txt'
def parse_ghcnd_stations_line(l: str):
    """
    Parse one fixed-width record of "ghcnd-stations.txt" into a dict.

    The column layout is described in [IV. FORMAT OF "ghcnd-stations.txt"] of
    https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt

    :param l: a single record. A trailing carriage return and/or newline is
        ignored, so records taken from a CRLF payload split on a newline
        still parse.
    :return: dict with keys ID, LATITUDE, LONGITUDE, ELEVATION, STATE, NAME,
        GSN FLAG, HCN/CRN FLAG and WMO ID. LATITUDE, LONGITUDE and ELEVATION
        are floats; every other value is the raw column slice, so it keeps
        its fixed-width space padding.
    :raises ValueError: if the record is not exactly 85 characters wide (the
        original line is the exception argument) or a numeric column cannot
        be converted to float.
    """
    # Only line terminators are removed: trailing spaces are part of the
    # fixed-width layout and must still count towards the 85 columns.
    record = l.rstrip('\r\n')
    if len(record) != 85:
        raise ValueError(l)
    return {
        'ID': record[0:11],
        'LATITUDE': float(record[12:20]),
        'LONGITUDE': float(record[21:30]),
        'ELEVATION': float(record[31:37]),
        'STATE': record[38:40],
        'NAME': record[41:71],
        'GSN FLAG': record[72:75],
        'HCN/CRN FLAG': record[76:79],
        'WMO ID': record[80:85],
    }

# Download the station list and parse every non-empty record.
# The timeout stops the cell from hanging on a stalled connection, and
# raise_for_status() reports HTTP errors here instead of letting an error
# page reach the fixed-width parser.
stations_response = requests.get(STATIONS_URL, timeout=60)
stations_response.raise_for_status()
stations_resl = [
    parse_ghcnd_stations_line(line)
    # Drop a possible carriage return left over from CRLF line endings.
    for line in (raw.rstrip('\r') for raw in stations_response.text.split('\n'))
    if len(line) > 0
]
STATIONS_DF = pd.DataFrame(stations_resl)

In [33]:
# Preview the first rows of the parsed station table.
STATIONS_DF.head()

Unnamed: 0,ID,LATITUDE,LONGITUDE,ELEVATION,STATE,NAME,GSN FLAG,HCN/CRN FLAG,WMO ID
0,ACW00011604,17.1167,-61.7833,10.1,,ST JOHNS COOLIDGE FLD,,,
1,ACW00011647,17.1333,-61.7833,19.2,,ST JOHNS,,,
2,AE000041196,25.333,55.517,34.0,,SHARJAH INTER. AIRP,GSN,,41196.0
3,AEM00041194,25.255,55.364,10.4,,DUBAI INTL,,,41194.0
4,AEM00041217,24.433,54.651,26.8,,ABU DHABI INTL,,,41217.0


In [40]:
INVENTORY_URL = 'https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-inventory.txt'
def parse_ghcnd_inventory_line(l: str):
    """
    Parse one fixed-width record of "ghcnd-inventory.txt" into a dict.

    The column layout is described in [VII. FORMAT OF "ghcnd-inventory.txt"] of
    https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt

    :param l: a single record. A trailing carriage return and/or newline is
        ignored, so records taken from a CRLF payload split on a newline
        still parse.
    :return: dict with keys ID, LATITUDE, LONGITUDE, ELEMENT, FIRSTYEAR and
        LASTYEAR. LATITUDE and LONGITUDE are floats, FIRSTYEAR and LASTYEAR
        are ints, ID and ELEMENT are the raw column slices.
    :raises ValueError: if the record is not exactly 45 characters wide (the
        original line is the exception argument) or a numeric column cannot
        be converted.
    """
    # Only line terminators are removed; the 45-column width is still enforced.
    record = l.rstrip('\r\n')
    if len(record) != 45:
        raise ValueError(l)
    return {
        'ID': record[0:11],
        'LATITUDE': float(record[12:20]),
        'LONGITUDE': float(record[21:30]),
        'ELEMENT': record[31:35],
        'FIRSTYEAR': int(record[36:40]),
        'LASTYEAR': int(record[41:45]),
    }

# Download the element inventory and parse every non-empty record.
# The timeout stops the cell from hanging on a stalled connection, and
# raise_for_status() reports HTTP errors here instead of letting an error
# page reach the fixed-width parser.
inventory_response = requests.get(INVENTORY_URL, timeout=60)
inventory_response.raise_for_status()
inventory_resl = [
    parse_ghcnd_inventory_line(line)
    # Drop a possible carriage return left over from CRLF line endings.
    for line in (raw.rstrip('\r') for raw in inventory_response.text.split('\n'))
    if len(line) > 0
]
INVENTORY_DF = pd.DataFrame(inventory_resl)

In [41]:
# Preview the first rows of the parsed inventory table.
INVENTORY_DF.head()

Unnamed: 0,ID,LATITUDE,LONGITUDE,ELEMENT,FIRSTYEAR,LASTYEAR
0,ACW00011604,17.1167,-61.7833,TMAX,1949,1949
1,ACW00011604,17.1167,-61.7833,TMIN,1949,1949
2,ACW00011604,17.1167,-61.7833,PRCP,1949,1949
3,ACW00011604,17.1167,-61.7833,SNOW,1949,1949
4,ACW00011604,17.1167,-61.7833,SNWD,1949,1949


In [47]:
# Distinct element codes that occur anywhere in the inventory.
INVENTORY_DF['ELEMENT'].unique()

array(['TMAX', 'TMIN', 'PRCP', 'SNOW', 'SNWD', 'PGTM', 'WDFG', 'WSFG',
       'WT03', 'WT08', 'WT16', 'TAVG', 'DAPR', 'MDPR', 'TOBS', 'WT01',
       'WT04', 'WT05', 'WDMV', 'WT06', 'WT11', 'ACMH', 'ACSH', 'AWND',
       'DAEV', 'DAWM', 'EVAP', 'FMTM', 'MDEV', 'MDWM', 'MNPN', 'MXPN',
       'PSUN', 'TSUN', 'WDF1', 'WDF2', 'WDF5', 'WDFM', 'WSF1', 'WSF2',
       'WSF5', 'WSFM', 'WT10', 'WT13', 'WT14', 'WV20', 'DWPR', 'DATX',
       'MDTX', 'DATN', 'MDTN', 'WESD', 'WT07', 'WT09', 'WT18', 'WT02',
       'WT12', 'WT17', 'WT22', 'WT19', 'THIC', 'MDSF', 'WESF', 'FRGB',
       'FRGT', 'GAHT', 'WT21', 'WT15', 'WV01', 'WV03', 'WSFI', 'DASF',
       'SN01', 'SN02', 'SN03', 'SX01', 'SX02', 'SX03', 'SN32', 'SX32',
       'SX22', 'SN52', 'SX52', 'SN11', 'SN12', 'SN13', 'SN51', 'SX11',
       'SX12', 'SX13', 'SX51', 'SN31', 'SX31', 'SN33', 'SN35', 'SX33',
       'SX35', 'SN53', 'SN55', 'SX53', 'SX55', 'SN54', 'SX54', 'SN34',
       'SN36', 'SX34', 'SX36', 'SN22', 'SN72', 'SN81', 'SN82', 'SN83',
      

In [42]:
# Query point used by the cells below: Toronto, as (latitude, longitude).
lat, lon = 43.700111, -79.416298

In [66]:
from datetime import date
from typing import List, Optional
import numpy as np

def get_closest_valid_station_id(
    lat: float,
    lon: float,
    dt: date,
    inventory: Optional[pd.DataFrame] = None,
    required_elements: Optional[List[str]] = None,
) -> str:
    """
    Return the ID of the station nearest to (lat, lon) that reports every
    required element in the year of ``dt``.

    A station qualifies when, for each required element, the inventory has a
    row whose FIRSTYEAR..LASTYEAR range contains ``dt.year``. Distance is the
    great-circle (haversine) distance. A plain Euclidean norm on degrees
    over-weights longitude away from the equator and breaks across the
    +/-180 degree meridian, so it can pick the wrong station.

    :param lat: latitude of the query point, in decimal degrees.
    :param lon: longitude of the query point, in decimal degrees.
    :param dt: date of interest; only its year is used.
    :param inventory: frame with columns ID, LATITUDE, LONGITUDE, ELEMENT,
        FIRSTYEAR and LASTYEAR. Defaults to the module-level INVENTORY_DF.
    :param required_elements: element codes a station must report.
        Defaults to TMAX, TMIN and PRCP.
    :return: the ID of the closest qualifying station.
    :raises IndexError: if no station qualifies.
    """
    if inventory is None:
        inventory = INVENTORY_DF
    required = frozenset(
        ['TMAX', 'TMIN', 'PRCP'] if required_elements is None else required_elements
    )

    # Keep only the (station, element) rows whose coverage includes the year.
    in_year = inventory.loc[
        (inventory['FIRSTYEAR'] <= dt.year) & (dt.year <= inventory['LASTYEAR'])
    ]

    if required:
        # A station is valid when it has as many distinct required elements
        # in that year as were asked for.
        relevant = in_year.loc[in_year['ELEMENT'].isin(required)]
        covered = relevant.groupby('ID')['ELEMENT'].nunique()
        valid_ids = covered[covered == len(required)].index
        in_year = in_year.loc[in_year['ID'].isin(valid_ids)]

    # One row per station; the first row supplies its coordinates.
    candidates = in_year.drop_duplicates('ID')
    if candidates.empty:
        raise IndexError(
            f'no station reports {sorted(required)} in {dt.year}'
        )

    # Vectorised haversine distance from the query point to every candidate.
    earth_radius_km = 6371.0088
    lat_q, lon_q = np.radians(lat), np.radians(lon)
    lat_s = np.radians(candidates['LATITUDE'].to_numpy(dtype=float))
    lon_s = np.radians(candidates['LONGITUDE'].to_numpy(dtype=float))
    half_chord_sq = (
        np.sin((lat_s - lat_q) / 2.0) ** 2
        + np.cos(lat_q) * np.cos(lat_s) * np.sin((lon_s - lon_q) / 2.0) ** 2
    )
    # Clip guards against rounding pushing the value just outside [0, 1].
    distances_km = 2.0 * earth_radius_km * np.arcsin(
        np.sqrt(np.clip(half_chord_sq, 0.0, 1.0))
    )

    return candidates['ID'].iloc[int(np.argmin(distances_km))]

In [67]:
# Nearest station to the Toronto query point with coverage in the year 2000.
get_closest_valid_station_id(lat=lat, lon=lon, dt=date(2000, 1, 1))

'CA006158350'

In [69]:
# Inventory rows (element coverage) for station CA006158350.
INVENTORY_DF.loc[INVENTORY_DF['ID'] == 'CA006158350']

Unnamed: 0,ID,LATITUDE,LONGITUDE,ELEMENT,FIRSTYEAR,LASTYEAR
104323,CA006158350,43.6667,-79.4,TMAX,1840,2003
104324,CA006158350,43.6667,-79.4,TMIN,1840,2003
104325,CA006158350,43.6667,-79.4,PRCP,1840,2017
104326,CA006158350,43.6667,-79.4,SNOW,1840,2017
104327,CA006158350,43.6667,-79.4,SNWD,1955,2017
104328,CA006158350,43.6667,-79.4,MDPR,1842,1844


In [70]:
# Station metadata row for station CA006158350.
STATIONS_DF.loc[STATIONS_DF['ID'] == 'CA006158350']

Unnamed: 0,ID,LATITUDE,LONGITUDE,ELEVATION,STATE,NAME,GSN FLAG,HCN/CRN FLAG,WMO ID
29662,CA006158350,43.6667,-79.4,113.0,ON,TORONTO,,,71266


In [71]:
# https://www.ncei.noaa.gov/support/access-data-service-api-user-documentation
# The original URL was an f-string with no placeholders. The query is now
# passed as a params dict so requests builds and encodes it. The timeout stops
# the cell from hanging, and raise_for_status() reports HTTP errors before
# .json() is attempted on an error page.
toronto_summaries_response = requests.get(
    'https://www.ncei.noaa.gov/access/services/data/v1',
    params={
        'dataset': 'daily-summaries',
        'stations': 'CA006158350',
        'startDate': '1999-12-01',
        'endDate': '2000-01-01',
        'includeAttributes': 'true',
        'format': 'json',
    },
    timeout=60,
)
toronto_summaries_response.raise_for_status()
toronto_summaries_response.json()

[{'DATE': '1999-12-01',
  'STATION': 'CA006158350',
  'SNOW': '    0',
  'PRCP_ATTRIBUTES': ',,C',
  'TMAX': '   36',
  'SNOW_ATTRIBUTES': ',,C',
  'TMAX_ATTRIBUTES': ',,C',
  'TMIN': '  -42',
  'PRCP': '    0',
  'SNWD_ATTRIBUTES': ',,C',
  'SNWD': '    0',
  'TMIN_ATTRIBUTES': ',,C'},
 {'DATE': '1999-12-02',
  'STATION': 'CA006158350',
  'SNOW': '    0',
  'PRCP_ATTRIBUTES': 'T,,C',
  'TMAX': '   86',
  'SNOW_ATTRIBUTES': ',,C',
  'TMAX_ATTRIBUTES': ',,C',
  'TMIN': '   10',
  'PRCP': '    0',
  'SNWD_ATTRIBUTES': ',,C',
  'SNWD': '    0',
  'TMIN_ATTRIBUTES': ',,C'},
 {'DATE': '1999-12-03',
  'STATION': 'CA006158350',
  'SNOW': '    0',
  'PRCP_ATTRIBUTES': ',,C',
  'TMAX': '  106',
  'SNOW_ATTRIBUTES': ',,C',
  'TMAX_ATTRIBUTES': ',,C',
  'TMIN': '   52',
  'PRCP': '   36',
  'SNWD_ATTRIBUTES': ',,C',
  'SNWD': '    0',
  'TMIN_ATTRIBUTES': ',,C'},
 {'DATE': '1999-12-04',
  'STATION': 'CA006158350',
  'SNOW': '    0',
  'PRCP_ATTRIBUTES': ',,C',
  'TMAX': '  134',
  'SNOW_ATTRIBUTE