# ATMS 523 Homework 3
## Wenhan Tang (UIN: 678054985)

This notebook demonstrates how to download and analyze daily temperature data from the NOAA GHCN-Daily dataset on AWS.  
For a chosen station we extract the all-time record high and low temperature for each calendar day, compute the 1991–2020 climate normals (mean daily minimum and maximum), and plot the record, average, and actual temperature ranges for a chosen year.

### Import packages

In [1]:
import pandas as pd
import datetime as dtm
from datetime import datetime

from bokeh.plotting import figure
from bokeh.palettes import Blues4
from bokeh.models import DataRange1d
from bokeh.io import show,output_notebook

import warnings
warnings.filterwarnings("ignore")

### Function to extract a station’s `record_min_temp`, `record_max_temp`, `average_min_temp`, `average_max_temp`, and the actual temperature time series

In [2]:
def group_doy(df):
    """Group a station's daily values by a leap-year-independent day of year.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``DatetimeIndex`` and a ``DATA_VALUE`` column.

    Returns
    -------
    pandas.core.groupby.SeriesGroupBy
        ``DATA_VALUE / 10`` grouped by a "compact" day of year running from
        1 to 365: Feb 29 rows are dropped and, in leap years, every day after
        February is shifted back by one so that a calendar date always maps
        to the same key (e.g. Mar 1 is always 60).
    """
    # Feb 29 has no counterpart in ordinary years, so it is excluded.
    without_leap_day = df[~((df.index.month == 2) & (df.index.day == 29))]

    # Coerce BEFORE scaling: non-numeric entries become NaN instead of
    # raising inside the division (coercing afterwards can never trigger).
    # NOTE(review): the /10 presumably converts tenths of a degree to degrees
    # (the plot labels the result as Celsius) -- confirm against the dataset.
    ser = pd.to_numeric(without_leap_day["DATA_VALUE"], errors="coerce") / 10

    idx = ser.index
    # In leap years dayofyear is one too large after February; undo that.
    doy_compact = idx.dayofyear - ((idx.is_leap_year) & (idx.month > 2)).astype(int)
    return ser.groupby(doy_compact)

def extract_Tminmax(
    station_id,
    start_datetime_for_average = "1991-01-01",
    end_datetime_for_average = "2020-12-31"
):
    """Download one station's daily data and derive its temperature climatology.

    Parameters
    ----------
    station_id : str
        Station identifier, appended to the ``by_station/STATION=`` prefix of
        the public ``noaa-ghcn-pds`` bucket (read anonymously).
    start_datetime_for_average, end_datetime_for_average : str
        Inclusive ``YYYY-MM-DD`` bounds of the period used for the averages.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``df_clim`` -- indexed by the compact day of year produced by
        ``group_doy``, with columns ``record_min_temp``, ``record_max_temp``
        (extremes over the full record) and ``average_min_temp``,
        ``average_max_temp`` (means over the averaging period).

        ``df_real`` -- indexed by date, with columns ``actual_min_temp`` and
        ``actual_max_temp`` (``DATA_VALUE / 10`` of the TMIN / TMAX rows).

    Raises
    ------
    ValueError
        If either averaging bound is not in ``YYYY-MM-DD`` format.
    """
    df_sta = pd.read_parquet(
        "s3://noaa-ghcn-pds/parquet/by_station/STATION=" + station_id + "/",
        storage_options={"anon": True},
    )

    # Parse the YYYYMMDD strings in one vectorised call rather than running
    # strptime once per row.
    df_sta["DATE"] = pd.to_datetime(df_sta["DATE"], format="%Y%m%d")
    df_sta = df_sta.set_index("DATE").sort_index()

    df_tmax = df_sta.loc[df_sta["ELEMENT"] == "TMAX"]
    df_tmin = df_sta.loc[df_sta["ELEMENT"] == "TMIN"]

    period_start = datetime.strptime(start_datetime_for_average, "%Y-%m-%d")
    period_end = datetime.strptime(end_datetime_for_average, "%Y-%m-%d")

    def _in_period(frame):
        # Rows whose date falls inside the inclusive averaging period.
        return frame.loc[(frame.index >= period_start) & (frame.index <= period_end)]

    # NOTE(review): rows are used regardless of any quality flag the dataset
    # may carry; confirm whether flagged observations should be excluded
    # before trusting the record extremes.
    df_clim = pd.DataFrame({
        "record_min_temp": group_doy(df_tmin).min(),
        "record_max_temp": group_doy(df_tmax).max(),
        "average_min_temp": group_doy(_in_period(df_tmin)).mean(),
        "average_max_temp": group_doy(_in_period(df_tmax)).mean(),
    })
    df_real = pd.DataFrame({
        "actual_min_temp": df_tmin["DATA_VALUE"] / 10,
        "actual_max_temp": df_tmax["DATA_VALUE"] / 10,
    })
    return df_clim, df_real

### Visualization function using bokeh

In [3]:
def plot_weather(df_clim, df_real, plot_year, CityName):
    """
    Draw record, average and actual daily temperature ranges for one year.

    Adapted from https://github.com/bokeh/bokeh/tree/branch-3.9/examples/server/app/weather

    Parameters
    ----------
    df_clim : pandas.DataFrame
        Indexed by compact day of year (1-365, Feb 29 excluded) with columns
        ``record_min_temp``, ``record_max_temp``, ``average_min_temp`` and
        ``average_max_temp``.
    df_real : pandas.DataFrame
        Indexed by date with columns ``actual_min_temp`` and
        ``actual_max_temp``.
    plot_year : int
        Calendar year to display.
    CityName : str
        Label used in the plot title.

    Returns
    -------
    bokeh figure
        One bar per day for each of the three layers. Neither input frame is
        modified.
    """
    plot = figure(x_axis_type="datetime", width=800, tools="", toolbar_location=None)
    plot.title.text = "Weather for " + CityName + " in " + str(plot_year)

    year_start = datetime(plot_year, 1, 1)
    in_year = (df_real.index >= year_start) & (df_real.index < datetime(plot_year + 1, 1, 1))
    # Copy the selection so the helper columns added below are never written
    # into a slice of the caller's frame.
    df_real_plot = df_real.loc[in_year].copy()

    # Map the compact day of year back to real dates of `plot_year`.  The
    # compact numbering skips Feb 29, so in a leap year every day from Mar 1
    # (compact day 60) onwards sits one calendar day later than a plain
    # day-of-year parse would place it.
    df_clim_plot = df_clim.copy()
    doy = df_clim_plot.index.to_numpy().astype(int)
    day_offset = doy - 1
    if pd.Timestamp(year_start).is_leap_year:
        day_offset = day_offset + (doy >= 60)
    df_clim_plot.index = pd.Timestamp(year_start) + pd.to_timedelta(day_offset, unit="D")

    # Each bar spans half a day either side of its date.
    half_day = dtm.timedelta(days=0.5)
    for frame in (df_clim_plot, df_real_plot):
        frame["left"] = frame.index - half_day
        frame["right"] = frame.index + half_day

    plot.quad(
        top='record_max_temp', bottom='record_min_temp', left='left', right='right',
        color=Blues4[2], source=df_clim_plot, legend_label="Record"
    )
    plot.quad(
        top='average_max_temp', bottom='average_min_temp', left='left', right='right',
        color=Blues4[1], source=df_clim_plot, legend_label="Average"
    )
    plot.quad(
        top='actual_max_temp', bottom='actual_min_temp', left='left', right='right',
        color=Blues4[0], alpha=0.5, line_color="black", source=df_real_plot,
        legend_label="Actual"
    )

    plot.xaxis.axis_label = None
    plot.yaxis.axis_label = "Temperature (˚C)"
    plot.axis.axis_label_text_font_style = "bold"
    plot.x_range = DataRange1d(range_padding=0.0)
    plot.grid.grid_line_alpha = 0.3
    return plot

### Access station info

In [4]:
# Station metadata: one row per station (fixed-width text, column edges
# inferred from the first 1000 rows).
stn_ids = pd.read_fwf(
    'http://noaa-ghcn-pds.s3.amazonaws.com/ghcnd-stations.txt',
    header=None,
    infer_nrows=1000,
).set_axis(['ID', 'LAT', 'LON', 'ELEV', 'UKN', 'NAME', 'GSN', 'WBAN'], axis=1)

# Inventory: one row per station and element.
# NOTE(review): TiMIN / TiMAX look like the first and last year of record for
# that element -- confirm against the GHCN-Daily readme.
periods = pd.read_fwf(
    'http://noaa-ghcn-pds.s3.amazonaws.com/ghcnd-inventory.txt',
    header=None,
    infer_nrows=1000,
).set_axis(['ID', 'LAT', 'LON', 'ELEM', 'TiMIN', 'TiMAX'], axis=1)

# Attach the inventory to every station (LAT/LON exist in both tables and get
# the _x/_y suffixes), then keep only TMAX entries whose TiMIN..TiMAX range
# covers 1991 through 2020.
merged_stns = stn_ids.merge(periods, how='left', on='ID').loc[
    lambda d: (d['ELEM'] == 'TMAX') & (d['TiMAX'] >= 2020) & (d['TiMIN'] <= 1991)
]

# Smallest TiMIN first.
merged_stns_sorted = merged_stns.sort_values(by='TiMIN')

### City and year for visualization
Set `CityName` and `plot_year` to choose the city and year you want to plot.

In [5]:
# Text searched for inside the station NAME column, e.g. "CHAMPAIGN" or "MADISON".
CityName = "MADISON"

# Calendar year whose observed temperatures are plotted.
plot_year = 2019

### Stations available

In [6]:
# Stations whose NAME contains the requested city as a plain substring.
# case=False lets "Madison" match the same rows as "MADISON"; na=False treats
# a missing NAME as "no match" instead of producing an NA mask that cannot be
# used for boolean indexing.
city_stations = merged_stns_sorted[
    merged_stns_sorted['NAME'].str.contains(CityName, case=False, regex=False, na=False)
]
assert len(city_stations) > 0, "Couldn't find data for " + CityName + ", please try another city."
city_stations

Unnamed: 0,ID,LAT_x,LON_x,ELEV,UKN,NAME,GSN,WBAN,LAT_y,LON_y,ELEM,TiMIN,TiMAX
492063,USC00133007,40.6222,-91.3336,160.6,IA,FT MADISON,,,40.6222,-91.3336,TMAX,1893.0,2025.0
515370,USC00174927,44.7975,-69.8889,67.1,ME,MADISON,,,44.7975,-69.8889,TMAX,1894.0,2025.0
682348,USC00465563,38.1025,-81.8464,210.0,WV,MADISON 3NNW,,,38.1025,-81.8464,TMAX,1894.0,2025.0
552751,USC00246157,45.4875,-111.6336,1446.3,MT,NORRIS MADISON PWR HOUSE,,,45.4875,-111.6336,TMAX,1907.0,2025.0
737065,USW00014837,43.1406,-89.3453,261.8,WI,MADISON DANE CO RGNL AP,,72641.0,43.1406,-89.3453,TMAX,1939.0,2025.0
533780,USC00214994,45.0022,-96.1661,327.7,MN,MADISON WWTP,,,45.0022,-96.1661,TMAX,1940.0,2025.0
652348,USC00415477,30.9392,-95.9203,76.8,TX,MADISONVILLE,,,30.9392,-95.9203,TMAX,1942.0,2025.0
506778,USC00155067,37.3475,-87.5239,129.5,KY,MADISONVILLE,,,37.3475,-87.5239,TMAX,1948.0,2025.0
635026,USC00395090,43.9906,-97.0922,500.8,SD,MADISON 2SE,,,43.9906,-97.0922,TMAX,1961.0,2025.0
684677,USC00470273,43.0411,-89.4286,265.2,WI,UW ARBORETUM - MADISON,,,43.0411,-89.4286,TMAX,1971.0,2025.0


### Select a station and get the temperature data

In [7]:
# city_stations is ordered by ascending TiMIN, so the last row is the match
# with the largest TiMIN.
station_id = city_stations["ID"].iloc[-1]
df_clim, df_real = extract_Tminmax(station_id)
df_clim

Unnamed: 0_level_0,record_min_temp,record_max_temp,average_min_temp,average_max_temp
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,-26.7,8.3,-12.095238,-2.833333
2,-28.3,10.0,-11.948148,-1.729630
3,-29.4,11.7,-11.344828,-1.496552
4,-27.2,9.4,-10.578571,-1.514286
5,-33.3,9.4,-12.472414,-1.975862
...,...,...,...,...
361,-25.0,10.6,-8.789655,0.724138
362,-25.0,11.7,-9.500000,0.113793
363,-27.2,17.2,-9.031034,0.520690
364,-25.6,13.3,-10.129630,0.385185


### Visualization

In [8]:
# Route bokeh output to the notebook, then build and display the figure.
output_notebook()
plot = plot_weather(df_clim, df_real, plot_year=plot_year, CityName=CityName)
show(plot)