# 01 — Pandas `groupby` + Datetime on GHCN Parquet
This exercise loads the Parquet produced by **Notebook 02** and demonstrates standard `groupby` + datetime analyses.
You can read the file locally or, after you push it to GitHub, from the repository's **raw HTTPS** URL in the cloud.

In [None]:
import pandas as pd, numpy as np
from pathlib import Path

# Primary local artifact produced by 02_fetch_ghcn_il_to_parquet.ipynb.
# This is a relative path, so it is resolved against the current working directory.
LOCAL_PARQUET = '../data/ghcn_il_top4_daily.parquet'

# After pushing to GitHub, set this to your repo's raw URL to read from the cloud.
# While it is None (falsy), read_cloud_first skips the cloud attempt and reads LOCAL_PARQUET.
CLOUD_PARQUET = None  # e.g., 'https://raw.githubusercontent.com/USER/REPO/main/data/ghcn_il_top4_daily.parquet'

def read_cloud_first(cloud_url, local_fallback):
    """Load the Parquet dataset, preferring the cloud copy when one is configured.

    Parameters
    ----------
    cloud_url : str or None
        Location of the remote Parquet file. Any falsy value (``None``, ``''``)
        skips the cloud attempt entirely.
    local_fallback : str
        Path of the local Parquet file, read when no cloud URL is given or the
        cloud read fails.

    Returns
    -------
    pandas.DataFrame
        The table read from the cloud URL if that succeeded, otherwise the
        table read from ``local_fallback``.

    Raises
    ------
    Exception
        Whatever ``pd.read_parquet`` raises for ``local_fallback``. Cloud
        failures are never propagated; they are reported and trigger the
        local fallback instead.
    """
    if cloud_url:
        try:
            df = pd.read_parquet(cloud_url)  # needs pyarrow/fastparquet
        except Exception as e:
            # Deliberately broad: the cloud read is best-effort, and any failure
            # here should fall through to the local file rather than abort.
            print('Cloud read failed → using local:', type(e).__name__, str(e)[:120])
        else:
            print('Loaded from cloud:', cloud_url)
            return df

    # Announce the local source only after the read has actually succeeded, so
    # a missing or unreadable local file is not reported as loaded.
    df = pd.read_parquet(local_fallback)
    print('Loaded local:', local_fallback)
    return df

# Load the table (cloud copy if configured, otherwise the local file), then
# order the rows by ID and DATE and renumber the index from 0.
df = read_cloud_first(CLOUD_PARQUET, LOCAL_PARQUET)
df = df.sort_values(by=['ID', 'DATE'])
df = df.reset_index(drop=True)
df.dtypes

## 1) Add datetime helpers

In [None]:
# Preview the calendar year of every row. DATE is passed through pd.to_datetime
# first, as the other cells in this notebook do, so the .dt accessor also works
# if the Parquet file stored DATE as strings.
pd.to_datetime(df['DATE']).dt.year

In [None]:
# Parse DATE a single time; every helper column below derives from this Series.
date_parsed = pd.to_datetime(df['DATE'])

df['year'] = date_parsed.dt.year          # calendar year
df['month'] = date_parsed.dt.month        # month number within the year
df['ym'] = date_parsed.dt.to_period('M')  # monthly Period (year-month)
df.head()

## 2) Monthly means & totals by station

In [None]:
# Collapse the daily rows to one row per ID and year-month (ym):
# TMIN and TMAX are averaged, PRCP is totalled.
monthly = df.groupby(['ID', 'ym'], as_index=False).agg(
    {'TMIN': 'mean', 'TMAX': 'mean', 'PRCP': 'sum'}
)

# Wide table of the monthly PRCP totals: one row per year-month, one column per ID.
monthly_piv = monthly.pivot(values='PRCP', index='ym', columns='ID')
# To inspect the results: monthly.head(), monthly_piv.head()

In [None]:
# Plot the monthly PRCP totals over time: x-axis is the year-month index,
# one series per ID column of the pivot table.
monthly_piv.plot()

**Try it:** Compute monthly *median* `TMAX` by station.

In [None]:
# Median TMAX for every ID within every year-month (ym).
monthly_median = df.groupby(['ID', 'ym'], as_index=False).agg(
    TMAX=('TMAX', 'median')
)
monthly_median

In [None]:
# Annual (not monthly) median TMAX per ID. It is stored under its own name so
# it does not overwrite the year-month table held in `monthly_median`.
annual_median = df.groupby(['ID', 'year'], as_index=False)['TMAX'].median()

# NOTE: the pivot keeps its existing variable name; its contents are the annual
# medians (one row per year, one column per ID).
monthlycomposite_piv = annual_median.pivot(index='year', columns='ID', values='TMAX')
monthlycomposite_piv.plot()

## 3) Annual precipitation totals and rankings

Rank the stations within each year by their annual precipitation totals (rank 1 = largest total).

In [None]:
# Total PRCP per ID per year.
annual_prcp = (
    df.groupby(['ID', 'year'], as_index=False)['PRCP']
      .sum()
      .rename(columns={'PRCP': 'annual_prcp_mm'})
)

# Rank the IDs against each other inside each year: rank 1 is the largest total,
# and method='min' gives tied totals the same (lowest) rank.
totals_by_year = annual_prcp.groupby('year')['annual_prcp_mm']
annual_prcp['rank_within_year'] = totals_by_year.rank(method='min', ascending=False)
annual_prcp.sort_values(by=['year', 'rank_within_year'])

**Try it:** For each year, rank the stations by their mean daily `TMAX` to find the warmest one.

In [None]:
# Keep only the rows that report TMAX, then pick the single highest TMAX row
# within each year. Note that this selects the highest individual value; it is
# not a ranking by mean TMAX.
df_nonan = df[df['TMAX'].notna()].copy()
df_nonan['DATE'] = pd.to_datetime(df_nonan['DATE'])
df_nonan['year'] = df_nonan['DATE'].dt.year

# Index label of the maximum-TMAX row for every year.
idx = df_nonan.groupby('year')['TMAX'].idxmax()

# Look those rows up, relabel the value column, and order the result by year.
warmest_by_abs = df_nonan.loc[idx, ['year', 'ID', 'DATE', 'TMAX']]
warmest_by_abs = warmest_by_abs.rename(columns={'TMAX': 'tmax_abs_c'})
warmest_by_abs = warmest_by_abs.sort_values('year').reset_index(drop=True)

warmest_by_abs


## 4) Station-by-month climatology (using whatever is present)

In [None]:
# Mean TMAX for each ID and calendar month, pooled over every year in the data.
climo = df.groupby(['ID', 'month'], as_index=False).agg(TMAX=('TMAX', 'mean'))

# Wide layout: one row per month, one column per ID.
climo_piv = climo.pivot(values='TMAX', index='month', columns='ID')
climo_piv

**Try it:** Compute a monthly precipitation climatology (sum `PRCP` within each station-month, then average those monthly totals across years).

In [None]:
# Step 1: total PRCP for every ID × year × month combination.
monthly_sums = df.groupby(['ID', 'year', 'month'], as_index=False).agg(
    prcp_month_sum_mm=('PRCP', 'sum')
)

# Step 2: climatology = average of those monthly totals over the years,
# giving one value per ID × calendar month.
climo_prcp = monthly_sums.groupby(['ID', 'month'], as_index=False).agg(
    prcp_month_climo_mm=('prcp_month_sum_mm', 'mean')
)

# Wide layout: one row per month, one column per ID.
climo_pivot = climo_prcp.pivot(values='prcp_month_climo_mm', index='month', columns='ID')
climo_pivot