# 2024 Public Service Data Challenge test notebook

`EN` Project name : Map air pollution data and other key indicators to improve policymaking
               AKA "AirTIME – Air trend information for managing exposure"

`FR` Nom du projet : Cartographie des données de pollution de l'air et autres indicateurs pour l'aide à l'élaboration des politiques

Team members / Équipe :
- [Rita So](mailto:rita.so@ec.gc.ca) Team Lead / Gestionnaire de projet
- [Zoe Davis](mailto:zoe.davis@ec.gc.ca) Deputy Team Lead / Gestionnaire de projet adjointe
- [Kumari Gurusami](mailto:kumari.gurusamy@nrcan-rncan.gc.ca) NRCan / RNCan
- [Ke Gai](mailto:ke.gai@ec.gc.ca) ECCC
- [Andrea Ghazzawi](mailto:andrea.ghazzawi@tpsgc-pwgsc.gc.ca) TPSGC / PWGSC
- [Charles Ryan Haynes](mailto:charlesryan.haynes@ec.gc.ca) ECCC
- [Nicole Johnson](mailto:nicole.johnson2@agr.gc.ca) AGR
- [Yves Moisan](mailto:yves.moisan@ec.gc.ca) ECCC -> NRCan / RNCan (01/04/2024)


<div class="alert alert-block alert-info"><b>The challenge for me :</b><br>"My understanding is that the datasets (<b>ambient monitoring</b>, <b>emission data</b>, <b>AQ modelling</b>) are not present in one consolidated location online for ECCC access."</div>

In other words: **there are data silos**.

I believe that once the data-access problem is out of the way, half of the overall problem (understanding the data well enough to inform decision-making) will be solved.


# Just to show we can embed equations in a notebook
\begin{equation}
e^x=\sum_{i=0}^\infty \frac{1}{i!}x^i
\end{equation}

TODO: map

In [None]:
import datetime as dt
from datetime import date
import numpy as np
import pandas as pd
import geopandas as gpd
import polars as pl
import polars.selectors as cs
import pyarrow as pa
from deltalake import DeltaTable  # S3FileSystem ??
from deltalake.writer import write_deltalake
from great_tables import GT, html, md
from great_tables.data import islands
from tabulate import tabulate
from tqdm.notebook import tqdm
import hvplot.pandas # Won't be needed hopefully; we'll be using Polars
import hvplot.polars

In [None]:
from great_tables import GT, md, html, style, loc
from great_tables.data import airquality, islands

In [None]:
# First ten daily records of the bundled air-quality sample, stamped with the
# survey year so the table can show a full Year/Month/Day triple.
airquality_mini = airquality.head(10).assign(Year=1973)

# Column groups: they drive both the column selection and the spanners below.
time_columns = ["Year", "Month", "Day"]
measurement_columns = ["Ozone", "Solar_R", "Wind", "Temp"]

pl_airquality = pl.DataFrame(airquality_mini).select(
    time_columns + measurement_columns
)
gt_air = GT(pl_airquality)

# HTML column labels carrying the unit on a second line.
column_labels = {
    "Ozone": html("Ozone,<br>ppbV"),
    "Solar_R": html("Solar R.,<br>cal/m<sup>2</sup>"),
    "Wind": html("Wind,<br>mph"),
    "Temp": html("Temp,<br>&deg;F"),
}

# Row predicate: the row(s) whose Wind value equals the column maximum.
windiest_rows = pl.col("Wind") == pl.col("Wind").max()

# Header, then spanners, then labels; styling is applied last.
titled = gt_air.tab_header(
    title="New York Air Quality Measurements",
    subtitle="Daily measurements in New York City (May 1-10, 1973)",
)
grouped = (
    titled
    .tab_spanner(label="Time", columns=time_columns)
    .tab_spanner(label="Measurement", columns=measurement_columns)
)
labelled = grouped.cols_label(**column_labels)

# Final expression of the cell: the styled table is what gets rendered.
(
    labelled
    # Highlight the whole windiest row ...
    .tab_style(
        style.fill("lightyellow"),
        loc.body(columns=cs.all(), rows=windiest_rows),
    )
    # ... and embolden its Wind cell.
    .tab_style(
        style.text(weight="bold"),
        loc.body("Wind", windiest_rows),
    )
)

In [None]:
# Show the type of both frames. Only the last bare expression of a cell is
# echoed, so the first type() result used to be discarded; print both instead.
print(type(airquality_mini))
print(type(pl_airquality))

# AQHI observation communities

In [None]:
# Load the AQHI community GeoJSON file with Polars' JSON reader.
# NOTE(review): GeoJSON nests its records under "features"; geopandas (imported
# in this notebook as gpd) may be a better fit for geometry-aware use -- confirm.
AQHI_communities_df = pl.read_json("aqhi_community.geojson")
# Candidate station-observation source. It is a CSV, so it would need
# pl.read_csv rather than pl.read_json:
# url = "https://dd.weather.gc.ca/air_quality/aqhi/atl/observation/monthly/csv/202302_MONTHLY_AQHI_ATL_SiteObs_BACKFILLED.csv"
# AQHI_stations_df = pl.read_csv(url)


# Air Pollutant Emission Inventory
[Web page](https://data-donnees.az.ec.gc.ca/data/substances/monitor/canada-s-air-pollutant-emissions-inventory/)

In [None]:
# Lazily scan the local APEI CSV; the data is only materialised when
# .collect() is called on this object (done in the next cell).
EN_APEI_Can_Prov_Terr = pl.scan_csv("EN_APEI-Can-Prov_Terr.csv")

In [None]:
# Materialise the lazy scan into an in-memory frame and display it.
APEI = EN_APEI_Can_Prov_Terr.collect()
APEI

In [None]:
# Alberta-only slice of the inventory (Region == "AB"), ordered by emission
# source in ascending order. No date window is applied to this query.
APEI_query = APEI.filter(pl.col("Region") == "AB").sort("Source", descending=False)


In [None]:
# Display the filtered and sorted inventory as the cell output.
APEI_query

In [None]:
# Unit shown in the y-axis label. Still a placeholder: it should be looked up
# from the data if it can vary.
uom = "??"

# One line per emission source: TPM (t) against Year for the Alberta subset.
plot = APEI_query.hvplot.line(
    x="Year",
    y="TPM (t)",
    by="Source",
    title="TPM (t) values for Alberta",
)

# Axis labels, set in a single opts() call.
plot.opts(ylabel=f"TPM (t) units = {uom}", xlabel="Year")
plot

# AQHI observations
[Atlantic realtime observation web page](https://dd.weather.gc.ca/air_quality/aqhi/atl/observation/realtime/csv/)

In [None]:
# Eagerly read the Atlantic site-observation CSV. try_parse_dates=True asks
# Polars to infer date columns; the author observed that "2024-03-10" came back
# as an actual date object rather than a string.
# Lazy alternative kept for reference:
# AQHI_ATL_obs = pl.scan_csv("2024031007_AQHI_ATL_SiteObs.csv")
AQHI_ATL_obs_guessed_date = pl.read_csv("2024031007_AQHI_ATL_SiteObs.csv", try_parse_dates=True)

In [None]:
# AQHI_ATL_obs

In [None]:
# Display the observation frame (with parsed dates) as the cell output.
AQHI_ATL_obs_guessed_date

In [None]:
# Date window for the observations; also reused in the plot title further down.
start_date, end_date = date(2024, 3, 4), date(2024, 3, 8)

# Keep the rows whose Date falls inside the window (is_between's default bound
# handling), newest first.
basic_query_read = AQHI_ATL_obs_guessed_date.filter(
    pl.col("Date").is_between(start_date, end_date)
).sort("Date", descending=True)


In [None]:
# Display the date-filtered observations as the cell output.
basic_query_read

In [None]:
# Unit shown in the y-axis label. Still a placeholder: it should be looked up
# from the data if it can vary.
uom = "??"

# One line per "Hour (UTC)" value: the AADCE column against Date, restricted to
# the window selected in basic_query_read.
plot = basic_query_read.hvplot.line(
    x="Date",
    y="AADCE",
    by="Hour (UTC)",
    title=f"AADCE values between {start_date} and {end_date}",
)

# NOTE(review): the y label says "Temperature", but the plotted column comes
# from an AQHI site-observation file -- confirm what AADCE actually measures.
plot.opts(ylabel=f"Temperature ?? ({uom})", xlabel="Date J/MM")
plot

# AQHI forecast
[Atlantic forecast web page](https://dd.weather.gc.ca/air_quality/aqhi/atl/forecast/model/csv/?C=M;O=D)

```
[TXT] 2024031800_AQHI_ATL_UMOSAQMIST.csv    2024-03-18 03:40  9.4K  
[TXT] 2024031800_PM2.5_ATL_UMOSAQMIST.csv   2024-03-18 03:40   12K  
[TXT] 2024031800_O3_ATL_UMOSAQMIST.csv      2024-03-18 03:40   14K  
[TXT] 2024031800_NO2_ATL_UMOSAQMIST.csv     2024-03-18 03:40   12K 
```

In [None]:
# Lazily scan the four forecast CSVs listed above (AQHI, PM2.5, O3, NO2);
# nothing is materialised until .collect() is called on each of them.
(
    AQHI_ATL_forecast_AQHI,
    AQHI_ATL_forecast_PM2_5,
    AQHI_ATL_forecast_O3,
    AQHI_ATL_forecast_NO2,
) = map(
    pl.scan_csv,
    (
        "2024031800_AQHI_ATL_UMOSAQMIST.csv",
        "2024031800_PM2.5_ATL_UMOSAQMIST.csv",
        "2024031800_O3_ATL_UMOSAQMIST.csv",
        "2024031800_NO2_ATL_UMOSAQMIST.csv",
    ),
)


In [None]:
# Materialise each lazy forecast scan into its own in-memory frame.
(
    AQHI_ATL_forecast_AQHI_df,
    AQHI_ATL_forecast_PM2_5_df,
    AQHI_ATL_forecast_O3_df,
    AQHI_ATL_forecast_NO2_df,
) = (
    lazy_forecast.collect()
    for lazy_forecast in (
        AQHI_ATL_forecast_AQHI,
        AQHI_ATL_forecast_PM2_5,
        AQHI_ATL_forecast_O3,
        AQHI_ATL_forecast_NO2,
    )
)


In [None]:
# Inspect one forecast frame at a time; uncomment the one to display.
# Shapes and key columns below are the author's notes from earlier runs.

# AQHI_ATL_forecast_AQHI_df # shape: (25, 74); station location column is "cgndb"

# The pollutant forecasts below use "stationId" as the station location column.

AQHI_ATL_forecast_PM2_5_df # shape: (31, 74)
# AQHI_ATL_forecast_O3_df # shape: (31, 74)
# AQHI_ATL_forecast_NO2_df # shape: (31, 74)


In [None]:
# Stack the three pollutant forecasts (PM2.5, O3, NO2) into one long frame.
#
# The earlier draft of this cell could not run: it referenced undefined names
# (df1, df2, df3), called the `.struct` expression namespace as if it were a
# method, and passed a second positional argument to pl.concat, whose options
# are keyword-only. Polars frames have no index, so each frame is tagged with
# its pollutant in a regular column instead.
#
# The AQHI forecast frame is left out on purpose: per the author's notes it
# identifies stations by "cgndb" rather than "stationId".
forecast_frames = {
    "PM2.5": AQHI_ATL_forecast_PM2_5_df,
    "O3": AQHI_ATL_forecast_O3_df,
    "NO2": AQHI_ATL_forecast_NO2_df,
}

combined_df = pl.concat(
    [
        frame.with_columns(pl.lit(pollutant).alias("pollutant"))
        for pollutant, frame in forecast_frames.items()
    ],
    # Tolerates a column missing from one file, or inferred with a different
    # dtype, when the CSVs were parsed independently.
    how="diagonal_relaxed",
)

# NOTE(review): the draft keyed rows on "stationId" and "date"; if row order
# matters, sort on those columns once the "date" column name is confirmed.
combined_df



In [None]:
# Check the type produced by collecting the lazy AQHI forecast and converting
# the result with to_pandas().
type(AQHI_ATL_forecast_AQHI.collect().to_pandas())
# The same check can be run on the other forecasts:
#AQHI_ATL_forecast_PM2_5.collect()
#AQHI_ATL_forecast_O3.collect()
#AQHI_ATL_forecast_NO2.collect()


In [None]:
# Build an hvplot explorer over the numeric columns of the AQHI forecast.
# Direct Polars plotting, kept for reference:
# AQHI_ATL_forecast_AQHI.collect().hvplot()
# Convert to pandas first: per the author, the hvplot explorer did not support
# Polars frames at the time of writing.
AQHI_pd_df = AQHI_ATL_forecast_AQHI.collect().to_pandas() # hvplot explorer does not support Polars dataframes yet
# Keep only the int64/float64 columns for the explorer.
AQHI_numerical_df = AQHI_pd_df.select_dtypes(include=['int64', 'float64']) # Select only columns with numeric values
AQHI_explorer = AQHI_numerical_df.hvplot.explorer()
# Alternatives kept for reference: inspect the columns, or explore every column.
# AQHI_pd_df.columns
#AQHI_explorer = AQHI_pd_df.hvplot.explorer()

In [None]:
# Build an hvplot explorer over the observation frame, converted to pandas
# first. It is assigned here and displayed in a later cell.
AQHI_ATL_obs_guessed_date_explorer = AQHI_ATL_obs_guessed_date.to_pandas().hvplot.explorer()
# AQHI_ATL_obs_guessed_date_explorer

# The forecast-explorer statements that used to be repeated here as comments
# duplicate the previous cell and have been condensed into this note.

In [None]:
# Render the observation explorer as the cell output.
AQHI_ATL_obs_guessed_date_explorer

In [None]:
from bokeh.sampledata.penguins import data as df

# Peek at the sample data and its type. Only the last bare expression of a
# cell is echoed, so the head() preview used to be discarded; print both.
print(df.head(2))
print(type(df))

In [None]:
# Build an hvplot explorer on the penguins sample frame and render it.
hvexplorer = df.hvplot.explorer()
hvexplorer