# CMC ADE -- read data from Delta Lake tables
`FR`
Le présent notebook montre les façons d'interagir avec des données au format [Delta Lake](https://delta-io.github.io/delta-rs/).

`EN`
This notebook shows how to manipulate [Delta Lake](https://delta-io.github.io/delta-rs/) tables.

In [None]:
import datetime as dt
from datetime import date
import os
import pathlib
import tarfile
import time
import daft
import deltalake
import numpy as np
import pandas as pd
import polars as pl
import polars.selectors as cs
import pyarrow as pa
from deltalake import DeltaTable  # S3FileSystem ??
from deltalake.writer import write_deltalake
from tabulate import tabulate
from tqdm.notebook import tqdm
# opt.maxBytes = 131072
import matplotlib.pyplot as plt
import hvplot.pandas # Won't be needed hopefully; we'll be using Polars
import hvplot.polars
import seaborn as sns
import seaborn.objects as so
import geopandas as gpd
from shapely.geometry import Point
# hvplot.extension("plotly")

In [None]:
from great_tables import GT, md, html, style, loc
from great_tables.data import airquality, islands

`EN` Open the Delta Table with Polars and test the various Delta tables (see `CMCADE-ingest.ipynb`) 
- `tar_swob_no_optimization` : dataframes written (then appended) without partitioning

`FR` À venir

In [None]:
# Load the whole un-partitioned Delta table into memory as a Polars DataFrame.
delta_plain = pl.scan_delta("tar_swob_no_optimization").collect()

# Per-column counts, computed once; the gap between the `name` count and the
# `value` count is what the notebook reports as "bad data".
column_counts = delta_plain.count()
bad_data = column_counts['name'] - column_counts['value']

# Smallest and largest `date_tm` found in the table.
date_tm_column = delta_plain['date_tm']
min_date, max_date = date_tm_column.min(), date_tm_column.max()

print(f"Minimum Date: {min_date}")
print(f"Maximum Date: {max_date}")


In [None]:
# Show the collected table as this cell's output.
delta_plain

In [None]:
# Summary table built from the per-column counts of `delta_plain`.
# `bad_data`, `min_date` and `max_date` come from the cell that loads the table.
bad_values = str(bad_data[0])
gt_air = GT(delta_plain.count())
(
    gt_air
    # Format every count as an integer, using a space as the digit-group separator.
    .fmt_integer(columns=delta_plain.columns, sep_mark=" ")
    # Header: what the table shows, over which span, and the bad-data figure.
    .tab_header(
        title="CMC weather observations data counts",
        subtitle=f"Between {min_date} and {max_date} (inclusive); Bad data values = {bad_values}",
    )
    # Light-gray fill on every body cell.
    .tab_style(style.fill("lightgray"), loc.body(columns=cs.all()))
)

In [None]:
# Start and end dates; they are used for the graph title.
# NOTE(review): the `date_tm` filter below is commented out, so the query
# itself is NOT restricted to this window -- confirm whether it should be.
start_date = date(2024, 3, 6)
end_date = date(2024, 3, 24)

# Air-temperature rows for a single station, ordered by time (ascending).
basic_query_read = (
    pl.read_delta("tar_swob_no_optimization")
    .filter(pl.col("name") == "air_temp")
    .filter(pl.col("stn_nam") == "STE-FOY (U. LAVAL)")
    # Alternative selections, kept for reference: several stations by name,
    # the station list derived from the geo buffer, or a date window.
#    .filter(pl.col("stn_nam").is_in(["STE-FOY (U. LAVAL)", "MALAHAT", "ABEE AGDM", "ALDERSVILLE"]))
#    .filter(pl.col("stn_nam").is_in(stn_nam_list))
#    .filter(pl.col("date_tm").is_between(start_date, end_date))
    # Single sort: an earlier descending sort on `date_tm` was immediately
    # overridden by this ascending one, so it has been dropped.
    .sort(["date_tm", "stn_nam"], descending=False)
)


In [None]:
# Select the plotting backend for hvplot.
hvplot.extension("bokeh")
# hvplot.extension("plotly")
# hvplot.extension("matplotlib")

# Unit shown on the y axis. It is hard-coded here; it should be fetched from
# the data instead if it can vary between rows.
uom = "°C"

# One line per station: `value` against `date_tm`.
plot = basic_query_read.hvplot.line(
    x='date_tm',
    y='value',
    by='stn_nam',
    title=f"Air Temperature between {start_date} and {end_date}",
)

# Axis labels, then leave the plot as the last expression so the cell shows it.
plot.opts(ylabel=f"Temperature ({uom})", xlabel="Date J/MM")
plot

# Geo use case
What if, instead of specifying a list of station names, we had a map that allowed selecting stations
by some geo-specific query, e.g. all stations within an arbitrary polygon?

We'll keep this example simple.  Since we have data in `degrees` I will ask for all stations that 
fall within a radius of a point.  In my case, the point will be the city of Vancouver.

In a real-world application we would give the user a way to supply the range in a meaningful distance unit (e.g. kilometres).

What we want is to replace the list in

` .filter(pl.col("stn_nam").is_in(["STE-FOY (U. LAVAL)", "MALAHAT", "ABEE AGDM", "ALDERSVILLE"])) `

above with a list of names coming from our buffer (see below)

`["DELTA BURNS BOG", "POINT ATKINSON", "VANCOUVER HARBOUR CS", "VANCOUVER SEA ISLAND CCG", "WEST VANCOUVER AUT"]`

# Create geo dataframe

We first take the first occurrence of each station name and corresponding lat, long

We could save as GeoJSON, but for this notebook we'll just keep the geo dataframe

In [None]:
# One row per station name (first occurrence kept), sorted by name and
# reduced to the name and coordinate columns.
stn_loc_df = (
    delta_plain
    .unique(subset=['stn_nam'], keep='first')
    .sort("stn_nam")
    .select(['stn_nam', 'lat', 'long'])
)

In [None]:
# Show the station/coordinate table as this cell's output.
stn_loc_df

In [None]:
# Step 1: Convert DataFrame to GeoDataFrame
geometry = [Point(xy) for xy in zip(stn_loc_df['long'], stn_loc_df['lat'])]

#geo_df = gpd.GeoDataFrame(stn_loc_df, geometry=geometry)
geo_df = gpd.GeoDataFrame(stn_loc_df[['stn_nam']], geometry=geometry)

# Assuming geo_df is your GeoDataFrame
geo_df.rename(columns={0: 'stn_nam'}, inplace=True)


# Step 2: Save as GeoJSON
#geo_df.to_file("output.geojson", driver="GeoJSON")


In [None]:
# Show the station GeoDataFrame as this cell's output.
geo_df

# Determine a point of interest and find all stations within a radius

In [None]:
# Expects `geo_df` to hold a 'stn_nam' column and a point geometry per station.

# Point of interest: Vancouver. Shapely points are (x, y), so the longitude
# comes first and the latitude second.
poi_longitude, poi_latitude = -123.1139456, 49.2604134
point_of_interest = Point(poi_longitude, poi_latitude)

# Circular search area around the point. The radius is expressed in degrees,
# which is a simplification and might not represent a real-world distance
# accurately.
buffer_distance_in_degrees = 0.2
buffer = point_of_interest.buffer(buffer_distance_in_degrees)

# Wrap the search area in a one-row GeoDataFrame that shares geo_df's CRS.
buffer_gdf = gpd.GeoDataFrame(geometry=[buffer], crs=geo_df.crs)

# Spatial inner join: keep only the stations lying within the search area.
stations_within_buffer = geo_df.sjoin(buffer_gdf, how='inner', predicate='within')

print(stations_within_buffer)


In [None]:
# Station names found inside the buffer, as a plain Python list; this is the
# list that can feed the `is_in` station filter of the plotting query.
stn_nam_list = stations_within_buffer['stn_nam'].tolist()

print(stn_nam_list)
