In [None]:
# Environment setup: each `!` line runs as a shell command from the notebook.
# Install the `uv` installer with pip, then use `uv pip install` for everything else.
!pip install uv
# Project dependencies listed in requirements.txt (path is relative to the notebook's working directory).
!uv pip install -r  requirements.txt
# Extra packages installed on top of requirements.txt; these are not version-pinned here.
!uv pip install streamlit
# -U upgrades ipywidgets if a version is already installed.
!uv pip install -U ipywidgets
# shap is unpinned; snowflake-ml-python is pinned to exactly 1.19.0.
!uv pip install shap snowflake-ml-python==1.19.0

In [None]:
%%sql -r dataframe_1
-- Report the role this cell's statements execute under.
select current_role();
-- WARNING: destructive - removes the WATERQUALITY_TRAINING table.
-- NOTE(review): a later cell in this notebook reads WATERQUALITY_TRAINING through
-- session.table(...); confirm the table is reloaded before that cell runs, or skip this cell.
-- IF EXISTS keeps the cell re-runnable: a plain DROP TABLE raises an error when the
-- table has already been dropped (e.g. on a second run of the notebook).
drop table if exists EY_DATA_CHALLENGE.DATA_SCHEMA.WATERQUALITY_TRAINING;


    

In [None]:
# Notebook-wide configuration constants.

# Bump VERSION_NUM to create a new version of your features, models, etc.
VERSION_NUM = "0"

# Snowflake database and schema names used by this notebook.
DB, SCHEMA = "EY_DATA_CHALLENGE", "DATA_SCHEMA"

# Snowflake role name.
# NOTE(review): not referenced by the cells visible here - confirm where it is applied.
ROLE = "ACCOUNTADMIN"

In [None]:
import pandas as pd
import numpy as np
import sklearn
import math
import pickle
import shap
from datetime import datetime
import streamlit as st
from xgboost import XGBClassifier

# Snowpark ML
from snowflake.ml.registry import Registry
from snowflake.ml.modeling.tune import get_tuner_context
from snowflake.ml.modeling import tune
from entities import search_algorithm

#Snowflake feature store
from snowflake.ml.feature_store import FeatureStore, FeatureView, Entity, CreationMode

# Snowpark session
from snowflake.snowpark import DataFrame
from snowflake.snowpark.functions import col, to_timestamp, min, max, month, dayofweek, dayofyear, avg, date_add, sql_expr,year,quarter,date_trunc
from snowflake.snowpark.types import IntegerType
from snowflake.snowpark import Window

# Set up the Snowpark session: reuse the session that is already active for this
# notebook (no connection parameters or credentials are supplied here).
from snowflake.snowpark.context import get_active_session
session = get_active_session()

# Point the session at the database and schema named by the DB / SCHEMA constants.
# The database is selected first, then the schema.
# NOTE(review): the ROLE constant is not applied here - confirm whether that is intended.
session.use_database(DB)
session.use_schema(SCHEMA)
# Bare last expression so the notebook shows the session object as the cell output.
session

In [None]:
# Load the training table as a Snowpark DataFrame. The table name is unqualified,
# so it is resolved using the session's current database and schema.
print("Reading table data...")
df_waterquality = session.table("WATERQUALITY_TRAINING")

# Preview the first five rows.
df_waterquality.show(5)

# Earliest and latest SAMPLE_DATE in the table. `min` and `max` here are the
# Snowpark column functions imported from snowflake.snowpark.functions, not the
# Python builtins. Left as the cell's last expression.
df_waterquality.select([min("SAMPLE_DATE"), max("SAMPLE_DATE")])


### Feature Engineering with Snowpark APIs

In [None]:
# Feature engineering: map each new column name to the Snowpark expression that
# computes it, then add all of them to df_waterquality in a single with_columns call.

# Window grouping rows that share the same (YEAR, QUARTER); it is ordered by YEAR,
# which is also one of the partition keys.
# NOTE(review): YEAR and QUARTER are themselves added by the with_columns call below -
# this relies on the window resolving those names to the new columns; confirm.
year_qtr_partition = Window.partition_by("YEAR", "QUARTER").order_by("YEAR")

feature_eng_dict = {
    # Timestamp features derived from SAMPLE_DATE
    "MONTH": month("SAMPLE_DATE"),
    "QUARTER": quarter("SAMPLE_DATE"),
    "YEAR": year("SAMPLE_DATE"),
    "QUARTER_DATE": date_trunc("quarter", col("SAMPLE_DATE")),
    # Average TOTAL_ALKALINITY over the (YEAR, QUARTER) window defined above.
    "AVG_YEAR_QTR_ALKALINITY": avg("TOTAL_ALKALINITY").over(year_qtr_partition),
    # Same partitioning, but ordered by SAMPLE_DATE and written as raw SQL.
    # NOTE(review): presumably a running average within the quarter under the
    # default window frame - confirm the intended frame.
    "PRECEDING_AVG_YEAR_QTR_ALKALINITY": sql_expr(""" 
            AVG("TOTAL_ALKALINITY") OVER( PARTITION BY  "YEAR","QUARTER" ORDER BY  "SAMPLE_DATE" )
    """ ),
}

# Rebind df_waterquality to the frame with the engineered columns added.
df_waterquality = df_waterquality.with_columns(
    feature_eng_dict.keys(),
    feature_eng_dict.values(),
)

# Spot checks: a few rows overall, then a few rows from Q2 2012.
df_waterquality.show(3)
is_2012_q2 = (df_waterquality["YEAR"] == 2012) & (df_waterquality["QUARTER"] == 2)
df_waterquality.filter(is_2012_q2).show(3)

In [None]:
%%sql -r dataframe_2

-- Quarter-over-quarter TOTAL_ALKALINITY trend for a single latitude.
-- Step 1: average alkalinity per (latitude, year, quarter).
WITH QUARTERLY_AVG AS (
    SELECT
        LATITUDE,
        DATE_PART(year, SAMPLE_DATE) as YEAR,
        DATE_PART(quarter, SAMPLE_DATE) as QUARTER,
        AVG(TOTAL_ALKALINITY) as AVG_ALKALINITY
    FROM WATERQUALITY_TRAINING
    where LATITUDE = -34.405833
    GROUP BY LATITUDE, YEAR, QUARTER
),
-- Step 2: attach the previous quarter's average (NULL for the first quarter of a latitude).
WITH_PREVIOUS AS (
    SELECT
        LATITUDE,
        YEAR,
        QUARTER,
        AVG_ALKALINITY,
        LAG(AVG_ALKALINITY, 1) OVER (PARTITION BY LATITUDE ORDER BY YEAR, QUARTER) AS PREV_QUARTER_AVG
    FROM QUARTERLY_AVG
)
-- Step 3: label the trend. No previous quarter -> empty string; previous average
-- greater than the current one -> 'GETTING WORSE'; anything else (including equal)
-- -> 'GETTING BETTER'.
SELECT
    LATITUDE,
    YEAR,
    QUARTER,
    AVG_ALKALINITY,
    PREV_QUARTER_AVG,
    CASE
        WHEN PREV_QUARTER_AVG IS NULL THEN ''
        WHEN PREV_QUARTER_AVG > AVG_ALKALINITY THEN 'GETTING WORSE'
        ELSE 'GETTING BETTER'
    END AS TREND
FROM WITH_PREVIOUS
ORDER BY LATITUDE, YEAR, QUARTER  ;



In [None]:
import matplotlib.pyplot as plt

# Latitude of the site to plot (same value used by the SQL trend query).
LATITUDE = -34.405833

# Build one row per quarter for this latitude and convert it to pandas.
# The precomputed AVG_YEAR_QTR_ALKALINITY column is partitioned by YEAR/QUARTER only,
# so it is the average across all sites and is repeated on every sample row; plotting
# it under a "for Latitude ..." title would not show this site's values. Aggregate the
# filtered rows instead so the chart matches its title. The result keeps the same
# column names (QUARTER_DATE, AVG_YEAR_QTR_ALKALINITY) as before.
df_pandas = (
    df_waterquality
    .filter(col("LATITUDE") == LATITUDE)
    .group_by("QUARTER_DATE")
    .agg(avg("TOTAL_ALKALINITY").alias("AVG_YEAR_QTR_ALKALINITY"))
    .sort("QUARTER_DATE")
    .to_pandas()
)

# Explicit figure/axes interface instead of the pyplot state machine.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(df_pandas["QUARTER_DATE"], df_pandas["AVG_YEAR_QTR_ALKALINITY"], marker='o')
ax.set_xlabel('Quarter')
ax.set_ylabel('Average Alkalinity')
ax.set_title(f"Average Alkalinity Over Time for Latitude  {LATITUDE}")
ax.tick_params(axis='x', rotation=45)
fig.tight_layout()
plt.show()