In [None]:
# Environment bootstrap: install the `uv` package installer with pip, then use
# `uv pip` to install the packages listed in requirements.txt.
# The kernel must be restarted afterwards (see the note below this cell).
!pip install uv
!uv pip install  -r requirements.txt

### Restart the kernel **before** proceeding to the next cell.

In [None]:
import snowflake
from snowflake.snowpark.functions import col
import pandas as pd

# Grab the Snowpark session via get_active_session() rather than building one
# from credentials, so no connection secrets appear in the notebook.
from snowflake.snowpark.context import get_active_session
session = get_active_session()
# Print the session object as a quick sanity check that a session was obtained.
print(session)

In [None]:
# Provision the Snowflake objects this notebook relies on. Every statement uses
# IF NOT EXISTS, so re-running the cell is harmless. Order matters: the schema
# lives in the database, and the stage lives in the schema.
prerequisite_ddl = (
    "CREATE WAREHOUSE IF NOT EXISTS COMPUTE_WH WITH WAREHOUSE_SIZE='X-SMALL'",
    "CREATE DATABASE IF NOT EXISTS EY_DATA_CHALLENGE",
    "CREATE SCHEMA IF NOT EXISTS EY_DATA_CHALLENGE.DATA_SCHEMA",
)
for ddl_statement in prerequisite_ddl:
    session.sql(ddl_statement).collect()

# The stage is created last and left as the cell's final expression so its
# status row is still shown as the cell output.
session.sql("CREATE STAGE IF NOT EXISTS EY_DATA_CHALLENGE.DATA_SCHEMA.DATA_STAGE").collect()

In [None]:
# Make the challenge database/schema the session defaults so later cells can
# refer to tables by their bare names.
session.use_database("EY_DATA_CHALLENGE")
session.use_schema("DATA_SCHEMA")

# Read the three raw training CSVs (paths relative to the notebook's working
# directory) into pandas frames: water quality, Landsat, TerraClimate.
waterquality_train_df, landsat_train_df, terraclimate_train_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        "water_quality_training_dataset.csv",
        "landsat_features_training.csv",
        "terraclimate_features_training.csv",
    )
)



In [None]:

# Normalise the headers to Snowflake-style identifiers: upper case, with every
# space replaced by an underscore (e.g. "Sample Date" -> "SAMPLE_DATE").
waterquality_train_df.columns = (
    waterquality_train_df.columns.str.upper().str.replace(' ', '_', regex=False)
)

# SAMPLE_DATE is parsed from DD-MM-YYYY text and the column is overwritten in
# place with plain date values (no time component). The original text form is
# not kept.
parsed_sample_dates = pd.to_datetime(waterquality_train_df['SAMPLE_DATE'], format='%d-%m-%Y')
waterquality_train_df['SAMPLE_DATE'] = parsed_sample_dates.dt.date

# Persist the frame as EY_DATA_CHALLENGE.DATA_SCHEMA.WATERQUALITY_TRAINING,
# creating the table if needed and replacing any previous contents.
session.write_pandas(
    waterquality_train_df,
    table_name="WATERQUALITY_TRAINING",
    database="EY_DATA_CHALLENGE",
    schema="DATA_SCHEMA",
    auto_create_table=True,
    overwrite=True,
)

In [None]:
# Normalise the Landsat headers to Snowflake-style identifiers: upper case,
# spaces replaced by underscores.
landsat_train_df.columns = (
    landsat_train_df.columns.str.upper().str.replace(' ', '_', regex=False)
)

# Parse SAMPLE_DATE from DD-MM-YYYY text and overwrite it with date-only values.
landsat_sample_dates = pd.to_datetime(landsat_train_df['SAMPLE_DATE'], format='%d-%m-%Y')
landsat_train_df['SAMPLE_DATE'] = landsat_sample_dates.dt.date

# Persist as EY_DATA_CHALLENGE.DATA_SCHEMA.LANDSAT_TRAINING, creating the table
# if needed and replacing any previous contents.
session.write_pandas(
    landsat_train_df,
    table_name="LANDSAT_TRAINING",
    database="EY_DATA_CHALLENGE",
    schema="DATA_SCHEMA",
    overwrite=True,
    auto_create_table=True,
)

In [None]:
# Normalise the TerraClimate headers to Snowflake-style identifiers: upper case,
# spaces replaced by underscores.
terraclimate_train_df.columns = (
    terraclimate_train_df.columns.str.upper().str.replace(' ', '_', regex=False)
)

# Parse SAMPLE_DATE from DD-MM-YYYY text and overwrite it with date-only values.
terraclimate_sample_dates = pd.to_datetime(terraclimate_train_df['SAMPLE_DATE'], format='%d-%m-%Y')
terraclimate_train_df['SAMPLE_DATE'] = terraclimate_sample_dates.dt.date

# Persist as EY_DATA_CHALLENGE.DATA_SCHEMA.TERRACLIMATE_TRAINING, creating the
# table if needed and replacing any previous contents.
session.write_pandas(
    terraclimate_train_df,
    table_name="TERRACLIMATE_TRAINING",
    database="EY_DATA_CHALLENGE",
    schema="DATA_SCHEMA",
    overwrite=True,
    auto_create_table=True,
)

In [None]:
# Lazy Snowpark handles onto the three tables written above; later cells reuse
# these names.
waterquality_df = session.table("WATERQUALITY_TRAINING")
terraclimate_df = session.table("TERRACLIMATE_TRAINING")
landsat_df = session.table("LANDSAT_TRAINING")

# For each table, report the row count and column count, then show two randomly
# sampled rows as a quick check that the load worked.
for table_label, table_handle in (
    ("WATERQUALITY_TRAINING", waterquality_df),
    ("LANDSAT_TRAINING", landsat_df),
    ("TERRACLIMATE_TRAINING", terraclimate_df),
):
    print(f"Total row counts - {table_label}:{table_handle.count()} , ColummCount: {len(table_handle.columns)}")
    table_handle.sample(n=2).show()


In [None]:
from datetime import date

# Show the rows of all three training tables for one sample date, to eyeball
# that the tables line up on the same observations.
#
# Two fixes versus the earlier version of this cell:
#   * the column is SAMPLE_DATE (headers were upper-cased and spaces replaced
#     with "_" before the tables were written), not "SAMPLE DATE";
#   * SAMPLE_DATE is stored as a DATE, so it is compared against a real date
#     value rather than the DD-MM-YYYY string "29-04-2015".
probe_date = date(2015, 4, 29)

for training_table in (waterquality_df, terraclimate_df, landsat_df):
    training_table.filter(col("SAMPLE_DATE") == probe_date).show()

In [None]:
# Show every water-quality row recorded at latitude -25.73411. The literal is
# passed as a string, so Snowflake performs the comparison with LATITUDE.
latitude_match = col("LATITUDE") == "-25.73411"
waterquality_df.filter(latitude_match).show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Box plot of total alkalinity to look for outliers visually.
#
# The column is TOTAL_ALKALINITY (headers were normalised with "_" before the
# table was written); the spaced name raised a KeyError. Only that one column
# is pulled from Snowflake instead of materialising the whole table in pandas.
alkalinity_values = waterquality_df.select("TOTAL_ALKALINITY").to_pandas()["TOTAL_ALKALINITY"]

# Two panels are created but only the left one is drawn here. The right panel
# (axes[1]) is kept for a side-by-side view of the outlier-filtered data
# (df_without_outlier, which is only computed in a later cell).
f, axes = plt.subplots(1, 2)
sns.boxplot(y=alkalinity_values, ax=axes[0], width=.5)
axes[0].set_title("TOTAL_ALKALINITY")
plt.show()

In [None]:
# List the rows at both extremes of total alkalinity.
# The column is TOTAL_ALKALINITY (underscore), matching the normalised headers
# the table was written with; the spaced name is not a valid column.
# NOTE(review): the 320 / 60 cut-offs look like they were read off the box
# plot; confirm, and consider naming them as constants.
alkalinity = col("TOTAL_ALKALINITY")

waterquality_df.filter(alkalinity > 320).show()
waterquality_df.filter(alkalinity < 60).show()


In [None]:
# Summary statistics of the local pandas frame (not the Snowflake table).
# include="all" makes describe() report on every column, not only numeric ones.
# Left as the last expression so the notebook renders the result as a table.
waterquality_train_df.describe(include="all")


In [None]:
# NOTE: this import rebinds `abs` to the Snowpark column function for the rest
# of the notebook; the expression below needs the Snowpark version.
from snowflake.snowpark.functions import mean, stddev, abs, date_part

# Three-sigma rule on total alkalinity: keep rows whose value lies within three
# standard deviations of the mean.
# The column is TOTAL_ALKALINITY (underscore), matching the normalised headers
# the table was written with. Mean and standard deviation are fetched in a
# single query instead of two round-trips.
alkalinity_stats = waterquality_df.select(
    mean("TOTAL_ALKALINITY"), stddev("TOTAL_ALKALINITY")
).collect()[0]
mean_value, std_value = alkalinity_stats[0], alkalinity_stats[1]

print(f"Mean Value: {mean_value}")
print(f"Standard Devidation: { std_value }")

df_without_outlier = waterquality_df.filter(
    abs(waterquality_df["TOTAL_ALKALINITY"] - mean_value) < (3 * std_value)
)
# Last expression: number of rows that survive the filter.
df_without_outlier.count()

In [None]:
# `abs` is imported here as well, so this cell no longer depends on an earlier
# cell having rebound it to the Snowpark column function.
from snowflake.snowpark.functions import abs, mean, stddev

# Target measurements to screen. Names use underscores to match the normalised
# column headers the table was written with.
Parameters = ["TOTAL_ALKALINITY", "ELECTRICAL_CONDUCTANCE", "DISSOLVED_REACTIVE_PHOSPHORUS"]

for label in Parameters:
    # One query returns both statistics for the current parameter.
    stats_row = waterquality_df.select(mean(label), stddev(label)).collect()[0]
    mean_value, std_value = stats_row[0], stats_row[1]
    print(f"{label} -  Mean: {mean_value}  | StdDev: {std_value}")

    # A row counts as an outlier when it is more than 2.5 standard deviations
    # from the mean (a tighter band than the 3-sigma filter used earlier).
    outlier = waterquality_df.filter(abs(waterquality_df[label] - mean_value) > (2.5 * std_value))

    # Count once and reuse it; every .count() is a separate query.
    outlier_count = outlier.count()
    print(f"Outlier for {label} ({outlier_count} rows) ")

    if outlier_count > 0:
        outlier.select("LATITUDE", "LONGITUDE", "SAMPLE_DATE", label).show()


### Create an outlier model


In [None]:
%%sql -r dataframe_1
-- Train an unsupervised anomaly-detection model on the dissolved reactive
-- phosphorus series of the rows at latitude -25.73411.
--   * Column names use underscores, matching how the table was written.
--   * TIMESTAMP_COLNAME / TARGET_COLNAME take string literals (single quotes),
--     not double-quoted identifiers.
--   * SAMPLE_DATE is stored as a DATE, so it is cast to TIMESTAMP_NTZ for the
--     model's timestamp column.
--   * LABEL_COLNAME => '' declares that there is no labelled-anomaly column.
-- NOTE(review): if several sites share this latitude, also filter on LONGITUDE
-- so each timestamp appears only once in the series.
CREATE OR REPLACE SNOWFLAKE.ML.ANOMALY_DETECTION outlier_base_model(
  INPUT_DATA => TABLE(
    SELECT TO_TIMESTAMP_NTZ(SAMPLE_DATE) AS SAMPLE_DATE,
           DISSOLVED_REACTIVE_PHOSPHORUS
    FROM WATERQUALITY_TRAINING
    WHERE LATITUDE = -25.73411
  ),
  TIMESTAMP_COLNAME => 'SAMPLE_DATE',
  TARGET_COLNAME => 'DISSOLVED_REACTIVE_PHOSPHORUS',
  LABEL_COLNAME => ''
)
