In [None]:
# Environment bootstrap: install the `uv` installer with pip, then use it to
# install everything listed in requirements.txt (path is relative to the
# notebook's working directory).
!pip install uv
!uv pip install  -r requirements.txt

In [None]:
# Libraries used by the cells below: pandas for local CSV handling, Snowpark
# for talking to Snowflake.
import pandas as pd
import snowflake
from snowflake.snowpark.context import get_active_session
from snowflake.snowpark.functions import col

# Every later cell talks to Snowflake through this one `session` object.
session = get_active_session()

In [None]:
# Objects the notebook needs, created in dependency order. Each statement uses
# IF NOT EXISTS, so re-running this cell is harmless.
setup_ddl = (
    "CREATE WAREHOUSE IF NOT EXISTS COMPUTE_WH WITH WAREHOUSE_SIZE='X-SMALL'",
    "CREATE DATABASE IF NOT EXISTS EY_DATA_CHALLENGE",
    "CREATE SCHEMA IF NOT EXISTS EY_DATA_CHALLENGE.DATA_SCHEMA",
    "CREATE STAGE IF NOT EXISTS EY_DATA_CHALLENGE.DATA_SCHEMA.DATA_STAGE",
)
ddl_results = [session.sql(ddl).collect() for ddl in setup_ddl]

# Keep the result of the final statement as the cell's displayed value.
ddl_results[-1]

In [None]:
# Point the session at the challenge database/schema so later table names can
# be given unqualified.
session.use_database("EY_DATA_CHALLENGE")
session.use_schema("DATA_SCHEMA")

# Read the three training CSVs (paths relative to the working directory) into
# pandas DataFrames, in the order: water quality, Landsat, TerraClimate.
waterquality_train_df, landsat_train_df, terraclimate_train_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        "water_quality_training_dataset.csv",
        "landsat_features_training.csv",
        "terraclimate_features_training.csv",
    )
)



In [None]:

# Upper-case every column label in place so the names follow the Snowflake
# convention before the frame is uploaded.
waterquality_train_df.columns = waterquality_train_df.columns.map(str.upper)

# Upload the frame as EY_DATA_CHALLENGE.DATA_SCHEMA.WATERQUALITY_TRAINING,
# creating the table when it is missing and replacing its contents otherwise.
waterquality_write_options = dict(
    table_name="WATERQUALITY_TRAINING",
    database="EY_DATA_CHALLENGE",
    schema="DATA_SCHEMA",
    overwrite=True,
    auto_create_table=True,
)
session.write_pandas(waterquality_train_df, **waterquality_write_options)

In [None]:
# Upper-case the Landsat feature column labels in place before uploading.
landsat_train_df.columns = landsat_train_df.columns.map(str.upper)

# Upload the frame as EY_DATA_CHALLENGE.DATA_SCHEMA.LANDSAT_TRAINING, creating
# the table when it is missing and replacing its contents otherwise.
landsat_write_options = dict(
    table_name="LANDSAT_TRAINING",
    database="EY_DATA_CHALLENGE",
    schema="DATA_SCHEMA",
    auto_create_table=True,
    overwrite=True,
)
session.write_pandas(landsat_train_df, **landsat_write_options)

In [None]:
# Upper-case the TerraClimate feature column labels in place before uploading.
terraclimate_train_df.columns = terraclimate_train_df.columns.map(str.upper)

# Upload the frame as EY_DATA_CHALLENGE.DATA_SCHEMA.TERRACLIMATE_TRAINING,
# creating the table when it is missing and replacing its contents otherwise.
terraclimate_write_options = dict(
    table_name="TERRACLIMATE_TRAINING",
    database="EY_DATA_CHALLENGE",
    schema="DATA_SCHEMA",
    auto_create_table=True,
    overwrite=True,
)
session.write_pandas(terraclimate_train_df, **terraclimate_write_options)

In [None]:
# Snowpark handles on the three uploaded tables; later cells query these.
waterquality_df = session.table("WATERQUALITY_TRAINING")
terraclimate_df = session.table("TERRACLIMATE_TRAINING")
landsat_df = session.table("LANDSAT_TRAINING")

# Sanity check each table: print its row and column counts, then show two
# sampled rows. The sample is not seeded, so the rows differ between runs.
for table_label, snow_df in (
    ("WATERQUALITY_TRAINING", waterquality_df),
    ("LANDSAT_TRAINING", landsat_df),
    ("TERRACLIMATE_TRAINING", terraclimate_df),
):
    row_total = snow_df.count()
    column_total = len(snow_df.columns)
    print(f"Total row counts - {table_label}:{row_total} , ColummCount: {column_total}")
    snow_df.sample(n=2).show()


In [None]:
# Spot check: show the rows of each table whose "SAMPLE DATE" equals the
# string literal "29-04-2015" (the value is compared as text, not as a date).
sample_date_matches = col("SAMPLE DATE") == "29-04-2015"
for snow_df in (waterquality_df, terraclimate_df, landsat_df):
    snow_df.filter(sample_date_matches).show()

In [None]:
# Spot check: show the water-quality rows recorded at one latitude. The value
# is passed as the string "-25.73411", exactly as in the comparison below.
latitude_matches = col("LATITUDE") == "-25.73411"
waterquality_df.where(latitude_matches).show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# One row with two panels; only the left panel (axes[0]) is drawn on below.
f, axes = plt.subplots(1, 2)

# Pull the Snowflake table into pandas and box-plot TOTAL ALKALINITY to get a
# visual impression of its spread and potential outliers.
alkalinity_values = waterquality_df.to_pandas()["TOTAL ALKALINITY"]
sns.boxplot(y=alkalinity_values, ax=axes[0], width=.5)

# NOTE(review): box plots for ELECTRICAL CONDUCTANCE, DISSOLVED REACTIVE
# PHOSPHORUS and the outlier-free frame were sketched here but disabled; the
# sketch addressed axes[1] and axes[3], and axes[3] does not exist in a 1x2
# grid. The right-hand panel therefore stays empty.
plt.show()

In [None]:
# Show the rows in the two tails of TOTAL ALKALINITY: first values above 320,
# then values below 60.
alkalinity = col("TOTAL ALKALINITY")
for tail_condition in (alkalinity > 320, alkalinity < 60):
    waterquality_df.where(tail_condition).show()


In [None]:
# Summary statistics of the local pandas copy of the water-quality data;
# include="all" covers the non-numeric columns as well as the numeric ones.
summary_stats = waterquality_train_df.describe(include="all")
summary_stats


In [None]:
# Snowpark's abs is imported under an alias: importing it as plain `abs` would
# replace Python's builtin abs() for every cell executed after this one.
# date_part is not used in this cell; it stays imported in case later cells
# rely on it.
from snowflake.snowpark.functions import abs as sf_abs, date_part, mean, stddev

# Fetch the mean and standard deviation of TOTAL ALKALINITY in one query
# instead of two separate round trips.
mean_value, std_value = waterquality_df.select(
    mean("TOTAL ALKALINITY"),
    stddev("TOTAL ALKALINITY"),
).collect()[0]
print(f"Mean Value: {mean_value}")
print(f"Standard Devidation: { std_value }")

# Three-sigma rule: keep only the rows whose TOTAL ALKALINITY lies strictly
# within three standard deviations of the mean.
df_without_outlier = waterquality_df.filter(
    sf_abs(waterquality_df["TOTAL ALKALINITY"] - mean_value) < (3 * std_value)
)

# Displayed as the cell result: number of rows left after dropping outliers.
df_without_outlier.count()

In [None]:
# Snowpark's abs is imported under an alias: importing it as plain `abs` would
# replace Python's builtin abs() for every cell executed after this one.
# date_part is not used in this cell; it stays imported in case later cells
# rely on it.
from snowflake.snowpark.functions import abs as sf_abs, date_part, mean, stddev

# Fetch the mean and standard deviation of ELECTRICAL CONDUCTANCE in one query
# instead of two separate round trips.
mean_value, std_value = waterquality_df.select(
    mean("ELECTRICAL CONDUCTANCE"),
    stddev("ELECTRICAL CONDUCTANCE"),
).collect()[0]
print(f"Mean Value: {mean_value}")
print(f"Standard Devidation: { std_value }")

# Three-sigma rule, outlier side: rows whose ELECTRICAL CONDUCTANCE lies more
# than three standard deviations away from the mean.
ec_outlier = waterquality_df.filter(
    sf_abs(waterquality_df["ELECTRICAL CONDUCTANCE"] - mean_value) > (3 * std_value)
)

# Inspect the frame computed in this cell. The previous version displayed
# `df_without_outlier`, a leftover from the TOTAL ALKALINITY cell, so the
# conductance outliers were computed but never shown.
ec_outlier.show()
ec_outlier.count()