# agg_ag_data_sql

### Summary:
Tests and SQL scripts that produce weekly aggregates of the air-quality (AQ) measurements recorded by the AQ monitors in the ClassAct study.

In [55]:
import pandas as pd
from FDMBuilder.FDM_helpers import *

In [118]:
def return_agg_select_statements(col_name):
    """Build the SQL aggregate select expressions for a single column.

    Parameters
    ----------
    col_name : str
        Column name; it is interpolated verbatim into the SQL text and is
        also used as the prefix of every output alias.

    Returns
    -------
    str
        Seven select expressions joined with ", ", in this order:
        ``_min``, ``_max``, ``_mean``, ``_median``, ``_sd``, ``_pctl_90``,
        ``_pctl_10``.
    """
    # The median and percentile expressions index into the array returned
    # by APPROX_QUANTILES(col, 10): offsets 5, 9 and 1 respectively.
    select_exprs = (
        f"MIN({col_name}) as {col_name}_min",
        f"MAX({col_name}) as {col_name}_max",
        f"AVG({col_name}) as {col_name}_mean",
        f"APPROX_QUANTILES({col_name}, 10)[OFFSET(5)] AS {col_name}_median",
        f"STDDEV({col_name}) AS {col_name}_sd",
        f"APPROX_QUANTILES({col_name}, 10)[OFFSET(9)] AS {col_name}_pctl_90",
        f"APPROX_QUANTILES({col_name}, 10)[OFFSET(1)] AS {col_name}_pctl_10",
    )
    return ", ".join(select_exprs)

Erroneous readings need to be removed as per Stavros' logic. For the moment, use Stavros' day table and aggregate to weeks:

In [44]:
%%bigquery test
-- Load the readings flagged as occupied with date < 18880 into the `test` DataFrame.
SELECT *
FROM `yhcr-prd-phm-bia-core.CY_CLASS_ACT.df_AIRQ`
WHERE occupied
  AND date < 18880

Query complete after 0.00s: 100%|██████████| 2/2 [00:00<00:00, 1004.38query/s]                        
Downloading: 100%|██████████| 562456/562456 [00:01<00:00, 443576.16rows/s]


In [91]:
# Inspect the Temp column of the readings loaded into `test`.
test["Temp"]

0         23.5
1         23.7
2         23.8
3         24.0
4         24.0
          ... 
562451    22.2
562452    23.1
562453    23.4
562454    24.2
562455    28.6
Name: Temp, Length: 562456, dtype: float64

In [108]:
# Count the non-null CO2 readings per (Date, School_ID, Classroom) group,
# then show the groups that have fewer than 350 readings.
group_keys = ["Date", "School_ID", "Classroom"]
co2_readings = test[group_keys + ["CO2"]].dropna(subset="CO2")
agg_df = (
    co2_readings
    .groupby(group_keys)
    .count()
    .reset_index()
    .sort_values(group_keys)
)
agg_df[agg_df["CO2"] < 350]

Unnamed: 0,Date,School_ID,Classroom,CO2
0,18876,C01,1,79
1,18876,C01,2,79
2,18876,C01,3,78
3,18876,C01,4,79
4,18876,C01,5,79
...,...,...,...,...
1236,18879,UV03,12,303
1237,18879,UV05,1,314
1238,18879,UV05,2,308
1239,18879,UV05,3,307


In [112]:
%%bigquery result
-- One row per occupied reading with date < 18880, annotated with the number
-- of non-null CO2 readings in its (date, school_id, classroom) group; only
-- groups with fewer than 350 such readings are returned.
WITH tab AS (
    SELECT
        date,
        school_id,
        classroom,
        COUNT(CO2) OVER day_room AS readings,
        COUNT(CO2) OVER day_room < 350 AS readings_below_threshold
    FROM `yhcr-prd-phm-bia-core.CY_CLASS_ACT.df_AIRQ`
    WHERE date < 18880 AND occupied
    WINDOW day_room AS (PARTITION BY date, school_id, classroom)
)
SELECT *
FROM tab
WHERE readings_below_threshold

Query complete after 0.00s: 100%|██████████| 5/5 [00:00<00:00, 2416.91query/s]                        
Downloading: 100%|██████████| 213447/213447 [00:00<00:00, 293497.70rows/s]


In [113]:
# Collapse the per-reading rows to one row per (date, school_id, classroom)
# group that has at least one CO2 reading, ordered by those keys.
group_cols = ["date", "school_id", "classroom"]
(
    result.loc[result["readings"] > 0]
    .drop_duplicates(subset=group_cols)
    .sort_values(group_cols)
)

Unnamed: 0,date,school_id,classroom,readings,readings_below_threshold
164190,18876,C01,1,79,True
2526,18876,C01,2,79,True
21892,18876,C01,3,78,True
167137,18876,C01,4,79,True
172610,18876,C01,5,79,True
...,...,...,...,...,...
114933,18879,UV03,12,303,True
56835,18879,UV05,1,314,True
208816,18879,UV05,2,308,True
58940,18879,UV05,3,307,True


In [119]:
# Build the weekly, per-school aggregation query over the AQ readings table.
table_location = "yhcr-prd-phm-bia-core.CY_CLASS_ACT.df_AIRQ"
aq_measures = ["TVOC", "CO2", "PM1", "PM2_5", "PM4", "PM10", "Temp", "RH"]

# One run of min/max/mean/median/sd/percentile expressions per measure.
agg_aq_sql = ", ".join([return_agg_select_statements(measure)
                        for measure in aq_measures])

# The CTE flags every occupied reading with checks computed over its
# (date, school_id, classroom) group; flagged readings are filtered out
# before grouping by School_ID and YearWeek.
# The WHERE clause has to come before GROUP BY for the statement to parse.
# The source table is interpolated from `table_location` rather than being
# repeated as a literal.
# NOTE(review): the mean-PM10 cut-off (3500) is the same value as the CO2
# cut-off - confirm this is intended.
week_agg_sql = f"""
    WITH AIRQ_w_conditions AS (
        SELECT *,
            COUNT(CO2) OVER(PARTITION BY date, school_id, classroom) < 350 AS readings_below_threshold,
            AVG(CO2) OVER(PARTITION BY date, school_id, classroom) >= 3500 AS over_CO2_threshold,
            AVG(PM10) OVER(PARTITION BY date, school_id, classroom) >= 3500 AS over_mean_PM10_threshold,
            MAX(PM10) OVER(PARTITION BY date, school_id, classroom) >= 10000 AS over_max_PM10_threshold,
            AVG(temp) OVER(PARTITION BY date, school_id, classroom) >= 37.5 AS over_temp_threshold,
        FROM `{table_location}`
        WHERE occupied 
    )
    SELECT School_ID, DATE(MIN(TIMESTAMP_SECONDS(DateTime))) AS week_start,
        {agg_aq_sql},
    FROM AIRQ_w_conditions
    WHERE NOT readings_below_threshold AND NOT over_CO2_threshold 
        AND NOT over_mean_pm10_threshold AND NOT over_max_PM10_threshold 
        AND NOT over_temp_threshold
    GROUP BY School_ID, YearWeek
"""

print(week_agg_sql)


    WITH AIRQ_w_conditions AS (
        SELECT *,
            COUNT(CO2) OVER(PARTITION BY date, school_id, classroom) < 350 AS readings_below_threshold,
            AVG(CO2) OVER(PARTITION BY date, school_id, classroom) >= 3500 AS over_CO2_threshold,
            AVG(PM10) OVER(PARTITION BY date, school_id, classroom) >= 3500 AS over_mean_PM10_threshold,
            MAX(PM10) OVER(PARTITION BY date, school_id, classroom) >= 10000 AS over_max_PM10_threshold,
            AVG(temp) OVER(PARTITION BY date, school_id, classroom) >= 37.5 AS over_temp_threshold,
        FROM `yhcr-prd-phm-bia-core.CY_CLASS_ACT.df_AIRQ`
        WHERE occupied 
    )
    SELECT School_ID, DATE(MIN(TIMESTAMP_SECONDS(DateTime))) AS week_start,
        MIN(TVOC) as TVOC_min, MAX(TVOC) as TVOC_max, AVG(TVOC) as TVOC_mean, APPROX_QUANTILES(TVOC, 10)[OFFSET(5)] AS TVOC_median, STDDEV(TVOC) AS TVOC_sd, APPROX_QUANTILES(TVOC, 10)[OFFSET(9)] AS TVOC_pctl_90, APPROX_QUANTILES(TVOC, 10)[OFFSET(1)] AS TVOC_pctl_10, MIN(CO

In [129]:
%%bigquery 

-- Weekly per-school summary statistics of the AQ measures.
-- CREATE OR REPLACE keeps this cell re-runnable: a plain CREATE VIEW errors
-- when the view already exists.
-- The CTE flags every occupied reading with checks computed over its
-- (date, school_id, classroom) group; flagged readings are excluded before
-- grouping by School_ID and YearWeek.
CREATE OR REPLACE VIEW `yhcr-prd-phm-bia-core.CY_CLASS_ACT.stats_week_school` AS
WITH AIRQ_w_conditions AS (
    SELECT *,
        COUNT(CO2) OVER(PARTITION BY date, school_id, classroom) < 350 AS readings_below_threshold,
        AVG(CO2) OVER(PARTITION BY date, school_id, classroom) >= 3500 AS over_CO2_threshold,
        AVG(PM10) OVER(PARTITION BY date, school_id, classroom) >= 3500 AS over_mean_PM10_threshold,
        MAX(PM10) OVER(PARTITION BY date, school_id, classroom) >= 10000 AS over_max_PM10_threshold,
        AVG(temp) OVER(PARTITION BY date, school_id, classroom) >= 37.5 AS over_temp_threshold,
    FROM `yhcr-prd-phm-bia-core.CY_CLASS_ACT.df_AIRQ`
    WHERE occupied 
)
SELECT School_ID, DATE(MIN(TIMESTAMP_SECONDS(DateTime))) AS week_start,
    MIN(TVOC) as TVOC_min, MAX(TVOC) as TVOC_max, AVG(TVOC) as TVOC_mean, 
    APPROX_QUANTILES(TVOC, 10)[OFFSET(5)] AS TVOC_median, STDDEV(TVOC) AS TVOC_sd, 
    APPROX_QUANTILES(TVOC, 10)[OFFSET(9)] AS TVOC_pctl_90, 
    APPROX_QUANTILES(TVOC, 10)[OFFSET(1)] AS TVOC_pctl_10, MIN(CO2) as CO2_min, 
    MAX(CO2) as CO2_max, AVG(CO2) as CO2_mean, 
    APPROX_QUANTILES(CO2, 10)[OFFSET(5)] AS CO2_median, STDDEV(CO2) AS CO2_sd, 
    APPROX_QUANTILES(CO2, 10)[OFFSET(9)] AS CO2_pctl_90, 
    APPROX_QUANTILES(CO2, 10)[OFFSET(1)] AS CO2_pctl_10, MIN(PM1) as PM1_min, 
    MAX(PM1) as PM1_max, AVG(PM1) as PM1_mean, 
    APPROX_QUANTILES(PM1, 10)[OFFSET(5)] AS PM1_median, STDDEV(PM1) AS PM1_sd, 
    APPROX_QUANTILES(PM1, 10)[OFFSET(9)] AS PM1_pctl_90, 
    APPROX_QUANTILES(PM1, 10)[OFFSET(1)] AS PM1_pctl_10, MIN(PM2_5) as PM2_5_min, 
    MAX(PM2_5) as PM2_5_max, AVG(PM2_5) as PM2_5_mean, 
    APPROX_QUANTILES(PM2_5, 10)[OFFSET(5)] AS PM2_5_median, STDDEV(PM2_5) AS PM2_5_sd, 
    APPROX_QUANTILES(PM2_5, 10)[OFFSET(9)] AS PM2_5_pctl_90, 
    APPROX_QUANTILES(PM2_5, 10)[OFFSET(1)] AS PM2_5_pctl_10, MIN(PM4) as PM4_min, 
    MAX(PM4) as PM4_max, AVG(PM4) as PM4_mean, 
    APPROX_QUANTILES(PM4, 10)[OFFSET(5)] AS PM4_median, STDDEV(PM4) AS PM4_sd, 
    APPROX_QUANTILES(PM4, 10)[OFFSET(9)] AS PM4_pctl_90, 
    APPROX_QUANTILES(PM4, 10)[OFFSET(1)] AS PM4_pctl_10, MIN(PM10) as PM10_min, 
    MAX(PM10) as PM10_max, AVG(PM10) as PM10_mean, 
    APPROX_QUANTILES(PM10, 10)[OFFSET(5)] AS PM10_median, STDDEV(PM10) AS PM10_sd, 
    APPROX_QUANTILES(PM10, 10)[OFFSET(9)] AS PM10_pctl_90, 
    APPROX_QUANTILES(PM10, 10)[OFFSET(1)] AS PM10_pctl_10, MIN(Temp) as Temp_min, 
    MAX(Temp) as Temp_max, AVG(Temp) as Temp_mean, 
    APPROX_QUANTILES(Temp, 10)[OFFSET(5)] AS Temp_median, STDDEV(Temp) AS Temp_sd, 
    APPROX_QUANTILES(Temp, 10)[OFFSET(9)] AS Temp_pctl_90, 
    APPROX_QUANTILES(Temp, 10)[OFFSET(1)] AS Temp_pctl_10, MIN(RH) as RH_min, 
    MAX(RH) as RH_max, AVG(RH) as RH_mean, 
    APPROX_QUANTILES(RH, 10)[OFFSET(5)] AS RH_median, STDDEV(RH) AS RH_sd, 
    APPROX_QUANTILES(RH, 10)[OFFSET(9)] AS RH_pctl_90, 
    APPROX_QUANTILES(RH, 10)[OFFSET(1)] AS RH_pctl_10
FROM AIRQ_w_conditions
WHERE NOT readings_below_threshold AND NOT over_CO2_threshold 
    AND NOT over_mean_pm10_threshold AND NOT over_max_PM10_threshold 
    AND NOT over_temp_threshold
GROUP BY School_ID, YearWeek

Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 1119.97query/s]


In [124]:
%%bigquery result

-- Materialise the weekly view into a dated snapshot table.
-- IF NOT EXISTS lets the notebook be re-run without this cell erroring, and
-- leaves a snapshot that already exists untouched.
CREATE TABLE IF NOT EXISTS `yhcr-prd-phm-bia-core.CY_CLASS_ACT.stats_week_school_21_09_22` AS
SELECT *
FROM `yhcr-prd-phm-bia-core.CY_CLASS_ACT.stats_week_school` 


Query complete after 0.00s: 100%|██████████| 10/10 [00:00<00:00, 6304.38query/s]                       


In [127]:
%%bigquery result

-- Read the dated weekly snapshot table into the `result` DataFrame.
SELECT weekly_stats.*
FROM `yhcr-prd-phm-bia-core.CY_CLASS_ACT.stats_week_school_21_09_22` AS weekly_stats


Query complete after 0.00s: 100%|██████████| 2/2 [00:00<00:00, 965.98query/s]                         
Downloading: 100%|██████████| 1233/1233 [00:00<00:00, 1389.96rows/s]


In [128]:
# Display the weekly per-school statistics held in `result`.
result

Unnamed: 0,School_ID,week_start,TVOC_min,TVOC_max,TVOC_mean,TVOC_median,TVOC_sd,TVOC_pctl_90,TVOC_pctl_10,CO2_min,...,Temp_sd,Temp_pctl_90,Temp_pctl_10,RH_min,RH_max,RH_mean,RH_median,RH_sd,RH_pctl_90,RH_pctl_10
0,H09,2021-11-15,2.0,994.0,530.487834,534.0,287.164059,922.0,108.0,384.0,...,1.388008,23.5,20.1,41.0,73.0,59.163857,60.0,4.794400,65.0,53.0
1,H10,2021-11-15,4.0,996.0,539.586646,560.0,295.383130,926.0,116.0,369.0,...,2.144838,22.6,16.8,43.0,71.0,56.562824,56.0,6.173005,65.0,49.0
2,H03,2021-11-15,4.0,994.0,508.966878,492.0,282.614311,904.0,132.0,405.0,...,1.500243,24.3,20.5,41.0,68.0,54.734612,55.0,5.362926,61.0,47.0
3,H01,2021-11-15,0.0,1000.0,595.140237,664.0,335.544203,990.0,118.0,391.0,...,1.794608,23.7,18.9,39.0,74.0,54.088438,54.0,6.385231,63.0,46.0
4,H04,2021-11-15,12.0,1000.0,646.754355,720.0,287.580516,968.0,196.0,396.0,...,1.627774,24.0,20.4,37.0,69.0,52.675675,53.0,5.212633,59.0,46.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1228,C04,2021-11-08,0.0,1000.0,682.922897,762.0,279.441813,984.0,212.0,407.0,...,1.710891,24.0,19.6,40.0,74.0,55.227416,55.0,5.351586,62.0,48.0
1229,C03,2021-11-08,4.0,1000.0,655.026699,700.0,269.545600,966.0,234.0,398.0,...,1.553967,25.9,21.9,31.0,68.0,51.413877,52.0,5.682700,58.0,44.0
1230,C14,2021-11-08,0.0,998.0,521.500879,498.0,276.985426,912.0,164.0,413.0,...,1.406679,26.3,22.5,34.0,61.0,49.242849,49.0,3.810179,53.0,44.0
1231,UV03,2021-11-08,2.0,1000.0,648.956515,750.0,306.567359,974.0,148.0,399.0,...,1.677619,24.8,20.4,42.0,66.0,55.698680,56.0,4.133560,61.0,50.0
