# Week-by-Week Aggregated Air Quality across the North-East

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from datetime import datetime, date

## Plot defaults: base font size and default timezone
matplotlib.rcParams.update({
    'font.size': 13,
    'timezone': 'Europe/London'
})

## The cells below modify filtered DataFrames, which makes pandas emit
## SettingWithCopyWarning. Silence only that warning (matched on the start of its
## message text) instead of every warning, so that deprecation and index-alignment
## warnings from pandas/matplotlib remain visible.
import warnings
warnings.filterwarnings('ignore', message=r'\s*A value is trying to be set on a copy of a slice')

In [None]:
## Read the cached baseline and update air-quality DataFrames
## NOTE(review): unpickling can execute arbitrary code - only load cache files
## that this project produced itself.
baselineAQData = pd.read_pickle('../cache/baseline-airquality-airmon-api.pkl')
recentAQData = pd.read_pickle('../cache/update-airquality-airmon-api.pkl')

## Report the newest reading in the update dataset
## (the largest Timestamp value is parsed as 'YYYY-MM-DD HH:MM:SS' text)
lastReading = datetime.strptime(recentAQData['Timestamp'].max(), '%Y-%m-%d %H:%M:%S')
print('Last data obtained %s' % lastReading.strftime('%d %B %Y %H:%M'))

In [None]:
## Pre-process Dataframes
## Baseline

## Remove suspect readings.
## The explicit == False comparison also excludes rows whose flag is missing (NaN).
## .copy() gives an independent DataFrame, so the column assignments below do not
## write to a slice of baselineAQData (the source of SettingWithCopyWarning).
baselineData = baselineAQData[baselineAQData["Flagged as Suspect Reading"]==False].copy()

## Add columns with weekday (Monday=0) and hour - parse the timestamps once and reuse
baselineTimestamps = pd.to_datetime(baselineData['Timestamp'])
baselineData["Weekday"] = baselineTimestamps.dt.weekday
baselineData["Hour"] = baselineTimestamps.dt.hour

## Remove columns unnecessary for analysis
baselineData = baselineData.drop(columns=[
    'Flagged as Suspect Reading', 'Location (WKT)', 'Ground Height Above Sea Level',
    'Sensor Height Above Ground', 'Broker Name', 'Third Party', 'Sensor Centroid Longitude',
    'Sensor Centroid Latitude', 'Raw ID', 'Units', 'Timestamp', 'Sensor Name',
])

In [None]:
## Pre-process Dataframes
## Recent

## Restrict to first Monday of analysis window.
## Timestamp is compared as text; that ordering is chronological for zero-padded
## 'YYYY-MM-DD HH:MM:SS' values.
startDate = '2020-03-09 00:00:00'
inWindow = recentAQData['Timestamp'] >= startDate

## Remove suspect readings (rows with a missing flag are excluded as well)
notSuspect = recentAQData["Flagged as Suspect Reading"] == False

## .copy() gives an independent DataFrame, so the column assignments below do not
## write to a slice of recentAQData (the source of SettingWithCopyWarning).
recentData = recentAQData[inWindow & notSuspect].copy()

## Add columns with weekday (Monday=0), week number and hour - parse timestamps once
recentTimestamps = pd.to_datetime(recentData['Timestamp'])
recentData["Weekday"] = recentTimestamps.dt.weekday

## Series.dt.week was deprecated in pandas 1.1 and removed in pandas 2.0.
## dt.isocalendar().week gives the same ISO 8601 week number; fall back to .dt.week
## only on pandas versions that pre-date isocalendar().
try:
    isoWeek = recentTimestamps.dt.isocalendar().week
except AttributeError:
    isoWeek = recentTimestamps.dt.week
## isocalendar() returns a nullable UInt32 column - store plain integers as before
recentData["WeekNumber"] = isoWeek.astype(int)

recentData["Hour"] = recentTimestamps.dt.hour

## Remove columns unnecessary for analysis ('Units' is kept for the axis labels)
recentData = recentData.drop(columns=[
    'Flagged as Suspect Reading', 'Location (WKT)', 'Ground Height Above Sea Level',
    'Sensor Height Above Ground', 'Broker Name', 'Third Party', 'Sensor Centroid Longitude',
    'Sensor Centroid Latitude', 'Raw ID', 'Timestamp', 'Sensor Name',
])

In [None]:
## List of Week Numbers, sorted ascending so the weekly subplots run chronologically
weekNumbers = np.sort(recentData["WeekNumber"].astype(int).unique())

## Map each week number to the date of its Monday (formatted e.g. '09-March-2020').
## WeekNumber is an ISO 8601 week, so it is parsed with the ISO directives
## (%G ISO year, %V ISO week, %u ISO weekday where 1 = Monday). The previous %W-based
## parse counts weeks from the first Monday of the year and needed a manual "-1"
## offset, which is only correct for years whose ISO week 1 starts before that Monday.
## NOTE: the year is fixed at 2020, so data from other years would be mislabelled.
weekMondays = {}
for wn in weekNumbers:
    ## Named weekStart (not "date") so the imported datetime.date is not overwritten
    weekStart = datetime.strptime('2020-W{:02d}-1'.format(int(wn)), '%G-W%V-%u')
    weekMondays[wn] = weekStart.strftime('%d-%B-%Y')

In [None]:
## Pollutants to plot - one figure is produced per entry, so adding a name here adds a plot
variables = [
    'NO2',
    'PM2.5',
    'PM10',
]

In [None]:
## Columns that define one point of the weekly profile (Weekday: Monday=0, Hour of day)
aggregateColumns = ['Weekday', 'Hour']
## Only these columns are aggregated, so any other (possibly non-numeric) column
## cannot break median()/quantile()
valueColumns = aggregateColumns + ['Value']

## X axis is hour-of-week (0 = Mon 00:00 ... 167 = Sun 23:00), ticked every 12 hours.
## matplotlib requires exactly one label per tick: 14 positions, 14 labels.
tickPositions = list(range(0, 168, 12))
tickLabels = ['%s %s' % (day, hour)
              for day in ['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun']
              for hour in ['00', '12']]

for v in variables:
    # Select Records for Specific Variable
    dfBaseline = baselineData.loc[baselineData["Variable"]==v, valueColumns]
    dfRecent = recentData[recentData["Variable"]==v]
    if dfRecent.empty:
        ## Nothing to plot (and no Units value to label the axis with)
        print('No recent data for %s - skipping plot' % v)
        continue
    yaxLabel = v + ' ('+ dfRecent['Units'].iloc[0] +')'
    dfRecent = dfRecent[valueColumns + ['WeekNumber']]

    ## Aggregate Baseline Data: median and 15th/85th percentiles per weekday/hour
    baselineGroups = dfBaseline.groupby(aggregateColumns, group_keys=False, as_index=False)
    baselineMedian = baselineGroups.median()
    baselineLQ = baselineGroups.quantile(.15)
    baselineHQ = baselineGroups.quantile(.85)
    ## Plot against hour-of-week rather than the positional row index, so values stay at
    ## the right time of week even if some weekday/hour groups are missing
    baselineX = baselineMedian['Weekday'] * 24 + baselineMedian['Hour']

    ## Find Max Y Value for plots
    ## Taken as Max 95 %ile across all recent data - rounded to nearest 10
    maxY = dfRecent[valueColumns].groupby(aggregateColumns, group_keys=False, as_index=False).quantile(.95)
    maxValue = maxY["Value"].max()
    maxYRound = round(maxValue / 10.0) * 10 if pd.notna(maxValue) else 0
    if maxYRound <= 0:
        ## Small or missing values would round to 0 and give a degenerate axis
        maxYRound = 10

    ## Set up Plot/Subplot for Variable - one row per week.
    ## squeeze=False keeps axs two-dimensional even when there is a single week.
    nrows = len(weekNumbers)
    fig, axs = plt.subplots(nrows, 1, figsize=(18, nrows * 6.5), squeeze=False)

    for ax, wk in zip(axs[:, 0], weekNumbers):
        ## Build the mask from dfRecent itself so it is aligned with the rows it filters
        wkRecentData = dfRecent.loc[dfRecent['WeekNumber']==wk, valueColumns]
        weekLabel = weekMondays[wk]

        ## Aggregate Recent Data for this week
        recentGroups = wkRecentData.groupby(aggregateColumns, group_keys=False, as_index=False)
        recentMedian = recentGroups.median()
        recentLQ = recentGroups.quantile(.15)
        recentHQ = recentGroups.quantile(.85)
        ## A partial week has fewer groups; hour-of-week keeps them correctly placed
        recentX = recentMedian['Weekday'] * 24 + recentMedian['Hour']

        ax.set_title("Week Starting " + weekLabel, fontsize=12)
        ax.set_xlim(0, 167)
        ax.set_ylim(0, maxYRound)
        ax.set_xlabel('Day/Hour')
        ax.set_xticks(tickPositions)
        ax.set_xticklabels(tickLabels)
        ax.set_ylabel(yaxLabel)

        ## Plot Quantiles
        ## The LQ/HQ frames come from the same groups as the median, so rows line up with X
        ax.fill_between(x=baselineX, y1=baselineLQ['Value'], y2=baselineHQ['Value'],
                        color="#f64a8a", alpha=0.2, linewidth=0,
                        label='15 to 85%ile: Nov 2019 - Feb 2020')
        ax.fill_between(x=recentX, y1=recentLQ['Value'], y2=recentHQ['Value'],
                        color="#233067", alpha=0.2, linewidth=0,
                        label="15 to 85%ile: NE Stations: Week "+weekLabel)

        ## Plot Median Lines
        ax.plot(baselineX, baselineMedian["Value"], color="#f64a8a", linestyle=':', alpha=0.4,
                label="Median: Nov 2019 - Feb 2020")
        ax.plot(recentX, recentMedian["Value"], color="#233067",
                label="Median: NE Stations: Week "+weekLabel)

        ax.legend(loc=0, prop={'size': 9})

    fig.suptitle('North-East Aggregated Air Quality - ' + v, x=0.5, y=0.91, fontsize=15, fontweight='bold')

    ## Attribution footer
    fig.text(
        0.09,
        0.09,
        'Urban Observatory (https://www.urbanobservatory.ac.uk/).\n'
        'Miles Clement <m.a.clement2@ncl.ac.uk>.',
        horizontalalignment='left',
        color='#606060',
        fontdict={'size': 11}
    )

    ## Save as ../output/airquality-aggregated-<variable>-<day>-<month>.png
    var = v.lower()
    timestamp = datetime.now().strftime("%d-%m")
    fig.savefig('../output/airquality-aggregated-' + var + '-' + timestamp + '.png', bbox_inches='tight')
    plt.show()
    ## Release the figure so repeated runs do not accumulate open figures
    plt.close(fig)