# Air Quality over Time - Working with dates and times

For this module, we will be exploring the techniques for working with dates and times. 

You will learn about:

* Parsing Dates
* Cleaning up data
* Fixing "Numerical" Strings
* Making functions
* Timezones
* Resampling
* Rolling operations
* Plotting

# Air Quality Data

https://archive.ics.uci.edu/ml/datasets/Air+Quality

0. Date (DD/MM/YYYY)
1. Time (HH.MM.SS)
2. True hourly averaged concentration CO in mg/m^3 (reference analyzer)
3. PT08.S1 (tin oxide) hourly averaged sensor response (nominally CO targeted)
4. True hourly averaged overall Non-Methane Hydrocarbons (NMHC) concentration in microg/m^3 (reference analyzer)
5. True hourly averaged Benzene concentration in microg/m^3 (reference analyzer)
6. PT08.S2 (titania) hourly averaged sensor response (nominally NMHC targeted)
7. True hourly averaged NOx concentration in ppb (reference analyzer)
8. PT08.S3 (tungsten oxide) hourly averaged sensor response (nominally NOx targeted)
9. True hourly averaged NO2 concentration in microg/m^3 (reference analyzer)
10. PT08.S4 (tungsten oxide) hourly averaged sensor response (nominally NO2 targeted)
11. PT08.S5 (indium oxide) hourly averaged sensor response (nominally O3 targeted)
12. Temperature in °C
13. Relative Humidity (%)
14. AH Absolute Humidity

In [None]:
# Expect an error (for demonstration purposes)
# ValueError: Found non-unique column index

import numpy as np
import pandas as pd
import zipfile
# Bind the archive to `archive` rather than `zip` so the built-in zip() is not
# shadowed for the rest of the session; the inner `with` closes the CSV member
# handle as soon as read_csv is done with it (or has raised).
with zipfile.ZipFile('data/AirQualityUCI.zip') as archive:
  with archive.open('AirQualityUCI.csv') as csv_file:
    df = pd.read_csv(csv_file, sep=';',
                     dtype_backend='pyarrow', engine='pyarrow')

df

In [None]:
# Bind the archive to `archive` rather than `zip` so the built-in zip() is not
# shadowed for the rest of the session; the inner `with` closes the CSV member
# handle as soon as it has been read.
with zipfile.ZipFile('data/AirQualityUCI.zip') as archive:
  with archive.open('AirQualityUCI.csv') as csv_file:
    # engine='pyarrow' is deliberately left off here: only the dtype backend
    # is pyarrow, the parsing is done by pandas' default engine.
    df = pd.read_csv(csv_file, sep=';', dtype_backend='pyarrow')

df

## Clean up Data

In [None]:
(df
 .isna()
 .sum(axis='columns')
)

In [None]:
df.shape

In [None]:
(df
 .loc[df.isna().sum(axis='columns') < 17]
)

## Date Parsing

In [None]:
(df
 .loc[df.isna().sum(axis='columns') < 17]
 .assign(date=lambda df_: (df_.Date + ' ' + \
                    df_.Time.replace(r'\.', ':', regex=True)))
 .loc[:, ['Date', 'Time', 'date']]
)

In [None]:
# Expect an error (for demonstration purposes)
# ValueError: time data "13/03/2004 00:00:00" doesn't match format "%m/%d/%Y %H:%M:%S":

(df
 .loc[df.isna().sum(axis='columns') < 17]
 .assign(date=lambda df_: pd.to_datetime(df_.Date + ' ' + \
                    df_.Time.replace(r'\.', ':', regex=True)))
)

In [None]:
(df
.Date
.value_counts())

In [None]:
(df
 .loc[df.isna().sum(axis='columns') < 17]
 .assign(date=lambda df_: pd.to_datetime(df_.Date + ' ' + \
                    df_.Time.replace(r'\.', ':', regex=True),
                                        dayfirst=True))
)

In [None]:
# Alternate
(df
 .loc[df.isna().sum(axis='columns') < 17]
 .assign(date=lambda df_: pd.to_datetime(df_.Date + ' ' + \
                    df_.Time.replace(r'\.', ':', regex=True),
                    format='%d/%m/%Y %H:%M:%S'))
)

In [None]:
(df
 .loc[df.isna().sum(axis='columns') < 17]
 .assign(date=lambda df_: pd.to_datetime(df_.Date + ' ' + \
                    df_.Time.replace(r'\.', ':', regex=True),
                    format='%d/%m/%Y %H:%M:%S'))
 .dtypes
)

## Rename Columns

In [None]:
(df
 .loc[df.isna().sum(axis='columns') < 17]
 .assign(date=lambda df_: pd.to_datetime(df_.Date + ' ' + \
                    df_.Time.replace(r'\.', ':', regex=True),
                    format='%d/%m/%Y %H:%M:%S'))
 .rename(columns={'CO(GT)': 'carbon_monoxide',
                  'PT08.S1(CO)': 'tin_oxide',
                  'NMHC(GT)': 'non_metallic_hydro_carb',
                  'C6H6(GT)': 'benzene',
                  'PT08.S2(NMHC)': 'titania',
                  'NOx(GT)': 'nox',
                  'PT08.S3(NOx)': 'tungsten_oxide_nox',
                  'NO2(GT)': 'no2',
                  'PT08.S4(NO2)':  'tungsten_oxide_no2',
                  'PT08.S5(O3)': 'indium_oxide',
                  'T': 'temp',
                  'RH': 'rel_humid',
                  'AH': 'abs_humid'})
 
 .columns
)

In [None]:
df['Unnamed: 15'].value_counts()

In [None]:
df['Unnamed: 16'].value_counts()

In [None]:
(df
 .loc[df.isna().sum(axis='columns') < 17]
 .assign(date=lambda df_: pd.to_datetime(df_.Date + ' ' + \
                    df_.Time.replace(r'\.', ':', regex=True),
                    format='%d/%m/%Y %H:%M:%S'))
 .rename(columns={'CO(GT)': 'carbon_monoxide',
                  'PT08.S1(CO)': 'tin_oxide',
                  'NMHC(GT)': 'non_metallic_hydro_carb',
                  'C6H6(GT)': 'benzene',
                  'PT08.S2(NMHC)': 'titania',
                  'NOx(GT)': 'nox',
                  'PT08.S3(NOx)': 'tungsten_oxide_nox',
                  'NO2(GT)': 'no2',
                  'PT08.S4(NO2)':  'tungsten_oxide_no2',
                  'PT08.S5(O3)': 'indium_oxide',
                  'T': 'temp',
                  'RH': 'rel_humid',
                  'AH': 'abs_humid'})
 .loc[:, ['carbon_monoxide', 'tin_oxide',
       'non_metallic_hydro_carb', 'benzene', 'titania', 'nox',
       'tungsten_oxide_nox', 'no2', 'tungsten_oxide_no2', 'indium_oxide',
       'temp', 'rel_humid', 'abs_humid', 'date']]
)

## Fixing Numerical "Strings"

In [None]:
(df
 .loc[df.isna().sum(axis='columns') < 17]
 .assign(date=lambda df_: pd.to_datetime(df_.Date + ' ' + \
                    df_.Time.replace(r'\.', ':', regex=True),
                    format='%d/%m/%Y %H:%M:%S'))
 .rename(columns={'CO(GT)': 'carbon_monoxide',
                  'PT08.S1(CO)': 'tin_oxide',
                  'NMHC(GT)': 'non_metallic_hydro_carb',
                  'C6H6(GT)': 'benzene',
                  'PT08.S2(NMHC)': 'titania',
                  'NOx(GT)': 'nox',
                  'PT08.S3(NOx)': 'tungsten_oxide_nox',
                  'NO2(GT)': 'no2',
                  'PT08.S4(NO2)':  'tungsten_oxide_no2',
                  'PT08.S5(O3)': 'indium_oxide',
                  'T': 'temp',
                  'RH': 'rel_humid',
                  'AH': 'abs_humid'})
 .loc[:, ['carbon_monoxide', 'tin_oxide',
       'non_metallic_hydro_carb', 'benzene', 'titania', 'nox',
       'tungsten_oxide_nox', 'no2', 'tungsten_oxide_no2', 'indium_oxide',
       'temp', 'rel_humid', 'abs_humid', 'date']]
 .select_dtypes('string')
)

In [None]:
(df
 .loc[df.isna().sum(axis='columns') < 17]
 .assign(date=lambda df_: pd.to_datetime(df_.Date + ' ' + \
                    df_.Time.replace(r'\.', ':', regex=True),
                    format='%d/%m/%Y %H:%M:%S'))
 .rename(columns={'CO(GT)': 'carbon_monoxide',
                  'PT08.S1(CO)': 'tin_oxide',
                  'NMHC(GT)': 'non_metallic_hydro_carb',
                  'C6H6(GT)': 'benzene',
                  'PT08.S2(NMHC)': 'titania',
                  'NOx(GT)': 'nox',
                  'PT08.S3(NOx)': 'tungsten_oxide_nox',
                  'NO2(GT)': 'no2',
                  'PT08.S4(NO2)':  'tungsten_oxide_no2',
                  'PT08.S5(O3)': 'indium_oxide',
                  'T': 'temp',
                  'RH': 'rel_humid',
                  'AH': 'abs_humid'})
 .loc[:, ['carbon_monoxide', 'tin_oxide',
       'non_metallic_hydro_carb', 'benzene', 'titania', 'nox',
       'tungsten_oxide_nox', 'no2', 'tungsten_oxide_no2', 'indium_oxide',
       'temp', 'rel_humid', 'abs_humid', 'date']]
 .pipe(lambda df_: df_.assign(**{col: df_[col].str.replace(',','.').astype('float[pyarrow]')
                                for col in
      ['carbon_monoxide', 'benzene', 'temp', 'rel_humid', 'abs_humid']}))
)

In [None]:
(df
 .loc[df.isna().sum(axis='columns') < 17]
 .assign(date=lambda df_: pd.to_datetime(df_.Date + ' ' + \
                    df_.Time.replace(r'\.', ':', regex=True),
                    format='%d/%m/%Y %H:%M:%S'))
 .rename(columns={'CO(GT)': 'carbon_monoxide',
                  'PT08.S1(CO)': 'tin_oxide',
                  'NMHC(GT)': 'non_metallic_hydro_carb',
                  'C6H6(GT)': 'benzene',
                  'PT08.S2(NMHC)': 'titania',
                  'NOx(GT)': 'nox',
                  'PT08.S3(NOx)': 'tungsten_oxide_nox',
                  'NO2(GT)': 'no2',
                  'PT08.S4(NO2)':  'tungsten_oxide_no2',
                  'PT08.S5(O3)': 'indium_oxide',
                  'T': 'temp',
                  'RH': 'rel_humid',
                  'AH': 'abs_humid'})
 .loc[:, ['carbon_monoxide', 'tin_oxide',
       'non_metallic_hydro_carb', 'benzene', 'titania', 'nox',
       'tungsten_oxide_nox', 'no2', 'tungsten_oxide_no2', 'indium_oxide',
       'temp', 'rel_humid', 'abs_humid', 'date']]
 .pipe(lambda df_: df_.assign(**{col: df_[col].str.replace(',','.').astype('float[pyarrow]')
                                for col in
      ['carbon_monoxide', 'benzene', 'temp', 'rel_humid', 'abs_humid']}))
 .dtypes
)

## Make a Function

In [None]:
import numpy as np
import pandas as pd

import zipfile
# Bind the archive to `archive` rather than `zip` so the built-in zip() is not
# shadowed; the inner `with` closes the CSV member handle once it is read.
with zipfile.ZipFile('data/AirQualityUCI.zip') as archive:
  with archive.open('AirQualityUCI.csv') as csv_file:
    df = pd.read_csv(csv_file, sep=';', dtype_backend='pyarrow')

df

In [None]:
def tweak_air_qual(df):
  """Return a cleaned copy of the raw Air Quality table.

  The pipeline:

  1. drops rows in which every column is missing,
  2. joins ``Date`` (DD/MM/YYYY) and ``Time`` (HH.MM.SS) into one datetime
     column named ``date``,
  3. renames the measurement columns to snake_case names,
  4. keeps only the renamed measurement columns plus ``date``,
  5. converts the decimal-comma string columns to ``float[pyarrow]``.

  Parameters
  ----------
  df : pandas.DataFrame
      Raw frame with the original ``AirQualityUCI.csv`` column names.

  Returns
  -------
  pandas.DataFrame
      A new frame; ``df`` itself is not modified.
  """
  new_names = {'CO(GT)': 'carbon_monoxide',
               'PT08.S1(CO)': 'tin_oxide',
               'NMHC(GT)': 'non_metallic_hydro_carb',
               'C6H6(GT)': 'benzene',
               'PT08.S2(NMHC)': 'titania',
               'NOx(GT)': 'nox',
               'PT08.S3(NOx)': 'tungsten_oxide_nox',
               'NO2(GT)': 'no2',
               'PT08.S4(NO2)': 'tungsten_oxide_no2',
               'PT08.S5(O3)': 'indium_oxide',
               'T': 'temp',
               'RH': 'rel_humid',
               'AH': 'abs_humid'}
  # These columns arrive as strings because they use ',' as decimal separator.
  comma_decimal_cols = ['carbon_monoxide', 'benzene', 'temp',
                        'rel_humid', 'abs_humid']
  return (df
     # Drop fully-empty rows. (Previously `isna().sum(axis='columns') < 17`,
     # which only worked when the frame had exactly 17 columns.)
     .dropna(how='all')
     .assign(date=lambda df_: pd.to_datetime(
         df_.Date + ' ' + df_.Time.replace(r'\.', ':', regex=True),
         format='%d/%m/%Y %H:%M:%S'))
     .rename(columns=new_names)
     # dict order is insertion order, so this keeps the original column order
     .loc[:, [*new_names.values(), 'date']]
     .pipe(lambda df_: df_.assign(**{
         col: df_[col].str.replace(',', '.').astype('float[pyarrow]')
         for col in comma_decimal_cols}))
    )

air_df = tweak_air_qual(df)  

In [None]:
air_df

## Convert to UTC

In [None]:
air_df.date

In [None]:
air_df.date.dt.tz_localize(tz='UTC')

In [None]:
# convert_dtypes() takes boolean flags (its first positional argument is
# `infer_objects`), not a target dtype, so it never produced a pyarrow
# timestamp; astype() performs the intended cast.
air_df.date.astype('timestamp[ns][pyarrow]').dt.tz_localize('UTC')

## Convert to Italian Time

In [None]:
import pytz

In [None]:
pytz.all_timezones

In [None]:
print([tz for tz in pytz.all_timezones if 'Europe' in tz])

In [None]:
# Expect an error (for demonstration purposes)
# NonExistentTimeError: 2004-03-28 02:00:00

air_df.date.dt.tz_localize('Europe/Rome')

In [None]:
air_df.date.dt.tz_localize(tz='UTC').dt.tz_convert('Europe/Rome')

In [None]:
(air_df
 .date
 # astype(), not convert_dtypes(): the latter's first positional argument is
 # the `infer_objects` flag, so the dtype string was silently ignored
 .astype('timestamp[ns][pyarrow]')
 .dt.tz_localize('UTC')
 .dt.tz_convert('Europe/Rome')
)

In [None]:
# Note: 'CET' is accepted here too, but it is an abbreviation rather than a
# region/city zone name such as 'Europe/Rome'
air_df.date.dt.tz_localize(tz='UTC').dt.tz_convert('CET')

## Plotting

In [None]:
(air_df
 .assign(date=air_df.date.dt.tz_localize(tz='UTC').dt.tz_convert('Europe/Rome'))
 .set_index('date')
 .loc[:, ['carbon_monoxide', 'benzene']]
)

In [None]:
(air_df
 # astype(), not convert_dtypes(): the latter's first positional argument is
 # the `infer_objects` flag, so the dtype string was silently ignored
 .assign(date=air_df.date
         .astype('timestamp[ns][pyarrow]')
         .dt.tz_localize(tz='UTC')
         .dt.tz_convert('Europe/Rome'))
 .set_index('date')
 .loc[:, ['carbon_monoxide', 'benzene']]
)

In [None]:
(air_df
 .assign(date=air_df.date.dt.tz_localize(tz='UTC').dt.tz_convert('Europe/Rome'))
 .set_index('date')
 .loc[:, ['carbon_monoxide', 'benzene']]
 .plot(figsize=(8,3))
)

In [None]:
(air_df
 .assign(date=air_df.date.dt.tz_localize(tz='UTC').dt.tz_convert('Europe/Rome'))
 .set_index('date')
 .loc['2004/04':'2004/05', ['carbon_monoxide', 'benzene']]
 .plot(figsize=(8,3))
)

In [None]:
(air_df
 .assign(date=air_df.date.dt.tz_localize(tz='UTC').dt.tz_convert('Europe/Rome'))
 .set_index('date')
 .loc['2004/04':'2004/05', ['carbon_monoxide', 'benzene']] 
 .replace(-200, np.nan)
 .plot(figsize=(8,3)) 
)

In [None]:
# zoom in a little more
(air_df
 .assign(date=air_df.date.dt.tz_localize(tz='UTC').dt.tz_convert('Europe/Rome'))
 .set_index('date')
 .loc['2004/04/08':'2004/04/13', ['carbon_monoxide', 'benzene']] 
 .replace(-200, np.nan)
 .plot(figsize=(8,3)) 
)

## Missing Values

In [None]:
(air_df
 .assign(date=air_df.date.dt.tz_localize(tz='UTC').dt.tz_convert('Europe/Rome'))
 .set_index('date')
 .loc['2004/04/08':'2004/04/13', ['carbon_monoxide', 'benzene']] 
 .replace(-200, np.nan)
 .plot(figsize=(8,3))
)

In [None]:
# Expect an error (for demonstration purposes)
# TypeError: ExtensionArray.interpolate() missing 1 required keyword-only argument: 'fill_value'

(air_df
 .assign(date=air_df.date.dt.tz_localize(tz='UTC').dt.tz_convert('Europe/Rome'))
 .set_index('date')
 .loc['2004/04/08':'2004/04/13', ['carbon_monoxide', 'benzene']] 
 .replace(-200, np.nan)
 .interpolate()
 .plot(figsize=(8,3))
)

In [None]:
(air_df
 .assign(date=air_df.date.dt.tz_localize(tz='UTC').dt.tz_convert('Europe/Rome'))
 .set_index('date')
 .loc['2004/04/08':'2004/04/13', ['carbon_monoxide', 'benzene']] 
 .replace(-200, np.nan)
 .astype(float)
 .interpolate()
 .plot(figsize=(8,3))
)

In [None]:
(air_df
 .assign(date=air_df.date.dt.tz_localize(tz='UTC').dt.tz_convert('Europe/Rome'))
 .set_index('date')
 .loc['2004/04/08':'2004/04/13', ['carbon_monoxide', 'benzene']] 
 .replace(-200, np.nan)
 .ffill()
 .plot(figsize=(8,3))
)

In [None]:
(air_df
 .assign(date=air_df.date.dt.tz_localize(tz='UTC').dt.tz_convert('Europe/Rome'))
 .set_index('date')
 .loc['2004/04/08':'2004/04/13', ['carbon_monoxide', 'benzene']] 
 .replace(-200, np.nan)
 .bfill()
 .plot(figsize=(8,3))
)

In [None]:
(air_df
 .assign(date=air_df.date.dt.tz_localize(tz='UTC').dt.tz_convert('Europe/Rome'))
 .set_index('date')
 .loc['2004/04/08':'2004/04/13', ['carbon_monoxide', 'benzene']] 
 .replace(-200, np.nan)
 .fillna(22)
 .plot(figsize=(8,3))
)

In [None]:
(air_df
 .assign(date=air_df.date.dt.tz_localize(tz='UTC').dt.tz_convert('Europe/Rome'))
 .set_index('date')
 .loc['2004/04/08':'2004/04/13', ['carbon_monoxide', 'benzene']] 
 .replace(-200, np.nan)
 .pipe(lambda df_: df_.fillna(df_.mean()))
 .plot(figsize=(8,3))
)

## Resampling

In [None]:
(air_df
 .assign(date=air_df.date.dt.tz_localize(tz='UTC').dt.tz_convert('Europe/Rome'))
 .set_index('date')
 .loc['2004/04/08':'2004/04/13', ['carbon_monoxide', 'benzene']] 
 .replace(-200, np.nan)
 .astype(float)
 .interpolate()
 #.plot(figsize=(8,3))
)

In [None]:
(air_df
 .assign(date=air_df.date.dt.tz_localize(tz='UTC').dt.tz_convert('Europe/Rome'))
 .set_index('date')
 .loc['2004/04/08':'2004/04/13', ['carbon_monoxide', 'benzene']] 
 .replace(-200, np.nan)
 .astype(float)
 .interpolate()
 .resample('5h')
 .mean()
 .plot(figsize=(8,3))
)

In [None]:
(air_df
 .assign(date=air_df.date.dt.tz_localize(tz='UTC').dt.tz_convert('Europe/Rome'))
 .set_index('date')
 .loc['2004/04/08':'2004/04/13', ['carbon_monoxide', 'benzene']] 
 .replace(-200, np.nan)
 .astype(float)
 .interpolate()
 .resample('2h37min')
 .mean()
)

In [None]:
(air_df
 .assign(date=air_df.date.dt.tz_localize(tz='UTC').dt.tz_convert('Europe/Rome'))
 .set_index('date')
 .loc['2004/04/08':'2004/04/13', ['carbon_monoxide', 'benzene']] 
 .replace(-200, np.nan)
 .astype(float)
 .interpolate()
 .resample('2h37min')
 .mean()
 .plot(figsize=(8,3)) 
)

In [None]:
(air_df
 .assign(date=air_df.date.dt.tz_localize(tz='UTC').dt.tz_convert('Europe/Rome'))
 .set_index('date')
 .loc['2004/04/08':'2004/05/13', ['carbon_monoxide', 'benzene']] 
 .replace(-200, np.nan)
 .astype(float)
 .interpolate()
 .resample('1d')
 .mean()
 .plot()
)

In [None]:
# Note: You can also use pd.Grouper and .groupby
(air_df
 .assign(date=air_df.date.dt.tz_localize(tz='UTC').dt.tz_convert('Europe/Rome'))
 #.set_index('date')
 #.loc['2004/04/08':'2004/05/13', ['carbon_monoxide', 'benzene']] 
 .loc[:, ['date', 'carbon_monoxide', 'benzene']] 
 .replace(-200, np.nan)
# .astype(float)
 .astype({'carbon_monoxide': float, 'benzene': float})
 .interpolate()
# #.resample('1d')
 .groupby(pd.Grouper(key='date', freq='1d'))
 .mean()
 .loc['2004/04/08':'2004/05/13']
 .plot()
)

## Rolling

In [None]:
(air_df
 .assign(date=air_df.date.dt.tz_localize(tz='UTC').dt.tz_convert('Europe/Rome'))
 .set_index('date')
 .loc['2004/04/08':'2005/05/13', ['carbon_monoxide', 'benzene']] 
 .replace(-200, np.nan)
 .astype(float)
 .interpolate()  #.interpolate works with Pandas 1.x
 .plot()
)

In [None]:
(air_df
 .assign(date=air_df.date.dt.tz_localize(tz='UTC').dt.tz_convert('Europe/Rome'))
 .set_index('date')
 .loc['2004/04/08':'2005/05/13', ['carbon_monoxide', 'benzene']] 
 .replace(-200, np.nan)
 .astype(float)
 .interpolate()
 .resample('d')
 .mean()
 .interpolate()
 .plot()
)

In [None]:
(air_df
 .assign(date=air_df.date.dt.tz_localize(tz='UTC').dt.tz_convert('Europe/Rome'))
 .set_index('date')
 .loc['2004/04/08':'2005/05/13', ['carbon_monoxide', 'benzene']] 
 .replace(-200, np.nan)
 .astype(float)
 .interpolate()
 .resample('d')
 .mean()
 .interpolate()
 .rolling(7)
 .mean()
 .plot()
)

## Fix the Function

In [None]:
# fix the -200 values
def tweak_air_qual(df):
  """Return a cleaned copy of the raw Air Quality table with -200 as missing.

  The pipeline:

  1. drops rows in which every column is missing,
  2. joins ``Date`` (DD/MM/YYYY) and ``Time`` (HH.MM.SS) into one datetime
     column named ``date``,
  3. renames the measurement columns to snake_case names,
  4. keeps only the renamed measurement columns plus ``date``,
  5. converts the decimal-comma string columns to ``float``,
  6. replaces every ``-200`` in the frame with ``NaN``.

  Parameters
  ----------
  df : pandas.DataFrame
      Raw frame with the original ``AirQualityUCI.csv`` column names.

  Returns
  -------
  pandas.DataFrame
      A new frame; ``df`` itself is not modified.
  """
  new_names = {'CO(GT)': 'carbon_monoxide',
               'PT08.S1(CO)': 'tin_oxide',
               'NMHC(GT)': 'non_metallic_hydro_carb',
               'C6H6(GT)': 'benzene',
               'PT08.S2(NMHC)': 'titania',
               'NOx(GT)': 'nox',
               'PT08.S3(NOx)': 'tungsten_oxide_nox',
               'NO2(GT)': 'no2',
               'PT08.S4(NO2)': 'tungsten_oxide_no2',
               'PT08.S5(O3)': 'indium_oxide',
               'T': 'temp',
               'RH': 'rel_humid',
               'AH': 'abs_humid'}
  # These columns arrive as strings because they use ',' as decimal separator.
  comma_decimal_cols = ['carbon_monoxide', 'benzene', 'temp',
                        'rel_humid', 'abs_humid']
  return (df
   # Drop fully-empty rows. (Previously `isna().sum(axis='columns') < 17`,
   # which only worked when the frame had exactly 17 columns.)
   .dropna(how='all')
   # An explicit format is strict, whereas dayfirst=True is only a parsing
   # preference.
   .assign(date=lambda df_: pd.to_datetime(
       df_.Date + ' ' + df_.Time.replace(r'\.', ':', regex=True),
       format='%d/%m/%Y %H:%M:%S'))
   .rename(columns=new_names)
   # dict order is insertion order, so this keeps the original column order
   .loc[:, [*new_names.values(), 'date']]
   .pipe(lambda df_: df_.assign(**{
       col: df_[col].str.replace(',', '.').astype(float)
       for col in comma_decimal_cols}))
   # fix the -200 values
   .replace(-200, np.nan)
  )

# Bind the archive to `archive` rather than `zip` so the built-in zip() is not
# shadowed; the inner `with` closes the CSV member handle once it is read.
with zipfile.ZipFile('data/AirQualityUCI.zip') as archive:
  with archive.open('AirQualityUCI.csv') as csv_file:
    # engine='pyarrow' is deliberately left off; only the dtype backend is pyarrow
    df = pd.read_csv(csv_file, sep=';', dtype_backend='pyarrow')

air_df = tweak_air_qual(df)

In [None]:
air_df