## Time Series Analysis

1. Converting objects to Datetime objects
2. Extracting other time data from Datetime objects
3. Resampling data based on time series components

In [1]:
import pandas as pd
import numpy as np

In [2]:
time_df = pd.read_csv("air_quality_no2_long.csv")
time_df.head()

Unnamed: 0,city,country,date.utc,location,parameter,value,unit
0,Paris,FR,2019-06-21 00:00:00+00:00,FR04014,no2,20.0,µg/m³
1,Paris,FR,2019-06-20 23:00:00+00:00,FR04014,no2,21.8,µg/m³
2,Paris,FR,2019-06-20 22:00:00+00:00,FR04014,no2,26.5,µg/m³
3,Paris,FR,2019-06-20 21:00:00+00:00,FR04014,no2,24.9,µg/m³
4,Paris,FR,2019-06-20 20:00:00+00:00,FR04014,no2,21.4,µg/m³


In [3]:
time_df.shape

(2068, 7)

In [4]:
time_df.columns

Index(['city', 'country', 'date.utc', 'location', 'parameter', 'value',
       'unit'],
      dtype='object')

In [5]:
time_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2068 entries, 0 to 2067
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   city       2068 non-null   object 
 1   country    2068 non-null   object 
 2   date.utc   2068 non-null   object 
 3   location   2068 non-null   object 
 4   parameter  2068 non-null   object 
 5   value      2068 non-null   float64
 6   unit       2068 non-null   object 
dtypes: float64(1), object(6)
memory usage: 113.2+ KB


In [6]:
# Changing the data type of the time column to Datetime
time_df['date.utc'] = pd.to_datetime(time_df['date.utc'], format='mixed')

# mixed format, pandas is going to infer the correct format
time_df['date.utc'].dtype

datetime64[ns, UTC]

### Extracting other time data from Datetime objects

In [7]:
# Now that we have changed the data type, we can extract other components from the datetime object

time_df['Year'] = time_df['date.utc'].dt.year
time_df.head()

Unnamed: 0,city,country,date.utc,location,parameter,value,unit,Year
0,Paris,FR,2019-06-21 00:00:00+00:00,FR04014,no2,20.0,µg/m³,2019
1,Paris,FR,2019-06-20 23:00:00+00:00,FR04014,no2,21.8,µg/m³,2019
2,Paris,FR,2019-06-20 22:00:00+00:00,FR04014,no2,26.5,µg/m³,2019
3,Paris,FR,2019-06-20 21:00:00+00:00,FR04014,no2,24.9,µg/m³,2019
4,Paris,FR,2019-06-20 20:00:00+00:00,FR04014,no2,21.4,µg/m³,2019


In [9]:
time_df['Month'] = time_df['date.utc'].dt.month
time_df['Day'] = time_df['date.utc'].dt.day
time_df['Day_name'] = time_df['date.utc'].dt.day_name()

In [10]:
time_df.head()

Unnamed: 0,city,country,date.utc,location,parameter,value,unit,Year,Month,Day,Day_name
0,Paris,FR,2019-06-21 00:00:00+00:00,FR04014,no2,20.0,µg/m³,2019,6,21,Friday
1,Paris,FR,2019-06-20 23:00:00+00:00,FR04014,no2,21.8,µg/m³,2019,6,20,Thursday
2,Paris,FR,2019-06-20 22:00:00+00:00,FR04014,no2,26.5,µg/m³,2019,6,20,Thursday
3,Paris,FR,2019-06-20 21:00:00+00:00,FR04014,no2,24.9,µg/m³,2019,6,20,Thursday
4,Paris,FR,2019-06-20 20:00:00+00:00,FR04014,no2,21.4,µg/m³,2019,6,20,Thursday


### Resampling data based on time series components

In [11]:
# asfreq() - as frequent changes the content of a df without aggregating
# Only works if the datetime object is the index

time_df.set_index('date.utc', inplace=True)

In [12]:
time_df.head()

Unnamed: 0_level_0,city,country,location,parameter,value,unit,Year,Month,Day,Day_name
date.utc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2019-06-21 00:00:00+00:00,Paris,FR,FR04014,no2,20.0,µg/m³,2019,6,21,Friday
2019-06-20 23:00:00+00:00,Paris,FR,FR04014,no2,21.8,µg/m³,2019,6,20,Thursday
2019-06-20 22:00:00+00:00,Paris,FR,FR04014,no2,26.5,µg/m³,2019,6,20,Thursday
2019-06-20 21:00:00+00:00,Paris,FR,FR04014,no2,24.9,µg/m³,2019,6,20,Thursday
2019-06-20 20:00:00+00:00,Paris,FR,FR04014,no2,21.4,µg/m³,2019,6,20,Thursday


In [13]:
# day_df = time_df.as_freq(freq='D')
paris_df = time_df.loc[time_df['city'] == "Paris"]
paris_df

Unnamed: 0_level_0,city,country,location,parameter,value,unit,Year,Month,Day,Day_name
date.utc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2019-06-21 00:00:00+00:00,Paris,FR,FR04014,no2,20.0,µg/m³,2019,6,21,Friday
2019-06-20 23:00:00+00:00,Paris,FR,FR04014,no2,21.8,µg/m³,2019,6,20,Thursday
2019-06-20 22:00:00+00:00,Paris,FR,FR04014,no2,26.5,µg/m³,2019,6,20,Thursday
2019-06-20 21:00:00+00:00,Paris,FR,FR04014,no2,24.9,µg/m³,2019,6,20,Thursday
2019-06-20 20:00:00+00:00,Paris,FR,FR04014,no2,21.4,µg/m³,2019,6,20,Thursday
...,...,...,...,...,...,...,...,...,...,...
2019-05-07 05:00:00+00:00,Paris,FR,FR04014,no2,72.4,µg/m³,2019,5,7,Tuesday
2019-05-07 04:00:00+00:00,Paris,FR,FR04014,no2,61.9,µg/m³,2019,5,7,Tuesday
2019-05-07 03:00:00+00:00,Paris,FR,FR04014,no2,50.4,µg/m³,2019,5,7,Tuesday
2019-05-07 02:00:00+00:00,Paris,FR,FR04014,no2,27.7,µg/m³,2019,5,7,Tuesday


In [15]:
day_df = paris_df.asfreq(freq='D', method='ffill')
# ffill - forward fill: fill the value that is on top of the missing value and replace the null value
# bfill - backfill
day_df

Unnamed: 0_level_0,city,country,location,parameter,value,unit,Year,Month,Day,Day_name
date.utc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2019-05-07 01:00:00+00:00,Paris,FR,FR04014,no2,25.0,µg/m³,2019,5,7,Tuesday
2019-05-08 01:00:00+00:00,Paris,FR,FR04014,no2,19.6,µg/m³,2019,5,8,Wednesday
2019-05-09 01:00:00+00:00,Paris,FR,FR04014,no2,10.6,µg/m³,2019,5,9,Thursday
2019-05-10 01:00:00+00:00,Paris,FR,FR04014,no2,19.1,µg/m³,2019,5,10,Friday
2019-05-11 01:00:00+00:00,Paris,FR,FR04014,no2,15.5,µg/m³,2019,5,11,Saturday
2019-05-12 01:00:00+00:00,Paris,FR,FR04014,no2,19.2,µg/m³,2019,5,12,Sunday
2019-05-13 01:00:00+00:00,Paris,FR,FR04014,no2,18.9,µg/m³,2019,5,13,Monday
2019-05-14 01:00:00+00:00,Paris,FR,FR04014,no2,19.1,µg/m³,2019,5,14,Tuesday
2019-05-15 01:00:00+00:00,Paris,FR,FR04014,no2,17.2,µg/m³,2019,5,15,Wednesday
2019-05-16 01:00:00+00:00,Paris,FR,FR04014,no2,26.0,µg/m³,2019,5,16,Thursday


In [16]:
# Resample()
# Example: We want to know the pollution every three days
three_days = paris_df.resample('3D').sum()
three_days.head()

Unnamed: 0_level_0,city,country,location,parameter,value,unit,Year,Month,Day,Day_name
date.utc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2019-05-07 00:00:00+00:00,ParisParisParisParisParisParisParisParisParisP...,FRFRFRFRFRFRFRFRFRFRFRFRFRFRFRFRFRFRFRFRFRFRFR...,FR04014FR04014FR04014FR04014FR04014FR04014FR04...,no2no2no2no2no2no2no2no2no2no2no2no2no2no2no2n...,2049.1,µg/m³µg/m³µg/m³µg/m³µg/m³µg/m³µg/m³µg/m³µg/m³µ...,143349,355,569,TuesdayTuesdayTuesdayTuesdayTuesdayTuesdayTues...
2019-05-10 00:00:00+00:00,ParisParisParisParisParisParisParisParisParisP...,FRFRFRFRFRFRFRFRFRFRFRFRFRFRFRFRFRFRFRFRFRFRFR...,FR04014FR04014FR04014FR04014FR04014FR04014FR04...,no2no2no2no2no2no2no2no2no2no2no2no2no2no2no2n...,1786.0,µg/m³µg/m³µg/m³µg/m³µg/m³µg/m³µg/m³µg/m³µg/m³µ...,139311,345,759,FridayFridayFridayFridayFridayFridayFridayFrid...
2019-05-13 00:00:00+00:00,ParisParisParisParisParisParisParisParisParisP...,FRFRFRFRFRFRFRFRFRFRFRFRFRFRFRFRFRFRFRFRFRFRFR...,FR04014FR04014FR04014FR04014FR04014FR04014FR04...,no2no2no2no2no2no2no2no2no2no2no2no2no2no2no2n...,1566.4,µg/m³µg/m³µg/m³µg/m³µg/m³µg/m³µg/m³µg/m³µg/m³µ...,145368,360,1008,MondayMondayMondayMondayMondayMondayMondayMond...
2019-05-16 00:00:00+00:00,ParisParisParisParisParisParisParisParisParisP...,FRFRFRFRFRFRFRFRFRFRFRFRFRFRFRFRFRFRFRFRFRFRFR...,FR04014FR04014FR04014FR04014FR04014FR04014FR04...,no2no2no2no2no2no2no2no2no2no2no2no2no2no2no2n...,2050.0,µg/m³µg/m³µg/m³µg/m³µg/m³µg/m³µg/m³µg/m³µg/m³µ...,143349,355,1208,ThursdayThursdayThursdayThursdayThursdayThursd...
2019-05-19 00:00:00+00:00,ParisParisParisParisParisParisParisParisParisP...,FRFRFRFRFRFRFRFRFRFRFRFRFRFRFRFRFRFRFRFRFRFRFR...,FR04014FR04014FR04014FR04014FR04014FR04014FR04...,no2no2no2no2no2no2no2no2no2no2no2no2no2no2no2n...,2366.5,µg/m³µg/m³µg/m³µg/m³µg/m³µg/m³µg/m³µg/m³µg/m³µ...,145368,360,1440,SundaySundaySundaySundaySundaySundaySundaySund...


In [17]:
# To sum only numeric values
three_days = paris_df.resample('3D').sum(numeric_only=True)
three_days

Unnamed: 0_level_0,value,Year,Month,Day
date.utc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-05-07 00:00:00+00:00,2049.1,143349,355,569
2019-05-10 00:00:00+00:00,1786.0,139311,345,759
2019-05-13 00:00:00+00:00,1566.4,145368,360,1008
2019-05-16 00:00:00+00:00,2050.0,143349,355,1208
2019-05-19 00:00:00+00:00,2366.5,145368,360,1440
2019-05-22 00:00:00+00:00,2847.7,145368,360,1656
2019-05-25 00:00:00+00:00,1611.1,139311,345,1797
2019-05-28 00:00:00+00:00,1463.8,145368,360,2088
2019-05-31 00:00:00+00:00,2021.5,139311,390,813
2019-06-03 00:00:00+00:00,2238.0,145368,432,288


In [19]:
# Upsampling - increasing the frequency
# downsampling - decreasing the frequency (like the example above)

# Upsampling example 
thirty_sec = paris_df.resample('30s').sum(numeric_only=True)
thirty_sec

Unnamed: 0_level_0,value,Year,Month,Day
date.utc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-05-07 01:00:00+00:00,25.0,2019,5,7
2019-05-07 01:00:30+00:00,0.0,0,0,0
2019-05-07 01:01:00+00:00,0.0,0,0,0
2019-05-07 01:01:30+00:00,0.0,0,0,0
2019-05-07 01:02:00+00:00,0.0,0,0,0
...,...,...,...,...
2019-06-20 23:58:00+00:00,0.0,0,0,0
2019-06-20 23:58:30+00:00,0.0,0,0,0
2019-06-20 23:59:00+00:00,0.0,0,0,0
2019-06-20 23:59:30+00:00,0.0,0,0,0


In [26]:
# We can also forwardfill and backfill
thirty_sec = paris_df.resample('30s').sum(numeric_only=True)
thirty_sec.head()

Unnamed: 0_level_0,value,Year,Month,Day
date.utc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-05-07 01:00:00+00:00,25.0,2019,5,7
2019-05-07 01:00:30+00:00,0.0,0,0,0
2019-05-07 01:01:00+00:00,0.0,0,0,0
2019-05-07 01:01:30+00:00,0.0,0,0,0
2019-05-07 01:02:00+00:00,0.0,0,0,0


In [None]:
# Rolling window
paris_df['3 day pollution'] = paris_df['value'].rolling('3D').mean()
paris_df.head()