**imports**

In [1]:
import pandas as pd
import os
import numpy as np 
from pathlib import Path
import geopandas as gpd
import matplotlib.pyplot as plt

**Creating Lag and other TS Features**

In [2]:
DATA_PATH = Path.cwd().parent / "data"
RAW = DATA_PATH / "raw"
INTERIM = DATA_PATH / "interim"
processed= DATA_PATH / "processed"

In [3]:
#loading dataset
train = pd.read_csv(processed / "train.csv")
test = pd.read_csv(processed / "test.csv")



In [4]:
train.head()

Unnamed: 0,no2,trop_no2,trop_no2_cs,col_no2,col_no2_cs,null_fields,datetime,Delhi,Los Angeles (SoCAB),year,cos_month,sin_month
0,8.695,4.481723,4.481723,6.237321,6.237321,0,2019-01-01 23:59:59+00:00,0,1,1,0.866025,0.5
1,10.496667,5.436216,5.436216,7.206068,7.206068,0,2019-01-01 23:59:59+00:00,0,1,1,0.866025,0.5
2,37.208333,6.966879,6.966879,8.714388,8.714388,0,2019-01-01 23:59:59+00:00,0,1,1,0.866025,0.5
3,9.791667,6.361926,6.361926,8.107447,8.107447,0,2019-01-01 23:59:59+00:00,0,1,1,0.866025,0.5
4,4.308333,2.464319,2.464319,4.228561,4.228561,0,2019-01-01 23:59:59+00:00,0,1,1,0.866025,0.5


In [27]:
test.head()

Unnamed: 0,trop_no2,trop_no2_cs,col_no2,col_no2_cs,null_fields,datetime,Delhi,Los Angeles (SoCAB),year,cos_month,sin_month
0,1.304367,2.976798,3.867727,2.517487,1,2018-09-08 23:59:59+00:00,1,0,0,-1.83697e-16,-1.0
1,2.976798,2.976798,5.6305,5.6305,0,2021-03-25 23:59:59+00:00,1,0,3,6.123234000000001e-17,1.0
2,2.976798,2.976798,5.6305,5.6305,0,2021-03-25 23:59:59+00:00,1,0,3,6.123234000000001e-17,1.0
3,2.976798,2.976798,5.6305,5.6305,0,2021-03-25 23:59:59+00:00,1,0,3,6.123234000000001e-17,1.0
4,2.976798,2.976798,5.6305,5.6305,0,2021-03-25 23:59:59+00:00,1,0,3,6.123234000000001e-17,1.0


In [28]:
feature_columns = train.columns[train.columns.str.contains('o2')].tolist()
feature_columns.remove('no2')

**check the datetime range**

In [29]:
train['datetime'].min(),train['datetime'].max()

('2019-01-01 23:59:59+00:00', '2020-10-31 23:59:59+00:00')

In [30]:
test['datetime'].min(),test['datetime'].max()

('2018-09-08 23:59:59+00:00', '2021-08-24 23:59:59+00:00')

**creating trend**

In [31]:
 
all_dates = pd.concat([train[['datetime']],test[['datetime']]],axis=0,
                             ignore_index=True)
#take unique dates
all_dates.drop_duplicates(inplace=True)

#sort values
all_dates.sort_values('datetime',inplace=True)
#reset index 
all_dates.reset_index(drop=True,inplace=True)

#set trend
all_dates.reset_index(inplace=True)

all_dates.rename(columns = {'index':'trend'},
                 inplace=True)

all_dates.shape

(884, 2)

In [32]:
all_dates.head()

Unnamed: 0,trend,datetime
0,0,2018-09-08 23:59:59+00:00
1,1,2018-09-09 23:59:59+00:00
2,2,2018-09-10 23:59:59+00:00
3,3,2018-09-11 23:59:59+00:00
4,4,2018-09-12 23:59:59+00:00


Adding trend feature to dataframes

In [33]:
train = train.merge(all_dates,
                    on = 'datetime',
                    how='left')
test = test.merge(all_dates,
                  on = 'datetime',
                  how='left')

**Getting Lag features for all of the feature columns**

In [34]:
def get_lag(df,
            features,
            lag_shift=1):
    '''Get lag features for given list fo features in df'''
    df_cp = df.copy(deep=True)
    for feature in features:
        f_name = f'{feature}_lag_{lag_shift}'
        df_cp[f'{f_name}'] = df_cp[feature].shift(lag_shift)
        
    return df_cp
        
        
train_check = get_lag(train,features=feature_columns)
train_check.head()

Unnamed: 0,no2,trop_no2,trop_no2_cs,col_no2,col_no2_cs,null_fields,datetime,Delhi,Los Angeles (SoCAB),year,cos_month,sin_month,trend,trop_no2_lag_1,trop_no2_cs_lag_1,col_no2_lag_1,col_no2_cs_lag_1
0,8.695,4.481723,4.481723,6.237321,6.237321,0,2019-01-01 23:59:59+00:00,0,1,1,0.866025,0.5,90,,,,
1,10.496667,5.436216,5.436216,7.206068,7.206068,0,2019-01-01 23:59:59+00:00,0,1,1,0.866025,0.5,90,4.481723,4.481723,6.237321,6.237321
2,37.208333,6.966879,6.966879,8.714388,8.714388,0,2019-01-01 23:59:59+00:00,0,1,1,0.866025,0.5,90,5.436216,5.436216,7.206068,7.206068
3,9.791667,6.361926,6.361926,8.107447,8.107447,0,2019-01-01 23:59:59+00:00,0,1,1,0.866025,0.5,90,6.966879,6.966879,8.714388,8.714388
4,4.308333,2.464319,2.464319,4.228561,4.228561,0,2019-01-01 23:59:59+00:00,0,1,1,0.866025,0.5,90,6.361926,6.361926,8.107447,8.107447


In [35]:
num_lags = 7 


tr_lag = train.copy()
ts_lag = test.copy()
for lag in range(1,num_lags+1):
    
    #seperate df for each city 
    
    #train
    tr_dl = tr_lag[tr_lag['Delhi']==1]
    tr_nt_dl = tr_lag[tr_lag['Delhi']!=1]
    
    # test
    ts_dl = ts_lag[ts_lag['Delhi']==1]
    ts_nt_dl = ts_lag[ts_lag['Delhi']!=1]
    
    
    
    #calc lags 
    tr_dl = get_lag(tr_dl,features=feature_columns,lag_shift=lag)
    tr_nt_dl = get_lag(tr_nt_dl,features=feature_columns,lag_shift=lag)
    
    tr_lag = pd.concat([tr_dl,tr_nt_dl],axis=0,ignore_index=True)
    
    
    ts_dl = get_lag(ts_dl,features=feature_columns,lag_shift=lag)
    ts_nt_dl = get_lag(ts_nt_dl,features=feature_columns,lag_shift=lag)
    
    ts_lag = pd.concat([ts_dl,ts_nt_dl],axis=0,ignore_index=True)

    
    
    

tr_lag.head()

Unnamed: 0,no2,trop_no2,trop_no2_cs,col_no2,col_no2_cs,null_fields,datetime,Delhi,Los Angeles (SoCAB),year,...,col_no2_lag_5,col_no2_cs_lag_5,trop_no2_lag_6,trop_no2_cs_lag_6,col_no2_lag_6,col_no2_cs_lag_6,trop_no2_lag_7,trop_no2_cs_lag_7,col_no2_lag_7,col_no2_cs_lag_7
0,80.5,10.310496,10.310496,12.418408,12.418408,0,2019-01-01 23:59:59+00:00,1,0,1,...,,,,,,,,,,
1,58.0,11.751335,11.751335,13.948538,13.948538,0,2019-01-17 23:59:59+00:00,1,0,1,...,,,,,,,,,,
2,17.931081,22.962611,21.946426,25.312905,24.295707,0,2019-02-11 23:59:59+00:00,1,0,1,...,,,,,,,,,,
3,47.286667,22.962611,21.946426,25.312905,24.295707,0,2019-02-11 23:59:59+00:00,1,0,1,...,,,,,,,,,,
4,77.415517,22.962611,21.946426,25.312905,24.295707,0,2019-02-11 23:59:59+00:00,1,0,1,...,,,,,,,,,,


In [36]:
tr_lag.shape

(21320, 41)

In [37]:
ts_lag.shape

(15884, 40)

In [38]:
processed

PosixPath('/home/jovyan/PlanetaryComputerExamples/my_competitions/airathon_1/data/processed')

**saving for modelling**

In [44]:
tr_lag.columns

Index(['no2', 'trop_no2', 'trop_no2_cs', 'col_no2', 'col_no2_cs',
       'null_fields', 'datetime', 'Delhi', 'Los Angeles (SoCAB)', 'year',
       'cos_month', 'sin_month', 'trend', 'trop_no2_lag_1',
       'trop_no2_cs_lag_1', 'col_no2_lag_1', 'col_no2_cs_lag_1',
       'trop_no2_lag_2', 'trop_no2_cs_lag_2', 'col_no2_lag_2',
       'col_no2_cs_lag_2', 'trop_no2_lag_3', 'trop_no2_cs_lag_3',
       'col_no2_lag_3', 'col_no2_cs_lag_3', 'trop_no2_lag_4',
       'trop_no2_cs_lag_4', 'col_no2_lag_4', 'col_no2_cs_lag_4',
       'trop_no2_lag_5', 'trop_no2_cs_lag_5', 'col_no2_lag_5',
       'col_no2_cs_lag_5', 'trop_no2_lag_6', 'trop_no2_cs_lag_6',
       'col_no2_lag_6', 'col_no2_cs_lag_6', 'trop_no2_lag_7',
       'trop_no2_cs_lag_7', 'col_no2_lag_7', 'col_no2_cs_lag_7'],
      dtype='object')

In [45]:
tr_lag.drop('datetime',inplace=True,axis=1)
ts_lag.drop('datetime',inplace=True,axis=1)

In [46]:
tr_lag.to_csv(processed / 'train_lag.csv',index=False)
ts_lag.to_csv(processed / 'test_lag.csv',index=False)