**Imports**

In [1]:
import pandas as pd
import os
import numpy as np 
from pathlib import Path
import geopandas as gpd
import matplotlib.pyplot as plt

**Creating Features**

In [2]:
# Data folders, resolved relative to the parent of the current working directory.
DATA_PATH = Path.cwd().parent.joinpath("data")
RAW, INTERIM, processed = (
    DATA_PATH.joinpath(stage) for stage in ("raw", "interim", "processed")
)

In [3]:
# Training labels; the raw `value` column is exposed under the name `no2`.
train_labels = (
    pd.read_csv(RAW / 'train_labels.csv', parse_dates=["datetime"])
    .rename(columns={"value": "no2"})
)

train_labels.head(2)

Unnamed: 0,datetime,grid_id,no2
0,2019-01-01 08:00:00+00:00,3A3IE,8.695
1,2019-01-01 08:00:00+00:00,3S31A,10.496667


In [4]:
# Load the preprocessed train/test feature tables from the interim folder.
train_df = pd.read_csv(INTERIM.joinpath('train_df.csv'))
test_df = pd.read_csv(INTERIM.joinpath('test_df.csv'))

In [5]:
# (rows, columns) of the preprocessed test features
test_df.shape

(15884, 9)

In [6]:
# (rows, columns) of the preprocessed train features
train_df.shape

(26403, 9)

In [7]:
# NOTE(review): this whole cell is commented out, so `feature_agg` is never
# defined or called in this notebook. It sketches a per-(grid_id, day)
# aggregation of the satellite columns.
# If it is ever revived, check the `agg_features['datetime'] = feature_df.datetime`
# line first: at that point `agg_features` is indexed by the (grid_id, day)
# group keys while `feature_df.datetime` still carries the original row index,
# so the assignment would align on mismatched indexes -- TODO confirm before reuse.
# The `label_df` parameter and the local `dates` are also unused in this draft.
# def feature_agg(df,
#                 label_df = None,  # provide if train, none if test
#                 agg_function_list = ['mean','median','min','max','std','skew','sem'],
#                 columns_list = ['trop_no2','trop_no2_cs','col_no2','col_no2_cs']
#                ):
    
#     #copy of features df 
#     feature_df = df.copy()
     
#     # Add `day` column to `feature_df` and `label_df`
#     feature_df["datetime"] = pd.to_datetime(
#         feature_df.granule_id.str.split("_", expand=True)[0],
#         format="%Y%m%dT%H:%M:%S",
#         utc=True
#     )
        
#     feature_df["day"] = feature_df.datetime.dt.date    
    
#     #creating a groupby obj 
#     day_wise_groups = feature_df.groupby(["grid_id","day"])
    
#     dates = feature_df.datetime
#     #get location 
#     locs = day_wise_groups['location'].first()
    
#     #aggregate 
#     for i,col in enumerate(columns_list):
#         col_aggs = day_wise_groups[col].agg(agg_function_list).\
#                     rename(columns={func:f'{col}-{func}' for func in agg_function_list})
        
#         if i == 0:
#             agg_features =  col_aggs
        
#         else:
#             agg_features = agg_features.merge(col_aggs,
#                                               how='inner',
#                                               right_index=True,
#                                               left_index = True)
            
    
#     agg_features['location'] = locs
    
#     #other aggs
#     agg_features['nulls'] = day_wise_groups['null_fields'].sum()
#     agg_features['counts'] = day_wise_groups.size()
    
#     agg_features['datetime'] = feature_df.datetime
#     agg_features.reset_index(inplace=True)
#     agg_features.drop('day',axis = 1 ,inplace=True)
#     return agg_features
    

In [7]:
def get_features_df(df,
                    labels_df = None,
                    mode = 'train',
                    ):
    """Build the modelling table from preprocessed granule-level features.

    Both inputs are copied first, so neither ``df`` nor ``labels_df`` is
    modified.

    Steps:
      1. Parse the text before the first ``_`` of ``granule_id`` into a UTC
         ``datetime`` column and derive a calendar ``day`` from it.
      2. Replace ``location`` with one indicator column per distinct value
         found in *this* frame. Two frames with different sets of locations
         therefore end up with different columns.
      3. In ``'train'`` mode, inner-join the labels on ``(day, grid_id)`` so
         only feature rows with a label for the same grid cell and day remain.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature rows. Must contain ``granule_id`` and ``location``;
        ``grid_id`` is additionally required in train mode.
    labels_df : pandas.DataFrame, optional
        Required when ``mode == 'train'``. Must contain a datetime-typed
        ``datetime`` column plus ``grid_id`` and ``no2``.
    mode : str, default 'train'
        ``'train'`` returns features joined to labels; any other value
        returns the features alone.

    Returns
    -------
    pandas.DataFrame
        Train mode: ``grid_id``, ``day``, ``no2`` from the labels followed by
        the remaining feature columns. Otherwise: the feature frame.

    Raises
    ------
    ValueError
        If ``mode == 'train'`` and ``labels_df`` is None.
    """
    is_train = mode == 'train'
    if is_train and labels_df is None:
        # Fail early with a clear message instead of an AttributeError on None.
        raise ValueError("labels_df is required when mode == 'train'")

    # Work on a copy so the caller's frame is left untouched.
    feature_df = df.copy()

    # NOTE(review): the format expects ':' between hour, minute and second;
    # confirm it matches the real granule_id timestamps.
    feature_df["datetime"] = pd.to_datetime(
        feature_df.granule_id.str.split("_", expand=True)[0],
        format="%Y%m%dT%H:%M:%S",
        utc=True
    )
    feature_df["day"] = feature_df.datetime.dt.date

    # One-hot encode locations and attach the indicators by row index.
    location_ohe = pd.get_dummies(feature_df['location'],
                                  drop_first = False)

    feature_df.drop('location',axis=1,inplace=True)
    feature_df = feature_df.merge(location_ohe,
                            how='inner',
                            right_index = True,
                            left_index  = True)

    if not is_train:
        return feature_df

    label_df = labels_df.copy()
    label_df["day"] = label_df.datetime.dt.date

    # Inner join: keep only (day, grid_id) pairs present in both tables.
    return pd.merge(
        label_df[['grid_id','day','no2']],
        feature_df,
        how="inner",
        left_on=["day", "grid_id"],
        right_on=["day", "grid_id"]
    )


In [8]:
# Train split: features joined to the labels.
train = get_features_df(df=train_df, labels_df=train_labels)

# Test split: features only.
test = get_features_df(df=test_df, mode='test')
    

In [9]:
def time_features(df, col, base_year=2018):
    '''Add calendar features derived from a datetime column.

    Adds two columns to ``df``:
      * ``year``  - calendar year of ``df[col]`` minus ``base_year``
      * ``month`` - calendar month of ``df[col]`` (1-12)

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the datetime column.
    col : str
        Name of a datetime-typed column (it must support the ``.dt`` accessor).
    base_year : int, default 2018
        Offset subtracted from the calendar year.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with ``year`` and ``month`` added.
    '''
    timestamps = df[col].dt
    df['year'] = timestamps.year - base_year
    df['month'] = timestamps.month
    # df['hour']=df[col].dt.hour

    # df.drop(col,inplace=True,axis=1)
    return df

# Add `year` and `month` (derived from `datetime`) to both splits.
train = time_features(train, 'datetime')
test = time_features(test, 'datetime')

def time_encodings(df):
    '''Replace the `month` column with a cyclical cosine/sine encoding.

    Adds `cos_month` and `sin_month` using a 12-step cycle, then drops `month`.
    The frame is modified in place and the same object is returned.
    '''
    month_angle = 2.*np.pi*df['month']/12
    df['cos_month'] = np.cos(month_angle)
    df['sin_month'] = np.sin(month_angle)

    # NOTE(review): if the hour encoding below is re-enabled, an hour-of-day
    # cycle has period 24, not 12.
    # df['cos_'+'hour']=np.cos(2.*np.pi*df['hour']/12)
    # df['sin_'+'hour']=np.sin(2.*np.pi*df['hour']/12)

    df.drop(columns=['month'], inplace=True)
    return df
    
# Swap `month` for its cos/sin encoding on both splits.
train = time_encodings(train)
test = time_encodings(test)

In [10]:
# Give both splits a fresh 0..n-1 row index, discarding the previous one.
train.reset_index(inplace=True, drop=True)
test.reset_index(inplace=True, drop=True)

In [11]:
# Write the grid_id/datetime pairs of each split to the processed folder
# (row index omitted).
test[['grid_id', 'datetime']].to_csv(processed.joinpath("test_ids.csv"), index=False)
train[['grid_id', 'datetime']].to_csv(processed.joinpath("train_ids.csv"), index=False)


In [12]:
# (rows, columns) of the train table after feature building
train.shape

(21320, 16)

In [13]:
# (rows, columns) of the test table after feature building
test.shape

(15884, 15)

In [14]:
# Columns removed from both splits before saving, and the label column.
drop_cols = [
    'day',
    'geometry',
    'granule_id',
    'grid_id',
]
target = ['no2']

In [15]:
# Remove the columns listed in `drop_cols` from both splits.
train = train.drop(columns=drop_cols)
test = test.drop(columns=drop_cols)

In [16]:
# Columns present in train but absent from test.
set(train.columns).difference(test.columns)
# assert len(set(train.columns) - set(test.columns)) ==0 , 'feels like Im knocking on heavens door'

{'no2'}

In [17]:
# Write the final train/test tables to the processed folder (row index omitted).
train.to_csv(processed.joinpath("train.csv"), index=False)
test.to_csv(processed.joinpath("test.csv"), index=False)