In [28]:
from pathlib import Path
from datetime import datetime
from os import PathLike
from typing import Union

# data manipulation and analysis
import numpy as np 
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
import matplotlib.dates as mdates
from matplotlib.dates import date2num
import seaborn as sns
from prettytable import PrettyTable
import plotly.express as px # An interactive graphing library that makes interactive, publication-quality graphs online.
import plotly.graph_objs as go # An interactive graphing library that makes interactive, publication-quality graphs online.

# ML model selection and evaluation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# preprocessing
from sklearn.preprocessing import LabelEncoder, StandardScaler

pd.set_option("display.max_rows", None, "display.max_columns", None)

In [29]:
imputed_demand_dir = '/Users/yashwanthkaruparthi/Developer/energy_demand/research/data/3-imputed/demand'
imputed_demand_path = Path(imputed_demand_dir)

calculated_features_dir = '/Users/yashwanthkaruparthi/Developer/energy_demand/research/data/4-calculated-features'
calculated_features_path = Path(calculated_features_dir)

In [4]:
print(imputed_demand_path)

/Users/yashwanthkaruparthi/Developer/energy_demand/research/data/3-imputed/demand


In [5]:
demand_df = pd.read_csv(imputed_demand_path/'demand.csv')

In [6]:
demand_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 222840 entries, 0 to 222839
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Unnamed: 0  222840 non-null  object 
 1   ont_demand  222840 non-null  float64
dtypes: float64(1), object(1)
memory usage: 3.4+ MB


In [8]:
features_df = demand_df.copy(deep=True)
features_df.head()

Unnamed: 0.1,Unnamed: 0,ont_demand
0,1994-01-01 00:00:00,14422.0
1,1994-01-01 01:00:00,13845.0
2,1994-01-01 02:00:00,13372.0
3,1994-01-01 03:00:00,13025.0
4,1994-01-01 04:00:00,12869.0


In [10]:
features_df['Unnamed: 0'] = pd.to_datetime(features_df['Unnamed: 0'])

In [11]:
features_df.set_index('Unnamed: 0', inplace=True)

In [13]:
features_df['hour_of_day'] = features_df.index.hour
features_df['year'] = features_df.index.year
features_df['month'] = features_df.index.month
features_df['day_of_week'] = features_df.index.dayofweek
features_df['day_of_year'] = features_df.index.dayofyear
# features_df['week_of_year'] = features_df.index.weekofyear
features_df['week_of_year'] = features_df.index.isocalendar().week
features_df['quarter'] = features_df.index.quarter
features_df.drop(columns=['ont_demand'], inplace=True)

features_df.head()

Unnamed: 0_level_0,hour_of_day,year,month,day_of_week,day_of_year,week_of_year,quarter
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1994-01-01 00:00:00,0,1994,1,5,1,52,1
1994-01-01 01:00:00,1,1994,1,5,1,52,1
1994-01-01 02:00:00,2,1994,1,5,1,52,1
1994-01-01 03:00:00,3,1994,1,5,1,52,1
1994-01-01 04:00:00,4,1994,1,5,1,52,1


In [14]:
import holidays


In [15]:
hols = holidays.Canada(state='ON') # default is ontario Holidays
print(features_df.loc['2018-01-01'].index.date[0] in hols)
print(features_df.loc['2018-12-27'].index.date[0] in hols)

True
False


In [17]:
# adding column if each day is holiday or not
features_df['stat_hol'] = pd.Series(features_df.index.date).apply(lambda x: x in hols).values
features_df.head()

Unnamed: 0_level_0,hour_of_day,year,month,day_of_week,day_of_year,week_of_year,quarter,stat_hol
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1994-01-01 00:00:00,0,1994,1,5,1,52,1,True
1994-01-01 01:00:00,1,1994,1,5,1,52,1,True
1994-01-01 02:00:00,2,1994,1,5,1,52,1,True
1994-01-01 03:00:00,3,1994,1,5,1,52,1,True
1994-01-01 04:00:00,4,1994,1,5,1,52,1,True


In [18]:
features_df.tail()

Unnamed: 0_level_0,hour_of_day,year,month,day_of_week,day_of_year,week_of_year,quarter,stat_hol
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2019-06-03 19:00:00,19,2019,6,0,154,23,2,False
2019-06-03 20:00:00,20,2019,6,0,154,23,2,False
2019-06-03 21:00:00,21,2019,6,0,154,23,2,False
2019-06-03 22:00:00,22,2019,6,0,154,23,2,False
2019-06-03 23:00:00,23,2019,6,0,154,23,2,False


In [21]:
from astral import Astral
import datetime
a = Astral()
city_name='Toronto'
city = a[city_name]
#city.latitude
date = datetime.date(2019, 7, 2)
sun = city.sun(date=date, local = True)

print(sun['sunrise'])
print(sun['sunset'])
print(type(sun['sunrise']))

2019-07-02 05:39:03-04:00
2019-07-02 21:03:43-04:00
<class 'datetime.datetime'>


In [22]:
print(features_df.loc['2018-01-01'].index[0])
print(features_df.loc['2018-12-27'].index[0])

2018-01-01 00:00:00
2018-12-27 00:00:00


In [23]:
features_df.head()

Unnamed: 0_level_0,hour_of_day,year,month,day_of_week,day_of_year,week_of_year,quarter,stat_hol
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1994-01-01 00:00:00,0,1994,1,5,1,52,1,True
1994-01-01 01:00:00,1,1994,1,5,1,52,1,True
1994-01-01 02:00:00,2,1994,1,5,1,52,1,True
1994-01-01 03:00:00,3,1994,1,5,1,52,1,True
1994-01-01 04:00:00,4,1994,1,5,1,52,1,True


In [24]:
def get_daylight_hours(row, city):
    sun = city.sun(date=row.name, local=True)
    sunrise = sun['sunrise'].replace(tzinfo=None) ; sunset = sun['sunset'].replace(tzinfo=None)
    bool_val = (row.name > sunrise) & (row.name < sunset)
    return bool_val


a = Astral()
city = a['Toronto']
features_df['day_light_hours'] = features_df.apply(get_daylight_hours, city=city, axis=1)
features_df.head()

Unnamed: 0_level_0,hour_of_day,year,month,day_of_week,day_of_year,week_of_year,quarter,stat_hol,day_light_hours
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1994-01-01 00:00:00,0,1994,1,5,1,52,1,True,False
1994-01-01 01:00:00,1,1994,1,5,1,52,1,True,False
1994-01-01 02:00:00,2,1994,1,5,1,52,1,True,False
1994-01-01 03:00:00,3,1994,1,5,1,52,1,True,False
1994-01-01 04:00:00,4,1994,1,5,1,52,1,True,False


In [25]:
features_df.tail()

Unnamed: 0_level_0,hour_of_day,year,month,day_of_week,day_of_year,week_of_year,quarter,stat_hol,day_light_hours
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2019-06-03 19:00:00,19,2019,6,0,154,23,2,False,True
2019-06-03 20:00:00,20,2019,6,0,154,23,2,False,True
2019-06-03 21:00:00,21,2019,6,0,154,23,2,False,False
2019-06-03 22:00:00,22,2019,6,0,154,23,2,False,False
2019-06-03 23:00:00,23,2019,6,0,154,23,2,False,False


In [26]:
features_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 222840 entries, 1994-01-01 00:00:00 to 2019-06-03 23:00:00
Data columns (total 9 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   hour_of_day      222840 non-null  int32 
 1   year             222840 non-null  int32 
 2   month            222840 non-null  int32 
 3   day_of_week      222840 non-null  int32 
 4   day_of_year      222840 non-null  int32 
 5   week_of_year     222840 non-null  UInt32
 6   quarter          222840 non-null  int32 
 7   stat_hol         222840 non-null  bool  
 8   day_light_hours  222840 non-null  bool  
dtypes: UInt32(1), bool(2), int32(6)
memory usage: 8.3 MB


In [30]:
features_df.to_csv(calculated_features_path / 'calendar.csv')