In [1]:
from pathlib import Path
from datetime import datetime
from os import PathLike
from typing import Union
import dateutil
import math

# data manipulation and analysis
import numpy as np 
import pandas as pd
from astral import Astral

# Visualization
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
import matplotlib.dates as mdates
from matplotlib.dates import date2num
import seaborn as sns
from prettytable import PrettyTable
import plotly.express as px # An interactive graphing library that makes interactive, publication-quality graphs online.
import plotly.graph_objs as go # An interactive graphing library that makes interactive, publication-quality graphs online.

# ML model selection and evaluation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# preprocessing
from sklearn.preprocessing import LabelEncoder, StandardScaler

pd.set_option("display.max_rows", None, "display.max_columns", None)

In [2]:
imputed_demand_dir = '/Users/yashwanthkaruparthi/Developer/energy_demand/research/data/3-imputed/demand'
imputed_demand_path = Path(imputed_demand_dir)

imputed_weather_dir = '/Users/yashwanthkaruparthi/Developer/energy_demand/research/data/3-imputed/weather-toronto'
imputed_weather_path = Path(imputed_weather_dir)

calculated_features_dir = '/Users/yashwanthkaruparthi/Developer/energy_demand/research/data/4-calculated-features'
calculated_features_path = Path(calculated_features_dir)

interim_data_dir = '/Users/yashwanthkaruparthi/Developer/energy_demand/research/data/5-interim'
interim_data_path = Path(interim_data_dir)

clean_data_dir = '/Users/yashwanthkaruparthi/Developer/energy_demand/research/data/6-clean'
clean_data_path = Path(clean_data_dir)

In [3]:
df_orig = pd.read_csv('/Users/yashwanthkaruparthi/Developer/energy_demand/research/data/6-clean/clean-cut.csv')

In [4]:
df_orig.head()

Unnamed: 0.1,Unnamed: 0,temp,dew_point_temp,rel_hum,wind_speed,visibility,press,hmdx,wind_chill,weather,hour_of_day,year,month,day_of_week,day_of_year,week_of_year,quarter,stat_hol,day_light_hours,hourly_demand,daily_peak
0,1994-01-01 00:00:00,-1.2,-3.8,83.0,15.0,19.3,99.91,,-6.0,Cloudy,0.0,1994.0,1.0,5.0,1.0,52.0,1.0,True,False,14422.0,16892.0
1,1994-01-01 01:00:00,-0.9,-3.0,86.0,20.0,16.1,99.91,,-6.0,Cloudy,1.0,1994.0,1.0,5.0,1.0,52.0,1.0,True,False,13845.0,16892.0
2,1994-01-01 02:00:00,-0.7,-3.2,83.0,15.0,16.1,99.87,,-5.0,Cloudy,2.0,1994.0,1.0,5.0,1.0,52.0,1.0,True,False,13372.0,16892.0
3,1994-01-01 03:00:00,-0.8,-2.4,89.0,15.0,12.9,99.81,,-5.0,Cloudy,3.0,1994.0,1.0,5.0,1.0,52.0,1.0,True,False,13025.0,16892.0
4,1994-01-01 04:00:00,-1.0,-3.3,84.0,19.0,16.1,99.77,,-6.0,Mostly Cloudy,4.0,1994.0,1.0,5.0,1.0,52.0,1.0,True,False,12869.0,16892.0


In [5]:
df_orig['Unnamed: 0'] = pd.to_datetime(df_orig['Unnamed: 0'])

In [6]:
dtypes = {'temp': np.float64, 'dew_point_temp':np.float64, 'rel_hum':np.float64,
          'wind_speed': np.float64, 'visibility': np.float64, 'press': np.float64,
          'hmdx': np.float64, 'wind_chill': np.float64, 'weather': object,
         'hour_of_day': np.float64, 'year': np.float64, 'month': np.float64,
         'day_of_week': np.float64, 'day_of_year': np.float64, 'week_of_year': np.float64,
          'quarter': np.float64, 'stat_hol': 'category', 'day_light_hours': 'category',
          'hourly_demand': np.float64, 'daily_peak': np.float64}

daymapper = {0.0: 'Mon', 1.0: 'Tue', 2.0: 'Wed', 3.0: 'Thu', 4.0: 'Fri', 5.0: 'Sat', 6.0: 'Sun'}
weekdaymapper = {0.0: 'Weekday', 1.0: 'Weekday', 2.0: 'Weekday', 3.0: 'Weekday', 4.0: 'Weekday',
                 5.0: 'Weekend', 6.0: 'Weekend'}
df_orig['dayofweek'] = df_orig['day_of_week'].map(daymapper)
df_orig['daytype'] = df_orig['day_of_week'].map(weekdaymapper)
df_orig['stat_hol'] = df_orig['stat_hol'].map({'True':1, 'False':0})
df_orig['day_light_hours'] = df_orig['day_light_hours'].map({'True':1, 'False':0})
df_orig['sun'] = df_orig['day_light_hours'].diff().fillna(0)

In [7]:
df_orig.head()

Unnamed: 0.1,Unnamed: 0,temp,dew_point_temp,rel_hum,wind_speed,visibility,press,hmdx,wind_chill,weather,hour_of_day,year,month,day_of_week,day_of_year,week_of_year,quarter,stat_hol,day_light_hours,hourly_demand,daily_peak,dayofweek,daytype,sun
0,1994-01-01 00:00:00,-1.2,-3.8,83.0,15.0,19.3,99.91,,-6.0,Cloudy,0.0,1994.0,1.0,5.0,1.0,52.0,1.0,,,14422.0,16892.0,Sat,Weekend,0.0
1,1994-01-01 01:00:00,-0.9,-3.0,86.0,20.0,16.1,99.91,,-6.0,Cloudy,1.0,1994.0,1.0,5.0,1.0,52.0,1.0,,,13845.0,16892.0,Sat,Weekend,0.0
2,1994-01-01 02:00:00,-0.7,-3.2,83.0,15.0,16.1,99.87,,-5.0,Cloudy,2.0,1994.0,1.0,5.0,1.0,52.0,1.0,,,13372.0,16892.0,Sat,Weekend,0.0
3,1994-01-01 03:00:00,-0.8,-2.4,89.0,15.0,12.9,99.81,,-5.0,Cloudy,3.0,1994.0,1.0,5.0,1.0,52.0,1.0,,,13025.0,16892.0,Sat,Weekend,0.0
4,1994-01-01 04:00:00,-1.0,-3.3,84.0,19.0,16.1,99.77,,-6.0,Mostly Cloudy,4.0,1994.0,1.0,5.0,1.0,52.0,1.0,,,12869.0,16892.0,Sat,Weekend,0.0


In [8]:
df_orig.tail()

Unnamed: 0.1,Unnamed: 0,temp,dew_point_temp,rel_hum,wind_speed,visibility,press,hmdx,wind_chill,weather,hour_of_day,year,month,day_of_week,day_of_year,week_of_year,quarter,stat_hol,day_light_hours,hourly_demand,daily_peak,dayofweek,daytype,sun
219139,2018-12-31 19:00:00,2.5,1.0,90.0,19.0,9.7,98.51,,,"Rain,Fog",19.0,2018.0,12.0,0.0,365.0,1.0,4.0,,,16195.0,17125.0,Mon,Weekday,0.0
219140,2018-12-31 20:00:00,2.9,1.6,92.0,21.0,9.7,98.17,,,"Rain,Fog",20.0,2018.0,12.0,0.0,365.0,1.0,4.0,,,15668.0,17125.0,Mon,Weekday,0.0
219141,2018-12-31 21:00:00,3.7,2.6,93.0,21.0,9.7,97.98,,,"Rain,Fog",21.0,2018.0,12.0,0.0,365.0,1.0,4.0,,,14987.0,17125.0,Mon,Weekday,0.0
219142,2018-12-31 22:00:00,4.0,3.0,93.0,18.0,6.4,97.61,,,"Rain,Fog",22.0,2018.0,12.0,0.0,365.0,1.0,4.0,,,14560.0,17125.0,Mon,Weekday,0.0
219143,2018-12-31 23:00:00,4.3,3.4,94.0,12.0,9.7,97.37,,,"Rain,Fog",23.0,2018.0,12.0,0.0,365.0,1.0,4.0,,,13828.0,17125.0,Mon,Weekday,0.0


In [9]:
# df = df_orig.copy(deep=True)
df = df_orig[(df_orig['Unnamed: 0'] >= '1994-01-01') & (df_orig['Unnamed: 0'] < '2019-01-01')]
df.head()

Unnamed: 0.1,Unnamed: 0,temp,dew_point_temp,rel_hum,wind_speed,visibility,press,hmdx,wind_chill,weather,hour_of_day,year,month,day_of_week,day_of_year,week_of_year,quarter,stat_hol,day_light_hours,hourly_demand,daily_peak,dayofweek,daytype,sun
0,1994-01-01 00:00:00,-1.2,-3.8,83.0,15.0,19.3,99.91,,-6.0,Cloudy,0.0,1994.0,1.0,5.0,1.0,52.0,1.0,,,14422.0,16892.0,Sat,Weekend,0.0
1,1994-01-01 01:00:00,-0.9,-3.0,86.0,20.0,16.1,99.91,,-6.0,Cloudy,1.0,1994.0,1.0,5.0,1.0,52.0,1.0,,,13845.0,16892.0,Sat,Weekend,0.0
2,1994-01-01 02:00:00,-0.7,-3.2,83.0,15.0,16.1,99.87,,-5.0,Cloudy,2.0,1994.0,1.0,5.0,1.0,52.0,1.0,,,13372.0,16892.0,Sat,Weekend,0.0
3,1994-01-01 03:00:00,-0.8,-2.4,89.0,15.0,12.9,99.81,,-5.0,Cloudy,3.0,1994.0,1.0,5.0,1.0,52.0,1.0,,,13025.0,16892.0,Sat,Weekend,0.0
4,1994-01-01 04:00:00,-1.0,-3.3,84.0,19.0,16.1,99.77,,-6.0,Mostly Cloudy,4.0,1994.0,1.0,5.0,1.0,52.0,1.0,,,12869.0,16892.0,Sat,Weekend,0.0


In [10]:
df.tail()

Unnamed: 0.1,Unnamed: 0,temp,dew_point_temp,rel_hum,wind_speed,visibility,press,hmdx,wind_chill,weather,hour_of_day,year,month,day_of_week,day_of_year,week_of_year,quarter,stat_hol,day_light_hours,hourly_demand,daily_peak,dayofweek,daytype,sun
219139,2018-12-31 19:00:00,2.5,1.0,90.0,19.0,9.7,98.51,,,"Rain,Fog",19.0,2018.0,12.0,0.0,365.0,1.0,4.0,,,16195.0,17125.0,Mon,Weekday,0.0
219140,2018-12-31 20:00:00,2.9,1.6,92.0,21.0,9.7,98.17,,,"Rain,Fog",20.0,2018.0,12.0,0.0,365.0,1.0,4.0,,,15668.0,17125.0,Mon,Weekday,0.0
219141,2018-12-31 21:00:00,3.7,2.6,93.0,21.0,9.7,97.98,,,"Rain,Fog",21.0,2018.0,12.0,0.0,365.0,1.0,4.0,,,14987.0,17125.0,Mon,Weekday,0.0
219142,2018-12-31 22:00:00,4.0,3.0,93.0,18.0,6.4,97.61,,,"Rain,Fog",22.0,2018.0,12.0,0.0,365.0,1.0,4.0,,,14560.0,17125.0,Mon,Weekday,0.0
219143,2018-12-31 23:00:00,4.3,3.4,94.0,12.0,9.7,97.37,,,"Rain,Fog",23.0,2018.0,12.0,0.0,365.0,1.0,4.0,,,13828.0,17125.0,Mon,Weekday,0.0


In [11]:
df_orig = df_orig.drop('daily_peak', axis=1)

In [12]:
df = df_orig.copy(deep=True)

df['hmdxx'] = df['temp'] + (5/9)*(6.11*math.e**(5417.7530*(1/273.16 - 1/(273.15 + df['dew_point_temp'])))-10)
# df.loc['1997-07-01'][['hmdx', 'hmdxx']].head(15)
df.head()

Unnamed: 0.1,Unnamed: 0,temp,dew_point_temp,rel_hum,wind_speed,visibility,press,hmdx,wind_chill,weather,hour_of_day,year,month,day_of_week,day_of_year,week_of_year,quarter,stat_hol,day_light_hours,hourly_demand,dayofweek,daytype,sun,hmdxx
0,1994-01-01 00:00:00,-1.2,-3.8,83.0,15.0,19.3,99.91,,-6.0,Cloudy,0.0,1994.0,1.0,5.0,1.0,52.0,1.0,,,14422.0,Sat,Weekend,0.0,-4.1915
1,1994-01-01 01:00:00,-0.9,-3.0,86.0,20.0,16.1,99.91,,-6.0,Cloudy,1.0,1994.0,1.0,5.0,1.0,52.0,1.0,,,13845.0,Sat,Weekend,0.0,-3.734133
2,1994-01-01 02:00:00,-0.7,-3.2,83.0,15.0,16.1,99.87,,-5.0,Cloudy,2.0,1994.0,1.0,5.0,1.0,52.0,1.0,,,13372.0,Sat,Weekend,0.0,-3.574269
3,1994-01-01 03:00:00,-0.8,-2.4,89.0,15.0,12.9,99.81,,-5.0,Cloudy,3.0,1994.0,1.0,5.0,1.0,52.0,1.0,,,13025.0,Sat,Weekend,0.0,-3.510459
4,1994-01-01 04:00:00,-1.0,-3.3,84.0,19.0,16.1,99.77,,-6.0,Mostly Cloudy,4.0,1994.0,1.0,5.0,1.0,52.0,1.0,,,12869.0,Sat,Weekend,0.0,-3.894137


In [13]:
features = ['temp', 'dew_point_temp', 'rel_hum', 'visibility', 'press', 'hmdxx']

# df_orig.set_index('Unnamed: 0', inplace=True)


keeper_cols = []
for feature in features:
    df = df.join(df.groupby(by=['year', 'day_of_year'])[feature].min(),
                           on=['year', 'day_of_year'], rsuffix='_min')
    keeper_cols.append(feature + '_min')
    df = df.join(df.groupby(by=['year', 'day_of_year'])[feature].max(),
                           on=['year', 'day_of_year'], rsuffix='_max')
    keeper_cols.append(feature + '_max')
    df = df.join(df.groupby(by=['year', 'day_of_year'])[feature].mean(),
                           on=['year', 'day_of_year'], rsuffix='_mean')
    keeper_cols.append(feature + '_mean')
    df = df.join(df.groupby(by=['year', 'day_of_year'])[feature].median(),
                           on=['year', 'day_of_year'], rsuffix='_median')
    keeper_cols.append(feature + '_median')

    # max_hours = df.loc[df.groupby(pd.Grouper(freq='D')).idxmax().loc[:, feature]]['hour_of_day'].values
    # df[feature + '_max_hour'] = np.repeat(max_hours, 24)
    # keeper_cols.append(feature + '_max_hour')

    # min_hours = df.loc[df.groupby(pd.Grouper(freq='D')).idxmin().loc[:, feature]]['hour_of_day'].values
    # df[feature + '_min_hour'] = np.repeat(min_hours, 24)
    # keeper_cols.append(feature + '_min_hour')

    # Calculate the hour of max value
    max_hours = df.loc[df.groupby(['year', 'day_of_year'])[feature].idxmax()]['hour_of_day'].values
    df[feature + '_max_hour'] = np.repeat(max_hours, 24)
    keeper_cols.append(feature + '_max_hour')

    # Calculate the hour of min value
    min_hours = df.loc[df.groupby(['year', 'day_of_year'])[feature].idxmin()]['hour_of_day'].values
    df[feature + '_min_hour'] = np.repeat(min_hours, 24)
    keeper_cols.append(feature + '_min_hour')



df.head()

Unnamed: 0.1,Unnamed: 0,temp,dew_point_temp,rel_hum,wind_speed,visibility,press,hmdx,wind_chill,weather,hour_of_day,year,month,day_of_week,day_of_year,week_of_year,quarter,stat_hol,day_light_hours,hourly_demand,dayofweek,daytype,sun,hmdxx,temp_min,temp_max,temp_mean,temp_median,temp_max_hour,temp_min_hour,dew_point_temp_min,dew_point_temp_max,dew_point_temp_mean,dew_point_temp_median,dew_point_temp_max_hour,dew_point_temp_min_hour,rel_hum_min,rel_hum_max,rel_hum_mean,rel_hum_median,rel_hum_max_hour,rel_hum_min_hour,visibility_min,visibility_max,visibility_mean,visibility_median,visibility_max_hour,visibility_min_hour,press_min,press_max,press_mean,press_median,press_max_hour,press_min_hour,hmdxx_min,hmdxx_max,hmdxx_mean,hmdxx_median,hmdxx_max_hour,hmdxx_min_hour
0,1994-01-01 00:00:00,-1.2,-3.8,83.0,15.0,19.3,99.91,,-6.0,Cloudy,0.0,1994.0,1.0,5.0,1.0,52.0,1.0,,,14422.0,Sat,Weekend,0.0,-4.1915,-1.8,2.8,0.575,0.65,12.0,5.0,-4.8,1.1,-1.3875,-0.65,19.0,7.0,73.0,99.0,87.0,86.5,14.0,9.0,0.8,40.2,17.841667,16.1,11.0,14.0,98.51,99.91,99.075833,98.84,0.0,16.0,-4.976663,0.686009,-1.8849,-1.513575,13.0,7.0
1,1994-01-01 01:00:00,-0.9,-3.0,86.0,20.0,16.1,99.91,,-6.0,Cloudy,1.0,1994.0,1.0,5.0,1.0,52.0,1.0,,,13845.0,Sat,Weekend,0.0,-3.734133,-1.8,2.8,0.575,0.65,12.0,5.0,-4.8,1.1,-1.3875,-0.65,19.0,7.0,73.0,99.0,87.0,86.5,14.0,9.0,0.8,40.2,17.841667,16.1,11.0,14.0,98.51,99.91,99.075833,98.84,0.0,16.0,-4.976663,0.686009,-1.8849,-1.513575,13.0,7.0
2,1994-01-01 02:00:00,-0.7,-3.2,83.0,15.0,16.1,99.87,,-5.0,Cloudy,2.0,1994.0,1.0,5.0,1.0,52.0,1.0,,,13372.0,Sat,Weekend,0.0,-3.574269,-1.8,2.8,0.575,0.65,12.0,5.0,-4.8,1.1,-1.3875,-0.65,19.0,7.0,73.0,99.0,87.0,86.5,14.0,9.0,0.8,40.2,17.841667,16.1,11.0,14.0,98.51,99.91,99.075833,98.84,0.0,16.0,-4.976663,0.686009,-1.8849,-1.513575,13.0,7.0
3,1994-01-01 03:00:00,-0.8,-2.4,89.0,15.0,12.9,99.81,,-5.0,Cloudy,3.0,1994.0,1.0,5.0,1.0,52.0,1.0,,,13025.0,Sat,Weekend,0.0,-3.510459,-1.8,2.8,0.575,0.65,12.0,5.0,-4.8,1.1,-1.3875,-0.65,19.0,7.0,73.0,99.0,87.0,86.5,14.0,9.0,0.8,40.2,17.841667,16.1,11.0,14.0,98.51,99.91,99.075833,98.84,0.0,16.0,-4.976663,0.686009,-1.8849,-1.513575,13.0,7.0
4,1994-01-01 04:00:00,-1.0,-3.3,84.0,19.0,16.1,99.77,,-6.0,Mostly Cloudy,4.0,1994.0,1.0,5.0,1.0,52.0,1.0,,,12869.0,Sat,Weekend,0.0,-3.894137,-1.8,2.8,0.575,0.65,12.0,5.0,-4.8,1.1,-1.3875,-0.65,19.0,7.0,73.0,99.0,87.0,86.5,14.0,9.0,0.8,40.2,17.841667,16.1,11.0,14.0,98.51,99.91,99.075833,98.84,0.0,16.0,-4.976663,0.686009,-1.8849,-1.513575,13.0,7.0


In [14]:
df.shape

(219144, 60)

In [15]:
df.columns

Index(['Unnamed: 0', 'temp', 'dew_point_temp', 'rel_hum', 'wind_speed',
       'visibility', 'press', 'hmdx', 'wind_chill', 'weather', 'hour_of_day',
       'year', 'month', 'day_of_week', 'day_of_year', 'week_of_year',
       'quarter', 'stat_hol', 'day_light_hours', 'hourly_demand', 'dayofweek',
       'daytype', 'sun', 'hmdxx', 'temp_min', 'temp_max', 'temp_mean',
       'temp_median', 'temp_max_hour', 'temp_min_hour', 'dew_point_temp_min',
       'dew_point_temp_max', 'dew_point_temp_mean', 'dew_point_temp_median',
       'dew_point_temp_max_hour', 'dew_point_temp_min_hour', 'rel_hum_min',
       'rel_hum_max', 'rel_hum_mean', 'rel_hum_median', 'rel_hum_max_hour',
       'rel_hum_min_hour', 'visibility_min', 'visibility_max',
       'visibility_mean', 'visibility_median', 'visibility_max_hour',
       'visibility_min_hour', 'press_min', 'press_max', 'press_mean',
       'press_median', 'press_max_hour', 'press_min_hour', 'hmdxx_min',
       'hmdxx_max', 'hmdxx_mean', 'hmdxx_median'

In [16]:
drop_columns = ['dayofweek', 'daytype', 'sun', 'day_light_hours', 'hmdx', 'hmdxx_max_hour', 'hmdxx_min_hour', 'stat_hol', 'rel_hum_max_hour', 'rel_hum_min_hour', 'dew_point_temp_max_hour', 'dew_point_temp_min_hour', 'press_max_hour', 'press_min_hour']
df.drop(columns = drop_columns, inplace=True)

In [17]:
df.shape

(219144, 46)

In [18]:
df.rename(columns={'Unnamed: 0': 'datetime'}, inplace=True)

In [19]:
df.head()

Unnamed: 0,datetime,temp,dew_point_temp,rel_hum,wind_speed,visibility,press,wind_chill,weather,hour_of_day,year,month,day_of_week,day_of_year,week_of_year,quarter,hourly_demand,hmdxx,temp_min,temp_max,temp_mean,temp_median,temp_max_hour,temp_min_hour,dew_point_temp_min,dew_point_temp_max,dew_point_temp_mean,dew_point_temp_median,rel_hum_min,rel_hum_max,rel_hum_mean,rel_hum_median,visibility_min,visibility_max,visibility_mean,visibility_median,visibility_max_hour,visibility_min_hour,press_min,press_max,press_mean,press_median,hmdxx_min,hmdxx_max,hmdxx_mean,hmdxx_median
0,1994-01-01 00:00:00,-1.2,-3.8,83.0,15.0,19.3,99.91,-6.0,Cloudy,0.0,1994.0,1.0,5.0,1.0,52.0,1.0,14422.0,-4.1915,-1.8,2.8,0.575,0.65,12.0,5.0,-4.8,1.1,-1.3875,-0.65,73.0,99.0,87.0,86.5,0.8,40.2,17.841667,16.1,11.0,14.0,98.51,99.91,99.075833,98.84,-4.976663,0.686009,-1.8849,-1.513575
1,1994-01-01 01:00:00,-0.9,-3.0,86.0,20.0,16.1,99.91,-6.0,Cloudy,1.0,1994.0,1.0,5.0,1.0,52.0,1.0,13845.0,-3.734133,-1.8,2.8,0.575,0.65,12.0,5.0,-4.8,1.1,-1.3875,-0.65,73.0,99.0,87.0,86.5,0.8,40.2,17.841667,16.1,11.0,14.0,98.51,99.91,99.075833,98.84,-4.976663,0.686009,-1.8849,-1.513575
2,1994-01-01 02:00:00,-0.7,-3.2,83.0,15.0,16.1,99.87,-5.0,Cloudy,2.0,1994.0,1.0,5.0,1.0,52.0,1.0,13372.0,-3.574269,-1.8,2.8,0.575,0.65,12.0,5.0,-4.8,1.1,-1.3875,-0.65,73.0,99.0,87.0,86.5,0.8,40.2,17.841667,16.1,11.0,14.0,98.51,99.91,99.075833,98.84,-4.976663,0.686009,-1.8849,-1.513575
3,1994-01-01 03:00:00,-0.8,-2.4,89.0,15.0,12.9,99.81,-5.0,Cloudy,3.0,1994.0,1.0,5.0,1.0,52.0,1.0,13025.0,-3.510459,-1.8,2.8,0.575,0.65,12.0,5.0,-4.8,1.1,-1.3875,-0.65,73.0,99.0,87.0,86.5,0.8,40.2,17.841667,16.1,11.0,14.0,98.51,99.91,99.075833,98.84,-4.976663,0.686009,-1.8849,-1.513575
4,1994-01-01 04:00:00,-1.0,-3.3,84.0,19.0,16.1,99.77,-6.0,Mostly Cloudy,4.0,1994.0,1.0,5.0,1.0,52.0,1.0,12869.0,-3.894137,-1.8,2.8,0.575,0.65,12.0,5.0,-4.8,1.1,-1.3875,-0.65,73.0,99.0,87.0,86.5,0.8,40.2,17.841667,16.1,11.0,14.0,98.51,99.91,99.075833,98.84,-4.976663,0.686009,-1.8849,-1.513575


In [20]:
if df.index.name=='datetime':
  print('True')
else:
  df.set_index('datetime', inplace=True)
  print('datetime is now indexed')

datetime is now indexed


In [21]:
df.columns

Index(['temp', 'dew_point_temp', 'rel_hum', 'wind_speed', 'visibility',
       'press', 'wind_chill', 'weather', 'hour_of_day', 'year', 'month',
       'day_of_week', 'day_of_year', 'week_of_year', 'quarter',
       'hourly_demand', 'hmdxx', 'temp_min', 'temp_max', 'temp_mean',
       'temp_median', 'temp_max_hour', 'temp_min_hour', 'dew_point_temp_min',
       'dew_point_temp_max', 'dew_point_temp_mean', 'dew_point_temp_median',
       'rel_hum_min', 'rel_hum_max', 'rel_hum_mean', 'rel_hum_median',
       'visibility_min', 'visibility_max', 'visibility_mean',
       'visibility_median', 'visibility_max_hour', 'visibility_min_hour',
       'press_min', 'press_max', 'press_mean', 'press_median', 'hmdxx_min',
       'hmdxx_max', 'hmdxx_mean', 'hmdxx_median'],
      dtype='object')

In [22]:
duplicated_columns = df.columns[df.columns.duplicated()]

if duplicated_columns.empty:
    print("No duplicated columns found.")
else:
    print("Duplicated columns found:")
    print(duplicated_columns)

No duplicated columns found.


In [23]:
df.tail()

Unnamed: 0_level_0,temp,dew_point_temp,rel_hum,wind_speed,visibility,press,wind_chill,weather,hour_of_day,year,month,day_of_week,day_of_year,week_of_year,quarter,hourly_demand,hmdxx,temp_min,temp_max,temp_mean,temp_median,temp_max_hour,temp_min_hour,dew_point_temp_min,dew_point_temp_max,dew_point_temp_mean,dew_point_temp_median,rel_hum_min,rel_hum_max,rel_hum_mean,rel_hum_median,visibility_min,visibility_max,visibility_mean,visibility_median,visibility_max_hour,visibility_min_hour,press_min,press_max,press_mean,press_median,hmdxx_min,hmdxx_max,hmdxx_mean,hmdxx_median
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1
2018-12-31 19:00:00,2.5,1.0,90.0,19.0,9.7,98.51,,"Rain,Fog",19.0,2018.0,12.0,0.0,365.0,1.0,4.0,16195.0,0.590925,-2.0,4.3,1.204167,2.25,23.0,5.0,-6.4,3.4,-1.645833,-1.3,65.0,94.0,81.708333,82.5,4.8,24.1,18.091667,24.1,0.0,17.0,97.37,100.03,99.3025,99.695,-5.229812,3.073132,-1.27833,0.062595
2018-12-31 20:00:00,2.9,1.6,92.0,21.0,9.7,98.17,,"Rain,Fog",20.0,2018.0,12.0,0.0,365.0,1.0,4.0,15668.0,1.151739,-2.0,4.3,1.204167,2.25,23.0,5.0,-6.4,3.4,-1.645833,-1.3,65.0,94.0,81.708333,82.5,4.8,24.1,18.091667,24.1,0.0,17.0,97.37,100.03,99.3025,99.695,-5.229812,3.073132,-1.27833,0.062595
2018-12-31 21:00:00,3.7,2.6,93.0,21.0,9.7,97.98,,"Rain,Fog",21.0,2018.0,12.0,0.0,365.0,1.0,4.0,14987.0,2.233969,-2.0,4.3,1.204167,2.25,23.0,5.0,-6.4,3.4,-1.645833,-1.3,65.0,94.0,81.708333,82.5,4.8,24.1,18.091667,24.1,0.0,17.0,97.37,100.03,99.3025,99.695,-5.229812,3.073132,-1.27833,0.062595
2018-12-31 22:00:00,4.0,3.0,93.0,18.0,6.4,97.61,,"Rain,Fog",22.0,2018.0,12.0,0.0,365.0,1.0,4.0,14560.0,2.652024,-2.0,4.3,1.204167,2.25,23.0,5.0,-6.4,3.4,-1.645833,-1.3,65.0,94.0,81.708333,82.5,4.8,24.1,18.091667,24.1,0.0,17.0,97.37,100.03,99.3025,99.695,-5.229812,3.073132,-1.27833,0.062595
2018-12-31 23:00:00,4.3,3.4,94.0,12.0,9.7,97.37,,"Rain,Fog",23.0,2018.0,12.0,0.0,365.0,1.0,4.0,13828.0,3.073132,-2.0,4.3,1.204167,2.25,23.0,5.0,-6.4,3.4,-1.645833,-1.3,65.0,94.0,81.708333,82.5,4.8,24.1,18.091667,24.1,0.0,17.0,97.37,100.03,99.3025,99.695,-5.229812,3.073132,-1.27833,0.062595


In [24]:
df.to_csv(clean_data_path/"for_todaily_conversion.csv")

In [26]:
clean_pre_df = df.copy(deep=True)

In [27]:
clean_pre_df.head()

Unnamed: 0_level_0,temp,dew_point_temp,rel_hum,wind_speed,visibility,press,wind_chill,weather,hour_of_day,year,month,day_of_week,day_of_year,week_of_year,quarter,hourly_demand,hmdxx,temp_min,temp_max,temp_mean,temp_median,temp_max_hour,temp_min_hour,dew_point_temp_min,dew_point_temp_max,dew_point_temp_mean,dew_point_temp_median,rel_hum_min,rel_hum_max,rel_hum_mean,rel_hum_median,visibility_min,visibility_max,visibility_mean,visibility_median,visibility_max_hour,visibility_min_hour,press_min,press_max,press_mean,press_median,hmdxx_min,hmdxx_max,hmdxx_mean,hmdxx_median
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1
1994-01-01 00:00:00,-1.2,-3.8,83.0,15.0,19.3,99.91,-6.0,Cloudy,0.0,1994.0,1.0,5.0,1.0,52.0,1.0,14422.0,-4.1915,-1.8,2.8,0.575,0.65,12.0,5.0,-4.8,1.1,-1.3875,-0.65,73.0,99.0,87.0,86.5,0.8,40.2,17.841667,16.1,11.0,14.0,98.51,99.91,99.075833,98.84,-4.976663,0.686009,-1.8849,-1.513575
1994-01-01 01:00:00,-0.9,-3.0,86.0,20.0,16.1,99.91,-6.0,Cloudy,1.0,1994.0,1.0,5.0,1.0,52.0,1.0,13845.0,-3.734133,-1.8,2.8,0.575,0.65,12.0,5.0,-4.8,1.1,-1.3875,-0.65,73.0,99.0,87.0,86.5,0.8,40.2,17.841667,16.1,11.0,14.0,98.51,99.91,99.075833,98.84,-4.976663,0.686009,-1.8849,-1.513575
1994-01-01 02:00:00,-0.7,-3.2,83.0,15.0,16.1,99.87,-5.0,Cloudy,2.0,1994.0,1.0,5.0,1.0,52.0,1.0,13372.0,-3.574269,-1.8,2.8,0.575,0.65,12.0,5.0,-4.8,1.1,-1.3875,-0.65,73.0,99.0,87.0,86.5,0.8,40.2,17.841667,16.1,11.0,14.0,98.51,99.91,99.075833,98.84,-4.976663,0.686009,-1.8849,-1.513575
1994-01-01 03:00:00,-0.8,-2.4,89.0,15.0,12.9,99.81,-5.0,Cloudy,3.0,1994.0,1.0,5.0,1.0,52.0,1.0,13025.0,-3.510459,-1.8,2.8,0.575,0.65,12.0,5.0,-4.8,1.1,-1.3875,-0.65,73.0,99.0,87.0,86.5,0.8,40.2,17.841667,16.1,11.0,14.0,98.51,99.91,99.075833,98.84,-4.976663,0.686009,-1.8849,-1.513575
1994-01-01 04:00:00,-1.0,-3.3,84.0,19.0,16.1,99.77,-6.0,Mostly Cloudy,4.0,1994.0,1.0,5.0,1.0,52.0,1.0,12869.0,-3.894137,-1.8,2.8,0.575,0.65,12.0,5.0,-4.8,1.1,-1.3875,-0.65,73.0,99.0,87.0,86.5,0.8,40.2,17.841667,16.1,11.0,14.0,98.51,99.91,99.075833,98.84,-4.976663,0.686009,-1.8849,-1.513575


For exporting hourly data, we drop weather columns related to min, max, mean, median

In [44]:
clean_pre_df.columns

Index(['temp', 'dew_point_temp', 'rel_hum', 'wind_speed', 'visibility',
       'press', 'wind_chill', 'weather', 'hour_of_day', 'year', 'month',
       'day_of_week', 'day_of_year', 'week_of_year', 'quarter',
       'hourly_demand', 'hmdxx', 'temp_min', 'temp_max', 'temp_mean',
       'temp_median', 'temp_max_hour', 'temp_min_hour', 'dew_point_temp_min',
       'dew_point_temp_max', 'dew_point_temp_mean', 'dew_point_temp_median',
       'rel_hum_min', 'rel_hum_max', 'rel_hum_mean', 'rel_hum_median',
       'visibility_min', 'visibility_max', 'visibility_mean',
       'visibility_median', 'visibility_max_hour', 'visibility_min_hour',
       'press_min', 'press_max', 'press_mean', 'press_median', 'hmdxx_min',
       'hmdxx_max', 'hmdxx_mean', 'hmdxx_median'],
      dtype='object')

In [46]:
drop_columns_2 = ['temp_min', 'temp_max', 'temp_mean',
       'temp_median', 'temp_max_hour', 'temp_min_hour', 'dew_point_temp_min',
       'dew_point_temp_max', 'dew_point_temp_mean', 'dew_point_temp_median',
       'rel_hum_min', 'rel_hum_max', 'rel_hum_mean', 'rel_hum_median',
       'visibility_min', 'visibility_max', 'visibility_mean',
       'visibility_median', 'visibility_max_hour', 'visibility_min_hour',
       'press_min', 'press_max', 'press_mean', 'press_median', 'hmdxx_min',
       'hmdxx_max', 'hmdxx_mean', 'hmdxx_median']

clean_pre_df.drop(columns = drop_columns_2, inplace=True)

In [47]:
clean_pre_df.shape

(219144, 17)

In [48]:
clean_pre_df.head(30)

Unnamed: 0_level_0,temp,dew_point_temp,rel_hum,wind_speed,visibility,press,wind_chill,weather,hour_of_day,year,month,day_of_week,day_of_year,week_of_year,quarter,hourly_demand,hmdxx
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1994-01-01 00:00:00,-1.2,-3.8,83.0,15.0,19.3,99.91,-6.0,Cloudy,0.0,1994.0,1.0,5.0,1.0,52.0,1.0,14422.0,-4.1915
1994-01-01 01:00:00,-0.9,-3.0,86.0,20.0,16.1,99.91,-6.0,Cloudy,1.0,1994.0,1.0,5.0,1.0,52.0,1.0,13845.0,-3.734133
1994-01-01 02:00:00,-0.7,-3.2,83.0,15.0,16.1,99.87,-5.0,Cloudy,2.0,1994.0,1.0,5.0,1.0,52.0,1.0,13372.0,-3.574269
1994-01-01 03:00:00,-0.8,-2.4,89.0,15.0,12.9,99.81,-5.0,Cloudy,3.0,1994.0,1.0,5.0,1.0,52.0,1.0,13025.0,-3.510459
1994-01-01 04:00:00,-1.0,-3.3,84.0,19.0,16.1,99.77,-6.0,Mostly Cloudy,4.0,1994.0,1.0,5.0,1.0,52.0,1.0,12869.0,-3.894137
1994-01-01 05:00:00,-1.8,-3.6,87.0,11.0,16.1,99.67,-6.0,Cloudy,5.0,1994.0,1.0,5.0,1.0,52.0,1.0,12866.0,-4.752946
1994-01-01 06:00:00,-1.7,-3.9,85.0,15.0,19.3,99.57,-7.0,Cloudy,6.0,1994.0,1.0,5.0,1.0,52.0,1.0,13015.0,-4.710583
1994-01-01 07:00:00,-1.8,-4.8,80.0,15.0,16.1,99.37,-7.0,Mostly Cloudy,7.0,1994.0,1.0,5.0,1.0,52.0,1.0,13577.0,-4.976663
1994-01-01 08:00:00,-0.5,-3.8,78.0,17.0,25.0,99.31,-5.0,Mostly Cloudy,8.0,1994.0,1.0,5.0,1.0,52.0,1.0,13567.0,-3.4915
1994-01-01 09:00:00,0.5,-3.7,73.0,24.0,25.0,99.14,,Mostly Cloudy,9.0,1994.0,1.0,5.0,1.0,52.0,1.0,14130.0,-2.472288


In [49]:
object_cols = clean_pre_df.select_dtypes(include=['object']).columns

print("columns with object dtype: ")
for col in object_cols:
  print(col)

columns with object dtype: 
weather


In [50]:
clean_pre_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 219144 entries, 1994-01-01 00:00:00 to 2018-12-31 23:00:00
Data columns (total 17 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   temp            219144 non-null  float64
 1   dew_point_temp  219144 non-null  float64
 2   rel_hum         219144 non-null  float64
 3   wind_speed      219144 non-null  float64
 4   visibility      219144 non-null  float64
 5   press           219144 non-null  float64
 6   wind_chill      48918 non-null   float64
 7   weather         191347 non-null  object 
 8   hour_of_day     219144 non-null  float64
 9   year            219144 non-null  float64
 10  month           219144 non-null  float64
 11  day_of_week     219144 non-null  float64
 12  day_of_year     219144 non-null  float64
 13  week_of_year    219144 non-null  float64
 14  quarter         219144 non-null  float64
 15  hourly_demand   219144 non-null  float64
 16  hmdxx           219144

In [51]:
clean_pre_df['weather'].unique()

array(['Cloudy', 'Mostly Cloudy', 'Snow', 'Moderate Snow,Fog', 'Snow,Fog',
       'Rain,Snow', 'Rain', 'Mainly Clear', 'Snow Showers', 'Clear',
       'Snow,Blowing Snow', 'Snow Showers,Blowing Snow', 'Fog',
       'Fog,Blowing Snow', 'Snow Grains', 'Freezing Drizzle,Fog',
       'Drizzle,Fog', 'Drizzle', 'Haze', 'Freezing Rain,Fog', 'Rain,Fog',
       'Rain,Drizzle,Fog', nan, 'Ice Fog', 'Snow,Snow Grains',
       'Moderate Snow', 'Blowing Snow', 'Moderate Snow,Snow Grains',
       'Moderate Snow,Snow Grains,Blowing Snow',
       'Snow,Snow Grains,Blowing Snow', 'Snow,Ice Pellets,Blowing Snow',
       'Moderate Snow,Blowing Snow', 'Snow Showers,Fog',
       'Freezing Drizzle,Snow Grains', 'Snow Grains,Fog', 'Rain Showers',
       'Rain,Snow,Fog', 'Moderate Rain,Fog', 'Snow,Snow Grains,Fog',
       'Rain Showers,Snow Showers,Fog', 'Rain Showers,Fog',
       'Rain Showers,Haze', 'Rain,Haze', 'Thunderstorms,Haze',
       'Thunderstorms,Rain Showers',
       'Thunderstorms,Moderate Rain Sh

In [52]:
clean_pre_df['weather'].nunique()

157

dropping weather attribute due to high number values

In [55]:
clean_pre_df.drop(columns='weather', inplace=True)

In [56]:
clean_pre_df.head()

Unnamed: 0_level_0,temp,dew_point_temp,rel_hum,wind_speed,visibility,press,wind_chill,hour_of_day,year,month,day_of_week,day_of_year,week_of_year,quarter,hourly_demand,hmdxx
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1994-01-01 00:00:00,-1.2,-3.8,83.0,15.0,19.3,99.91,-6.0,0.0,1994.0,1.0,5.0,1.0,52.0,1.0,14422.0,-4.1915
1994-01-01 01:00:00,-0.9,-3.0,86.0,20.0,16.1,99.91,-6.0,1.0,1994.0,1.0,5.0,1.0,52.0,1.0,13845.0,-3.734133
1994-01-01 02:00:00,-0.7,-3.2,83.0,15.0,16.1,99.87,-5.0,2.0,1994.0,1.0,5.0,1.0,52.0,1.0,13372.0,-3.574269
1994-01-01 03:00:00,-0.8,-2.4,89.0,15.0,12.9,99.81,-5.0,3.0,1994.0,1.0,5.0,1.0,52.0,1.0,13025.0,-3.510459
1994-01-01 04:00:00,-1.0,-3.3,84.0,19.0,16.1,99.77,-6.0,4.0,1994.0,1.0,5.0,1.0,52.0,1.0,12869.0,-3.894137


In [57]:
clean_pre_df.shape

(219144, 16)

In [64]:
def check_null_values():
  null_counts = clean_pre_df.isnull().sum()
  total_rows = len(clean_pre_df)
  null_percent = (null_counts/total_rows)*100
  print(null_percent)

In [None]:
check_null_values()

In [61]:
clean_pre_df.drop(columns='wind_chill', inplace = True)

In [65]:
check_null_values()

temp              0.0
dew_point_temp    0.0
rel_hum           0.0
wind_speed        0.0
visibility        0.0
press             0.0
hour_of_day       0.0
year              0.0
month             0.0
day_of_week       0.0
day_of_year       0.0
week_of_year      0.0
quarter           0.0
hourly_demand     0.0
hmdxx             0.0
dtype: float64


In [66]:
clean_pre_df.head()

Unnamed: 0_level_0,temp,dew_point_temp,rel_hum,wind_speed,visibility,press,hour_of_day,year,month,day_of_week,day_of_year,week_of_year,quarter,hourly_demand,hmdxx
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1994-01-01 00:00:00,-1.2,-3.8,83.0,15.0,19.3,99.91,0.0,1994.0,1.0,5.0,1.0,52.0,1.0,14422.0,-4.1915
1994-01-01 01:00:00,-0.9,-3.0,86.0,20.0,16.1,99.91,1.0,1994.0,1.0,5.0,1.0,52.0,1.0,13845.0,-3.734133
1994-01-01 02:00:00,-0.7,-3.2,83.0,15.0,16.1,99.87,2.0,1994.0,1.0,5.0,1.0,52.0,1.0,13372.0,-3.574269
1994-01-01 03:00:00,-0.8,-2.4,89.0,15.0,12.9,99.81,3.0,1994.0,1.0,5.0,1.0,52.0,1.0,13025.0,-3.510459
1994-01-01 04:00:00,-1.0,-3.3,84.0,19.0,16.1,99.77,4.0,1994.0,1.0,5.0,1.0,52.0,1.0,12869.0,-3.894137


In [68]:
clean_pre_df.shape

(219144, 15)

In [69]:
clean_pre_df.to_csv(clean_data_path/"clean_preprocessed_02.csv")