In [11]:
from pathlib import Path
from datetime import datetime
from os import PathLike
from typing import Union
import dateutil
import math

# data manipulation and analysis
import numpy as np 
import pandas as pd
from astral import Astral

# Visualization
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
import matplotlib.dates as mdates
from matplotlib.dates import date2num
import seaborn as sns
from prettytable import PrettyTable
import plotly.express as px # An interactive graphing library that makes interactive, publication-quality graphs online.
import plotly.graph_objs as go # An interactive graphing library that makes interactive, publication-quality graphs online.

# ML model selection and evaluation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# preprocessing
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Disable pandas' display truncation so every row and every column of a
# DataFrame is rendered in cell outputs.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

In [38]:
# Input (cleaned hourly data) and output (model-ready data) directories.
# NOTE(review): these are absolute paths on one developer's machine; the
# notebook will not run elsewhere until they are pointed at a local copy.
clean_data_dir, usable_data_dir = (
    '/Users/yashwanthkaruparthi/Developer/energy_demand/research/data/6-clean',
    '/Users/yashwanthkaruparthi/Developer/energy_demand/research/data/7-usable',
)

# Path objects so file names can be appended with the `/` operator.
clean_data_path, usable_data_path = Path(clean_data_dir), Path(usable_data_dir)

In [22]:
# Load the cleaned hourly table that will be collapsed to one row per day.
hourly_csv_path = clean_data_path / 'for_todaily_conversion.csv'
df_hour = pd.read_csv(hourly_csv_path)

In [23]:
# Parse the 'datetime' column into pandas datetimes, replacing the raw CSV values.
parsed_timestamps = pd.to_datetime(df_hour['datetime'])
df_hour['datetime'] = parsed_timestamps

In [24]:
# Add a 'date' column holding the calendar day (datetime.date) of each timestamp.
calendar_days = df_hour['datetime'].dt.date
df_hour['date'] = calendar_days

In [25]:
# Preview the first five hourly rows, including the new 'date' column.
df_hour.head()

Unnamed: 0,datetime,temp,dew_point_temp,rel_hum,wind_speed,visibility,press,wind_chill,weather,hour_of_day,year,month,day_of_week,day_of_year,week_of_year,quarter,hourly_demand,hmdxx,temp_min,temp_max,temp_mean,temp_median,temp_max_hour,temp_min_hour,dew_point_temp_min,dew_point_temp_max,dew_point_temp_mean,dew_point_temp_median,rel_hum_min,rel_hum_max,rel_hum_mean,rel_hum_median,visibility_min,visibility_max,visibility_mean,visibility_median,visibility_max_hour,visibility_min_hour,press_min,press_max,press_mean,press_median,hmdxx_min,hmdxx_max,hmdxx_mean,hmdxx_median,date
0,1994-01-01 00:00:00,-1.2,-3.8,83.0,15.0,19.3,99.91,-6.0,Cloudy,0.0,1994.0,1.0,5.0,1.0,52.0,1.0,14422.0,-4.1915,-1.8,2.8,0.575,0.65,12.0,5.0,-4.8,1.1,-1.3875,-0.65,73.0,99.0,87.0,86.5,0.8,40.2,17.841667,16.1,11.0,14.0,98.51,99.91,99.075833,98.84,-4.976663,0.686009,-1.8849,-1.513575,1994-01-01
1,1994-01-01 01:00:00,-0.9,-3.0,86.0,20.0,16.1,99.91,-6.0,Cloudy,1.0,1994.0,1.0,5.0,1.0,52.0,1.0,13845.0,-3.734133,-1.8,2.8,0.575,0.65,12.0,5.0,-4.8,1.1,-1.3875,-0.65,73.0,99.0,87.0,86.5,0.8,40.2,17.841667,16.1,11.0,14.0,98.51,99.91,99.075833,98.84,-4.976663,0.686009,-1.8849,-1.513575,1994-01-01
2,1994-01-01 02:00:00,-0.7,-3.2,83.0,15.0,16.1,99.87,-5.0,Cloudy,2.0,1994.0,1.0,5.0,1.0,52.0,1.0,13372.0,-3.574269,-1.8,2.8,0.575,0.65,12.0,5.0,-4.8,1.1,-1.3875,-0.65,73.0,99.0,87.0,86.5,0.8,40.2,17.841667,16.1,11.0,14.0,98.51,99.91,99.075833,98.84,-4.976663,0.686009,-1.8849,-1.513575,1994-01-01
3,1994-01-01 03:00:00,-0.8,-2.4,89.0,15.0,12.9,99.81,-5.0,Cloudy,3.0,1994.0,1.0,5.0,1.0,52.0,1.0,13025.0,-3.510459,-1.8,2.8,0.575,0.65,12.0,5.0,-4.8,1.1,-1.3875,-0.65,73.0,99.0,87.0,86.5,0.8,40.2,17.841667,16.1,11.0,14.0,98.51,99.91,99.075833,98.84,-4.976663,0.686009,-1.8849,-1.513575,1994-01-01
4,1994-01-01 04:00:00,-1.0,-3.3,84.0,19.0,16.1,99.77,-6.0,Mostly Cloudy,4.0,1994.0,1.0,5.0,1.0,52.0,1.0,12869.0,-3.894137,-1.8,2.8,0.575,0.65,12.0,5.0,-4.8,1.1,-1.3875,-0.65,73.0,99.0,87.0,86.5,0.8,40.2,17.841667,16.1,11.0,14.0,98.51,99.91,99.075833,98.84,-4.976663,0.686009,-1.8849,-1.513575,1994-01-01


In [26]:
# Remove the text-valued 'weather' column (e.g. "Cloudy", "Mostly Cloudy").
# Rebinding instead of inplace=True, together with errors='ignore', keeps this
# cell idempotent: re-running it after the column is already gone no longer
# raises KeyError.
df_hour = df_hour.drop(columns='weather', errors='ignore')

In [27]:
# Preview the frame again to confirm the 'weather' column is no longer present.
df_hour.head()

Unnamed: 0,datetime,temp,dew_point_temp,rel_hum,wind_speed,visibility,press,wind_chill,hour_of_day,year,month,day_of_week,day_of_year,week_of_year,quarter,hourly_demand,hmdxx,temp_min,temp_max,temp_mean,temp_median,temp_max_hour,temp_min_hour,dew_point_temp_min,dew_point_temp_max,dew_point_temp_mean,dew_point_temp_median,rel_hum_min,rel_hum_max,rel_hum_mean,rel_hum_median,visibility_min,visibility_max,visibility_mean,visibility_median,visibility_max_hour,visibility_min_hour,press_min,press_max,press_mean,press_median,hmdxx_min,hmdxx_max,hmdxx_mean,hmdxx_median,date
0,1994-01-01 00:00:00,-1.2,-3.8,83.0,15.0,19.3,99.91,-6.0,0.0,1994.0,1.0,5.0,1.0,52.0,1.0,14422.0,-4.1915,-1.8,2.8,0.575,0.65,12.0,5.0,-4.8,1.1,-1.3875,-0.65,73.0,99.0,87.0,86.5,0.8,40.2,17.841667,16.1,11.0,14.0,98.51,99.91,99.075833,98.84,-4.976663,0.686009,-1.8849,-1.513575,1994-01-01
1,1994-01-01 01:00:00,-0.9,-3.0,86.0,20.0,16.1,99.91,-6.0,1.0,1994.0,1.0,5.0,1.0,52.0,1.0,13845.0,-3.734133,-1.8,2.8,0.575,0.65,12.0,5.0,-4.8,1.1,-1.3875,-0.65,73.0,99.0,87.0,86.5,0.8,40.2,17.841667,16.1,11.0,14.0,98.51,99.91,99.075833,98.84,-4.976663,0.686009,-1.8849,-1.513575,1994-01-01
2,1994-01-01 02:00:00,-0.7,-3.2,83.0,15.0,16.1,99.87,-5.0,2.0,1994.0,1.0,5.0,1.0,52.0,1.0,13372.0,-3.574269,-1.8,2.8,0.575,0.65,12.0,5.0,-4.8,1.1,-1.3875,-0.65,73.0,99.0,87.0,86.5,0.8,40.2,17.841667,16.1,11.0,14.0,98.51,99.91,99.075833,98.84,-4.976663,0.686009,-1.8849,-1.513575,1994-01-01
3,1994-01-01 03:00:00,-0.8,-2.4,89.0,15.0,12.9,99.81,-5.0,3.0,1994.0,1.0,5.0,1.0,52.0,1.0,13025.0,-3.510459,-1.8,2.8,0.575,0.65,12.0,5.0,-4.8,1.1,-1.3875,-0.65,73.0,99.0,87.0,86.5,0.8,40.2,17.841667,16.1,11.0,14.0,98.51,99.91,99.075833,98.84,-4.976663,0.686009,-1.8849,-1.513575,1994-01-01
4,1994-01-01 04:00:00,-1.0,-3.3,84.0,19.0,16.1,99.77,-6.0,4.0,1994.0,1.0,5.0,1.0,52.0,1.0,12869.0,-3.894137,-1.8,2.8,0.575,0.65,12.0,5.0,-4.8,1.1,-1.3875,-0.65,73.0,99.0,87.0,86.5,0.8,40.2,17.841667,16.1,11.0,14.0,98.51,99.91,99.075833,98.84,-4.976663,0.686009,-1.8849,-1.513575,1994-01-01


In [28]:
# 'datetime' is intentionally left as a regular column (the frame keeps its
# default integer index); the daily aggregation groups on the 'date' column.
df_hour.head()

Unnamed: 0,datetime,temp,dew_point_temp,rel_hum,wind_speed,visibility,press,wind_chill,hour_of_day,year,month,day_of_week,day_of_year,week_of_year,quarter,hourly_demand,hmdxx,temp_min,temp_max,temp_mean,temp_median,temp_max_hour,temp_min_hour,dew_point_temp_min,dew_point_temp_max,dew_point_temp_mean,dew_point_temp_median,rel_hum_min,rel_hum_max,rel_hum_mean,rel_hum_median,visibility_min,visibility_max,visibility_mean,visibility_median,visibility_max_hour,visibility_min_hour,press_min,press_max,press_mean,press_median,hmdxx_min,hmdxx_max,hmdxx_mean,hmdxx_median,date
0,1994-01-01 00:00:00,-1.2,-3.8,83.0,15.0,19.3,99.91,-6.0,0.0,1994.0,1.0,5.0,1.0,52.0,1.0,14422.0,-4.1915,-1.8,2.8,0.575,0.65,12.0,5.0,-4.8,1.1,-1.3875,-0.65,73.0,99.0,87.0,86.5,0.8,40.2,17.841667,16.1,11.0,14.0,98.51,99.91,99.075833,98.84,-4.976663,0.686009,-1.8849,-1.513575,1994-01-01
1,1994-01-01 01:00:00,-0.9,-3.0,86.0,20.0,16.1,99.91,-6.0,1.0,1994.0,1.0,5.0,1.0,52.0,1.0,13845.0,-3.734133,-1.8,2.8,0.575,0.65,12.0,5.0,-4.8,1.1,-1.3875,-0.65,73.0,99.0,87.0,86.5,0.8,40.2,17.841667,16.1,11.0,14.0,98.51,99.91,99.075833,98.84,-4.976663,0.686009,-1.8849,-1.513575,1994-01-01
2,1994-01-01 02:00:00,-0.7,-3.2,83.0,15.0,16.1,99.87,-5.0,2.0,1994.0,1.0,5.0,1.0,52.0,1.0,13372.0,-3.574269,-1.8,2.8,0.575,0.65,12.0,5.0,-4.8,1.1,-1.3875,-0.65,73.0,99.0,87.0,86.5,0.8,40.2,17.841667,16.1,11.0,14.0,98.51,99.91,99.075833,98.84,-4.976663,0.686009,-1.8849,-1.513575,1994-01-01
3,1994-01-01 03:00:00,-0.8,-2.4,89.0,15.0,12.9,99.81,-5.0,3.0,1994.0,1.0,5.0,1.0,52.0,1.0,13025.0,-3.510459,-1.8,2.8,0.575,0.65,12.0,5.0,-4.8,1.1,-1.3875,-0.65,73.0,99.0,87.0,86.5,0.8,40.2,17.841667,16.1,11.0,14.0,98.51,99.91,99.075833,98.84,-4.976663,0.686009,-1.8849,-1.513575,1994-01-01
4,1994-01-01 04:00:00,-1.0,-3.3,84.0,19.0,16.1,99.77,-6.0,4.0,1994.0,1.0,5.0,1.0,52.0,1.0,12869.0,-3.894137,-1.8,2.8,0.575,0.65,12.0,5.0,-4.8,1.1,-1.3875,-0.65,73.0,99.0,87.0,86.5,0.8,40.2,17.841667,16.1,11.0,14.0,98.51,99.91,99.075833,98.84,-4.976663,0.686009,-1.8849,-1.513575,1994-01-01


In [29]:
# List the columns available for the daily aggregation.
df_hour.columns

Index(['datetime', 'temp', 'dew_point_temp', 'rel_hum', 'wind_speed',
       'visibility', 'press', 'wind_chill', 'hour_of_day', 'year', 'month',
       'day_of_week', 'day_of_year', 'week_of_year', 'quarter',
       'hourly_demand', 'hmdxx', 'temp_min', 'temp_max', 'temp_mean',
       'temp_median', 'temp_max_hour', 'temp_min_hour', 'dew_point_temp_min',
       'dew_point_temp_max', 'dew_point_temp_mean', 'dew_point_temp_median',
       'rel_hum_min', 'rel_hum_max', 'rel_hum_mean', 'rel_hum_median',
       'visibility_min', 'visibility_max', 'visibility_mean',
       'visibility_median', 'visibility_max_hour', 'visibility_min_hour',
       'press_min', 'press_max', 'press_mean', 'press_median', 'hmdxx_min',
       'hmdxx_max', 'hmdxx_mean', 'hmdxx_median', 'date'],
      dtype='object')

In [31]:
# Map every column kept in the daily table to the aggregation applied to it:
#   * raw hourly readings are averaged over the day;
#   * each '<variable>_<stat>' column is reduced with the statistic in its name.
aggregation_methods = {
    **dict.fromkeys(
        [
            'temp',
            'dew_point_temp',
            'rel_hum',
            'wind_speed',
            'visibility',
            'press',
            'wind_chill',
            'hourly_demand',
            'hmdxx',
        ],
        'mean',
    ),
    **{
        f'{variable}_{statistic}': statistic
        for variable in ('temp', 'dew_point_temp', 'rel_hum', 'visibility', 'press', 'hmdxx')
        for statistic in ('min', 'max', 'mean', 'median')
    },
}

# Collapse the hourly rows to one row per calendar day: group on 'date',
# apply the per-column aggregations, then turn 'date' back into a column.
hourly_by_date = df_hour.groupby('date')
daily_data = hourly_by_date.agg(aggregation_methods).reset_index()


In [32]:
# Preview the first five rows of the aggregated daily table.
daily_data.head()

Unnamed: 0,date,temp,dew_point_temp,rel_hum,wind_speed,visibility,press,wind_chill,hourly_demand,hmdxx,temp_min,temp_max,temp_mean,temp_median,dew_point_temp_min,dew_point_temp_max,dew_point_temp_mean,dew_point_temp_median,rel_hum_min,rel_hum_max,rel_hum_mean,rel_hum_median,visibility_min,visibility_max,visibility_mean,visibility_median,press_min,press_max,press_mean,press_median,hmdxx_min,hmdxx_max,hmdxx_mean,hmdxx_median
0,1994-01-01,0.575,-1.3875,87.0,20.333333,17.841667,99.075833,-5.888889,14655.208333,-1.8849,-1.8,2.8,0.575,0.65,-4.8,1.1,-1.3875,-0.65,73.0,99.0,87.0,86.5,0.8,40.2,17.841667,16.1,98.51,99.91,99.075833,98.84,-4.976663,0.686009,-1.8849,-1.513575
1,1994-01-02,-8.704167,-12.445833,74.416667,17.875,28.3875,99.92,-16.681818,15783.333333,-12.827363,-14.3,1.7,-8.704167,-9.9,-20.3,0.5,-12.445833,-13.9,60.0,92.0,74.416667,75.5,6.4,40.2,28.3875,25.0,98.77,100.51,99.92,100.125,-19.165499,-0.338394,-12.827363,-14.393998
2,1994-01-03,-12.6125,-15.820833,77.25,17.666667,10.404167,100.057083,-20.583333,18992.708333,-17.143932,-16.3,-10.3,-12.6125,-12.0,-23.2,-12.6,-15.820833,-15.0,55.0,91.0,77.25,80.5,1.2,25.0,10.404167,8.0,99.41,100.47,100.057083,100.155,-21.317386,-14.555717,-17.143932,-16.423198
3,1994-01-04,-9.833333,-13.570833,74.416667,25.791667,24.879167,98.675417,-18.541667,19080.708333,-14.183286,-11.5,-7.4,-9.833333,-9.95,-15.0,-11.5,-13.570833,-13.7,57.0,84.0,74.416667,76.0,9.7,40.2,24.879167,24.1,98.27,99.34,98.675417,98.61,-15.874948,-11.70666,-14.183286,-14.310177
4,1994-01-05,-11.866667,-16.154167,70.791667,22.875,27.033333,99.352083,-20.521739,19183.208333,-16.407695,-17.5,-7.2,-11.866667,-11.75,-20.8,-10.7,-16.154167,-17.0,56.0,84.0,70.791667,74.0,1.0,40.2,27.033333,25.0,98.44,100.37,99.352083,99.375,-22.394182,-11.256395,-16.407695,-16.464188


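The daily dataset drops the calendar features `day_of_week`, `day_of_year`, `week_of_year`, and `quarter`. The aggregation also leaves out `hour_of_day`, `year`, `month`, and the `*_max_hour` / `*_min_hour` columns. Add these features back if they are needed to improve model performance.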

In [33]:
# (rows, columns) of the daily table.
daily_data.shape

(9131, 34)

In [34]:
# Inspect the last five rows to see where the daily series ends.
daily_data.tail()

Unnamed: 0,date,temp,dew_point_temp,rel_hum,wind_speed,visibility,press,wind_chill,hourly_demand,hmdxx,temp_min,temp_max,temp_mean,temp_median,dew_point_temp_min,dew_point_temp_max,dew_point_temp_mean,dew_point_temp_median,rel_hum_min,rel_hum_max,rel_hum_mean,rel_hum_median,visibility_min,visibility_max,visibility_mean,visibility_median,press_min,press_max,press_mean,press_median,hmdxx_min,hmdxx_max,hmdxx_mean,hmdxx_median
9126,2018-12-27,-2.541667,-6.575,74.166667,17.75,23.633333,100.791667,-9.166667,15698.625,-5.917272,-6.9,2.5,-2.541667,-1.95,-12.7,0.0,-6.575,-5.35,61.0,83.0,74.166667,77.0,12.9,24.1,23.633333,24.1,99.31,101.5,100.791667,101.075,-11.155717,0.336425,-5.917272,-5.212529
9127,2018-12-28,8.016667,6.254167,88.833333,15.75,17.204167,98.661667,,14809.208333,7.868511,2.7,12.4,8.016667,8.35,0.4,10.8,6.254167,7.25,74.0,95.0,88.833333,91.5,3.6,24.1,17.204167,24.1,98.37,99.21,98.661667,98.635,0.738694,13.912823,7.868511,8.262797
9128,2018-12-29,-2.525,-7.520833,68.875,19.916667,24.033333,99.90375,-9.722222,15135.791667,-6.06301,-7.0,3.6,-2.525,-3.05,-11.4,-0.6,-7.520833,-9.35,56.0,80.0,68.875,71.0,22.5,24.1,24.033333,24.1,98.85,100.43,99.90375,100.135,-11.125707,1.291505,-6.06301,-7.026121
9129,2018-12-30,-2.404167,-5.829167,77.666667,10.958333,19.358333,99.869167,-6.15,15326.625,-5.735336,-6.4,0.6,-2.404167,-1.6,-10.5,-2.8,-5.829167,-5.4,68.0,88.0,77.666667,75.5,1.6,24.1,19.358333,24.1,99.65,100.15,99.869167,99.87,-10.147759,-2.448366,-5.735336,-4.982904
9130,2018-12-31,1.204167,-1.645833,81.708333,10.458333,18.091667,99.3025,-3.0,15016.208333,-1.27833,-2.0,4.3,1.204167,2.25,-6.4,3.4,-1.645833,-1.3,65.0,94.0,81.708333,82.5,4.8,24.1,18.091667,24.1,97.37,100.03,99.3025,99.695,-5.229812,3.073132,-1.27833,0.062595


In [36]:
# Sanity checks on the daily aggregation.
# Columns compared between the hourly source and the daily result.
summary_columns = ['temp', 'dew_point_temp', 'rel_hum', 'wind_speed', 'visibility', 'press']

# Row count versus number of distinct dates (equal when each day appears once).
total_rows = len(daily_data)
daily_dates = daily_data['date']
unique_dates_count = daily_dates.nunique()

# First and last day covered by the daily table.
date_range = (daily_dates.min(), daily_dates.max())

# Descriptive statistics before and after aggregation.
original_summary = df_hour[summary_columns].describe()
daily_summary = daily_data[summary_columns].describe()

unique_dates_count, total_rows, date_range, original_summary, daily_summary


(9131,
 9131,
 (datetime.date(1994, 1, 1), datetime.date(2018, 12, 31)),
                 temp  dew_point_temp        rel_hum     wind_speed   
 count  219144.000000   219144.000000  219144.000000  219144.000000  \
 mean        8.926518        3.241653      69.773021      15.769875   
 std        11.017488       10.322492      16.304554       9.713914   
 min       -30.300000      -36.100000      13.000000       0.000000   
 25%         0.600000       -4.300000      59.000000       9.000000   
 50%         9.000000        3.400000      71.000000      15.000000   
 75%        18.100000       11.900000      82.000000      21.000000   
 max        37.500000       26.600000     100.000000      85.000000   
 
           visibility          press  
 count  219144.000000  219144.000000  
 mean       21.260437      99.517873  
 std         7.334402       0.765601  
 min         0.000000      95.910000  
 25%        19.300000      99.060000  
 50%        24.100000      99.540000  
 75%        2

In [39]:
# Persist the daily table for downstream use.
# NOTE(review): the default index is written as an extra unnamed first column
# (no index=False) — confirm downstream readers expect it.
# NOTE(review): the file is named '1-hourly_data.csv' although it holds the
# daily aggregation — confirm before renaming, other code may read this name.
daily_output_csv = usable_data_path / '1-hourly_data.csv'
daily_data.to_csv(daily_output_csv)