In [1]:
# import all necessary libraries
import os
from functools import reduce

import git
import pandas as pd

In [2]:
### Extracting Data
# Locate the root of the enclosing git working tree so every path below is
# independent of the directory the notebook is launched from.
base_root = git.Repo('.', search_parent_directories=True).working_tree_dir

# Directories are assembled with os.path.join instead of hard-coded '\\'
# separators so the notebook is not tied to Windows. The trailing '' makes each
# result end with a path separator, which keeps the `directory + 'file name'`
# concatenations used throughout the notebook working unchanged.
base_input = os.path.join(base_root, 'Input & Output', 'Input', '')
base_output = os.path.join(base_root, 'Input & Output', 'Output', '')

demand_output = os.path.join(base_output, 'Demand', '')
supply_output = os.path.join(base_output, 'Supply', '')
market_price_output = os.path.join(base_output, 'Market Price', '')
weather_output = os.path.join(base_output, 'Weather', '')
# NOTE: the spelling of `anaylsis_input` is kept so any other cell that refers
# to this name continues to work.
anaylsis_input = os.path.join(base_input, 'Post Analysis', '')
population_input = os.path.join(base_input, 'Population', '')

# Read every sheet of each workbook (sheet_name=None) into a dictionary;
# key: sheet name, value: sheet dataframe
demand_dict = pd.read_excel(demand_output + 'toronto_demand_time_series_data.xlsx', sheet_name = None)
supply_dict = pd.read_excel(supply_output + 'ontario_supply_time_series_data.xlsx', sheet_name = None)
price_dict = pd.read_excel(market_price_output + 'toronto_price_time_series_data.xlsx', sheet_name = None)
gas_price_dict = pd.read_excel(market_price_output + 'natural_gas_price_time_series_data.xlsx', sheet_name = None)
weather_dict = pd.read_excel(weather_output + 'weather_time_series_data.xlsx', sheet_name = None)

# Single-table inputs: the data dictionary (CSV) and the yearly population workbook
df_data_dictionary = pd.read_csv(anaylsis_input + 'data_dictionary.csv')
df_population = pd.read_excel(population_input + 'toronto_yearly_population.xlsx')


In [3]:
### Generating daily and monthly combined dataframes
lst_of_dict = [demand_dict, supply_dict, price_dict, gas_price_dict, weather_dict]

# Pull the 'daily average' and 'monthly average' sheets out of every workbook,
# converting their 'Date' column to datetime so the frames can be merged on it.
lst_of_daily_df = []
lst_of_monthly_df = []
# The loop variable is deliberately not called `dict`: that name would shadow
# the builtin for every cell executed afterwards in the same kernel.
for workbook in lst_of_dict:
    # Fail with the name of the missing sheet instead of an opaque
    # "'NoneType' object is not subscriptable" further down.
    missing_sheets = [
        sheet for sheet in ('daily average', 'monthly average') if sheet not in workbook
    ]
    if missing_sheets:
        raise KeyError(
            f"Workbook is missing sheet(s) {missing_sheets}; available sheets: {list(workbook)}"
        )

    df_daily = workbook['daily average']
    df_daily['Date'] = pd.to_datetime(df_daily['Date'])
    df_monthly = workbook['monthly average']
    df_monthly['Date'] = pd.to_datetime(df_monthly['Date'])
    lst_of_daily_df.append(df_daily)
    lst_of_monthly_df.append(df_monthly)


# Pairwise combiner used to fold the per-source frames into one table
def merge_dfs(left, right):
    """Left-join ``right`` onto ``left`` using the shared 'Date' column.

    Every row of ``left`` is kept; columns of ``right`` are filled in where a
    matching 'Date' exists and are missing (NaN) otherwise.
    """
    return left.merge(right, how='left', on='Date')

# Fold each list of frames into one table, left to right: the first frame in
# the list is the base and every later frame is left-joined onto it by 'Date'.
df_all_field_daily, df_all_field_monthly = [
    reduce(merge_dfs, frames)
    for frames in (lst_of_daily_df, lst_of_monthly_df)
]

In [4]:
### Pre-process the Data
# Inclusive analysis window. Both bounds use zero-padded ISO dates (YYYY-MM-DD)
# so they are unambiguous and consistent with each other.
start_date = '2003-05-01'
end_date = '2025-04-30'

# Keep only the rows whose 'Date' falls inside the window (both ends included)
daily_in_window = (df_all_field_daily['Date'] >= start_date) & (df_all_field_daily['Date'] <= end_date)
monthly_in_window = (df_all_field_monthly['Date'] >= start_date) & (df_all_field_monthly['Date'] <= end_date)
df_all_field_daily = df_all_field_daily[daily_in_window]
df_all_field_monthly = df_all_field_monthly[monthly_in_window]


# Standardizing column names: raw column name -> readable name.
# Keys that are not present in a frame are simply left out of the rename.
rename_dict = {
    'avg_hourly_temperature': 'Temperature',
    'avg_hourly_relative_humidity': 'Relative Humidity',
    'avg_hourly_dew_point': 'Dew Point',
    'avg_hourly_wind_speed': 'Wind Speed',
    'avg_hourly_pressure_sea': 'Sea Level Pressure',
    'precipitation': 'Precipitation (mm)'
}
df_all_field_daily = df_all_field_daily.rename(columns = rename_dict)
df_all_field_monthly = df_all_field_monthly.rename(columns = rename_dict)

In [5]:
### Enriching the dataset
def enrich_dataset(df, level = 'day'):
    """Return a copy of ``df`` with calendar and derived feature columns added.

    The 'Date' column is converted to datetime and split into calendar parts
    whose granularity depends on ``level``; two derived columns are then added.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Date', 'Temperature' and 'Toronto Demand'.
    level : str, default 'day'
        'day'   -> adds Year, Month, Day, DayOfWeek (Monday=0) and IsWeekend
                   (1 for Saturday/Sunday, otherwise 0).
        'month' -> adds Year and Month.
        any other value -> adds Year only.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left untouched. In addition to the
        calendar columns it holds:
        - 'TempSquared': 'Temperature' squared.
        - 'PrevDayDemand': 'Toronto Demand' of the previous row (missing for the
          first row). This is the previous *day* only when rows are daily and
          already in chronological order; the function does not sort.

    Raises
    ------
    KeyError
        If any of the required columns is absent.
    """
    required_columns = ('Date', 'Temperature', 'Toronto Demand')
    missing = [col for col in required_columns if col not in df.columns]
    if missing:
        raise KeyError(f"enrich_dataset: missing required column(s): {missing}")

    # Work on a copy so the caller's frame is not modified as a side effect.
    enriched = df.copy()
    enriched['Date'] = pd.to_datetime(enriched['Date'])
    dates = enriched['Date'].dt

    # Year is needed at every level; finer calendar parts only where relevant.
    enriched['Year'] = dates.year
    if level in ('day', 'month'):
        enriched['Month'] = dates.month
    if level == 'day':
        enriched['Day'] = dates.day
        enriched['DayOfWeek'] = dates.dayofweek
        # Vectorised weekend flag: dayofweek 5 and 6 are Saturday and Sunday.
        enriched['IsWeekend'] = (enriched['DayOfWeek'] >= 5).astype('int64')

    enriched['TempSquared'] = enriched['Temperature'] ** 2
    enriched['PrevDayDemand'] = enriched['Toronto Demand'].shift(1)
    return enriched

# Finish each table in one pipeline:
#   1. add the calendar / derived columns via enrich_dataset
#   2. attach the population figures by 'Year' (population is yearly data)
#   3. remove the unwanted 'daylight' column
#   4. replace every remaining missing value with 0
# NOTE(review): step 4 also turns the first 'PrevDayDemand' value (missing
# because of the shift) and any other gaps into 0 -- confirm this is intended
# for the downstream analysis.
df_all_field_daily = (
    enrich_dataset(df_all_field_daily)
    .merge(df_population, how='left', on='Year')
    .drop(columns='daylight')
    .fillna(0)
)

df_all_field_monthly = (
    enrich_dataset(df_all_field_monthly, 'month')
    .merge(df_population, how='left', on='Year')
    .drop(columns='daylight')
    .fillna(0)
)

In [None]:
### Outputting dataframes
# Built with os.path.join (trailing '' keeps the final separator) so the path is
# not tied to Windows-style '\\' separators.
anaylsis_output = os.path.join(base_output, 'Post Analysis', '')
# Create the folder if needed: git does not track empty directories, so it may
# be absent on a fresh clone and the Excel writer would fail to open the file.
os.makedirs(anaylsis_output, exist_ok=True)

# Output to excel: one workbook per granularity, each holding the data
# dictionary on the first sheet and the combined data on the second.
for workbook_name, df_all_fields in (
    ('all_field_data_daily.xlsx', df_all_field_daily),
    ('all_field_data_monthly.xlsx', df_all_field_monthly),
):
    with pd.ExcelWriter(os.path.join(anaylsis_output, workbook_name)) as writer:
        df_data_dictionary.to_excel(writer, sheet_name='Data Dictionary', index=False)
        df_all_fields.to_excel(writer, sheet_name='All Data Fields', index=False)

: 