## Exploratory Data Analysis

In [1]:
# --- Import necessary libraries ---
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import calendar
import ipywidgets as widgets
from ipywidgets import interact

# --- Set global Matplotlib parameters ---
# All figure defaults are applied in a single update so the notebook's
# plotting style is defined in one place.
plt.rcParams.update({
    # Typography
    "font.family": "serif",
    "figure.dpi": 1000,
    "font.size": 20,
    "axes.labelsize": 20,
    "axes.titlesize": 20,
    "legend.fontsize": 20,
    # Ticks point into the axes; major ticks are longer than minor ticks
    "xtick.direction": "in",
    "ytick.direction": "in",
    "xtick.major.size": 5.0,
    "xtick.minor.size": 3.0,
    "ytick.major.size": 5.0,
    "ytick.minor.size": 3.0,
    # Frame and legend geometry
    "axes.linewidth": 1.5,
    "legend.handlelength": 2.0,
})

In [2]:
# --- Load training data (Years 1 and 2) ---
# Read from the 'Data' sheet of the training workbook.
train_df = pd.read_excel(
    io='.././datasets/training.xlsx',
    sheet_name='Data',
)

# --- Load testing data (Year 3) ---
# Read from the 'Data' sheet of the testing workbook.
test_df = pd.read_excel(
    io='.././datasets/testing.xlsx',
    sheet_name='Data',
)

### Descriptive Statistics

In [5]:
# --- Dynamically find the columns for temperature and GHI ---
# Site columns are matched by substring and ordered by name. The "Site N"
# labels built further down are assigned by position in this sorted order.
# NOTE(review): the sort is lexicographic on the column names, so with ten or
# more sites a suffix like "10" would sort before "2" -- confirm the naming
# scheme if the number of sites grows.
load_col = ['Load']
temp_cols = sorted(col for col in train_df.columns if 'Temp' in col)
ghi_cols = sorted(col for col in train_df.columns if 'GHI' in col)
all_cols = load_col + temp_cols + ghi_cols

# --- Create a list to hold the statistics for each year ---
stats_list = []

# --- Loop through each year in the training data ---
for year in sorted(train_df['Year'].unique()):
    # Select this year's rows and the analysed columns in one .loc call
    # (avoids chained indexing).
    year_df = train_df.loc[train_df['Year'] == year, all_cols]

    # Calculate descriptive statistics: one Series per statistic, each
    # indexed by the analysed column names.
    stats = {
        'Mean': year_df.mean(),
        'Variance': year_df.var(),
        'Median': year_df.median(),
        'Min': year_df.min(),
        'Max': year_df.max(),
        'Skewness': year_df.skew(),
        'Kurtosis': year_df.kurt()
    }

    # Transpose so that statistics are rows and variables are columns.
    stats_df = pd.DataFrame(stats).T

    # Prefix the row index with the year, giving a (Year, statistic)
    # MultiIndex whose first level is named 'Year'.
    stats_df.index = pd.MultiIndex.from_product(
        [[year], stats_df.index], names=['Year', None]
    )
    stats_list.append(stats_df)

# --- Concatenate the results for all years into a single DataFrame ---
summary_stats = pd.concat(stats_list)

# --- Create the multi-level column headers for the final table ---
# The header is derived from the columns that were actually discovered above,
# so its length always matches the table (a fixed five-site header would raise
# a length-mismatch error for any other number of sites). The order mirrors
# all_cols: load first, then temperature sites, then GHI sites.
summary_stats.columns = pd.MultiIndex.from_tuples(
    [('Load', '')]
    + [('Temperature', 'Site {}'.format(site))
       for site in range(1, len(temp_cols) + 1)]
    + [('Global Horizontal Irradiance', 'Site {}'.format(site))
       for site in range(1, len(ghi_cols) + 1)]
)

print("Descriptive Statistics for Training Data (Years 1 & 2)")
display(summary_stats.style.format('{:.2f}'))

Descriptive Statistics for Training Data (Years 1 & 2)


Unnamed: 0_level_0,Unnamed: 1_level_0,Load,Temperature,Temperature,Temperature,Temperature,Temperature,Global Horizontal Irradiance,Global Horizontal Irradiance,Global Horizontal Irradiance,Global Horizontal Irradiance,Global Horizontal Irradiance
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Site 1,Site 2,Site 3,Site 4,Site 5,Site 1,Site 2,Site 3,Site 4,Site 5
Year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
1,Mean,2162.82,17.31,17.17,18.27,17.72,17.6,225.8,222.47,226.91,227.06,228.39
1,Variance,216944.79,23.65,20.35,54.16,22.59,34.06,93137.48,90908.02,94454.84,94023.73,94894.82
1,Median,2072.0,17.5,17.4,17.3,17.8,17.3,12.0,12.0,12.0,12.0,13.0
1,Min,1101.0,1.9,2.9,-0.5,2.6,0.9,0.0,0.0,0.0,0.0,0.0
1,Max,4397.0,36.6,31.9,43.0,36.6,39.7,1037.0,1028.0,1041.0,1047.0,1049.0
1,Skewness,1.18,0.09,-0.0,0.45,0.2,0.42,1.07,1.09,1.09,1.08,1.08
1,Kurtosis,2.1,-0.14,-0.33,-0.26,-0.02,0.02,-0.25,-0.19,-0.2,-0.23,-0.23
2,Mean,2145.42,16.72,16.47,17.8,17.06,16.84,221.24,218.81,225.35,223.31,225.08
2,Variance,165227.38,20.09,17.45,49.22,18.96,28.81,91016.81,89239.26,93875.32,92757.19,93671.9
2,Median,2096.0,16.6,16.4,17.2,17.0,16.6,12.0,11.0,11.0,11.0,11.0


### Interactive Plots

#### Hourly Variation Analysis
