In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader


In [2]:
torch.cuda.get_device_name(torch.cuda.current_device())

'NVIDIA GeForce RTX 3060'

In [3]:
df = pd.read_csv('./Form-714-csv-files-June-2021/Part 3 Schedule 2 - Planning Area Hourly Demand.csv')
respondent_id = pd.read_csv('./Form-714-csv-files-June-2021/Respondent IDs.csv')
good_ids = respondent_id['respondent_id'].unique()[3:]
df = df[df['respondent_id'].isin(good_ids)]
hour_cols = [f'hour{i:02d}' for i in range(1, 25)]
df = df.loc[~(df[hour_cols] == 0).any(axis=1)]
df['timezone'] = df['timezone'].apply(lambda x: x.strip().upper())


In [4]:
timezones, counts = np.unique(df['timezone'].values.astype('str'), return_counts=True)
argsort = np.argsort(counts)[::-1]
print(counts[argsort])
print(timezones[argsort])
for ii in argsort:
    print(timezones[ii], counts[ii])

[130128 127461  73773  67919  49870  40665  38843  38732  11033  10197
   9469   6201   5476   4734   3312   3087   2559   2557   2191   2188
   2152   1658   1649    731    730    730    715    517    363    210
      3      2      1      1]
['EST' 'CST' 'PST' 'MST' 'CDT' 'CPT' 'PDT' 'EDT' 'DST' 'MDT' 'PPT' 'EPT'
 'HST' '2' 'CS' 'AKD' '1' '' 'MS' 'MPP' 'CD' 'EDS' 'AKS' 'MPT' 'EAS' 'CEN'
 'CDS' 'AST' 'E' 'ADT' 'CSR' '433' 'CTR' '206']
EST 130128
CST 127461
PST 73773
MST 67919
CDT 49870
CPT 40665
PDT 38843
EDT 38732
DST 11033
MDT 10197
PPT 9469
EPT 6201
HST 5476
2 4734
CS 3312
AKD 3087
1 2559
 2557
MS 2191
MPP 2188
CD 2152
EDS 1658
AKS 1649
MPT 731
EAS 730
CEN 730
CDS 715
AST 517
E 363
ADT 210
CSR 3
433 2
CTR 1
206 1


In [5]:
timezone_mapping = {
    "EST": "Etc/GMT+5", 
    "CST": "Etc/GMT+6", 
    "CS": "Etc/GMT+6", 
    "PST": "Etc/GMT+8", 
    "MST": "Etc/GMT+7", 
    "HST": "Etc/GMT+10",
    "AKST": "Etc/GMT+9",
    "EDT": "Etc/GMT+4",
    "CDT": "Etc/GMT+5",
    "CD": "Etc/GMT+5",
    "PDT": "Etc/GMT+7",
    "MDT": "Etc/GMT+6",
}
df['timezone'] = df['timezone'].apply(timezone_mapping.get)
df = df.dropna(subset=['timezone'])

In [6]:
timezones, counts = np.unique(df['timezone'].values.astype('str'), return_counts=True)
argsort = np.argsort(counts)[::-1]
print(counts[argsort])
print(timezones[argsort])
for ii in argsort:
    print(timezones[ii], counts[ii])

[182150 140970 106762  73773  38732   5476]
['Etc/GMT+5' 'Etc/GMT+6' 'Etc/GMT+7' 'Etc/GMT+8' 'Etc/GMT+4' 'Etc/GMT+10']
Etc/GMT+5 182150
Etc/GMT+6 140970
Etc/GMT+7 106762
Etc/GMT+8 73773
Etc/GMT+4 38732
Etc/GMT+10 5476


In [18]:
id_vars = ['respondent_id', 'plan_date', 'timezone']
hour_cols = [f'hour{ii:02}' for ii in range(1, 25)]
df = df[hour_cols + id_vars]
rename_dict = {a_col: int(a_col[-2:]) for a_col in hour_cols}
df.rename(columns=rename_dict, inplace=True)
df['plan_date'] = pd.to_datetime(df['plan_date'])
df['plan_date'] = df.apply(
    lambda row: row['plan_date'].tz_localize(
        row['timezone']).tz_convert('UTC'),
    axis=1
)
df = pd.melt(df, 
            id_vars=id_vars,
            value_vars=np.arange(1, 25),
            var_name='hour',
            value_name='load')

# Create local datetime
df['utc_datetime'] = df['plan_date'] + pd.to_timedelta(df['hour'], unit='h')
df = df.dropna(subset=['utc_datetime', 'load'])


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns=rename_dict, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['plan_date'] = pd.to_datetime(df['plan_date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['plan_date'] = df.apply(


In [23]:
df['hour'] = df['utc_datetime'].dt.hour

In [29]:
means = df.groupby(['respondent_id', 'hour'])['load'].mean()
stds = df.groupby(['respondent_id', 'hour'])['load'].std()

In [38]:
means.groupby('hour').std().mean()

9866.438238750005

In [35]:
means.groupby('respondent_id').std().mean()

420.74436857807615

In [None]:
# Data Preparation
def prepare_data(df):
    # Feature engineering
    df['utc_hour'] = df['utc_datetime'].dt.hour
    df['utc_dayofweek'] = df['utc_datetime'].dt.dayofweek
    df['utc_month'] = df['utc_datetime'].dt.month
    df['utc_year'] = df['utc_datetime'].dt.year
    
    # Cyclic encoding
    df['utc_hour_sin'] = np.sin(2 * np.pi * df['utc_hour'] / 24)
    df['utc_hour_cos'] = np.cos(2 * np.pi * df['utc_hour'] / 24)
    df['utc_dayofweek_sin'] = np.sin(2 * np.pi * df['utc_dayofweek'] / 7)
    df['utc_dayofweek_cos'] = np.cos(2 * np.pi * df['utc_dayofweek'] / 7)
    df['utc_month_sin'] = np.sin(2 * np.pi * df['utc_month'] / 12)
    df['utc_month_cos'] = np.cos(2 * np.pi * df['utc_month'] / 12)
    
    # Normalize year
    df['utc_year'] = (df['utc_year'] - df['utc_year'].mean()) / df['utc_year'].std()
    
    # Encode respondents
    df['respondent_id'] = df['respondent_id'].astype('category')
    df['respondent_idx'] = df['respondent_id'].cat.codes
    
    return df


In [8]:
# df_prep = prepare_data(df)

In [9]:
# PyTorch Dataset
class LoadDataset(Dataset):
    def __init__(self, df):
        self.respondent_idx = torch.LongTensor(df['respondent_idx'].values)
        self.features = torch.FloatTensor(df[[
            'utc_hour_sin', 'utc_hour_cos',
            'utc_dayofweek_sin', 'utc_dayofweek_cos',
            'utc_month_sin', 'utc_month_cos',
            'utc_year'
        ]].values)
        self.targets = torch.FloatTensor(df['load'].values)
        
    def __len__(self):
        return len(self.targets)
    
    def __getitem__(self, idx):
        return (
            self.respondent_idx[idx],
            self.features[idx],
            self.targets[idx]
        )


In [None]:
# Neural Network Model
class LoadForecaster(nn.Module):
    def __init__(self, num_respondents, embedding_dim=64, hidden_dims=[128, 64, 32, 16]):
        super().__init__()
        # device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.embedding = nn.Embedding(num_respondents, embedding_dim)
        # self.embedding.to(device)
        self.feature_dim = 7  # 7 temporal features
        
        self.net = nn.Sequential(
            nn.Linear(embedding_dim + self.feature_dim, hidden_dims[0]),
            nn.ReLU(),
            nn.Linear(hidden_dims[0], hidden_dims[1]),
            nn.ReLU(),
            nn.Linear(hidden_dims[1], hidden_dims[2]),
            nn.ReLU(),
            nn.Linear(hidden_dims[2], hidden_dims[3]),
            nn.ReLU(),
            nn.Linear(hidden_dims[3], 1)
        )
        
    def forward(self, respondent_idx, features):
        # print(respondent_idx.device)
        # print(self.embedding.device)
        embedded = self.embedding(respondent_idx)
        combined = torch.cat([embedded, features], dim=1)
        return self.net(combined).squeeze()


In [11]:
# Training Setup
def train_model(df, num_epochs=20, batch_size=1_000_000):
    # Prepare data
    # df = prepare_data(df)
    # df = df.sort_values('utc_datetime')
    
    # Split data
    train_size = int(0.8 * len(df))
    train_df = df.iloc[:train_size]
    val_df = df.iloc[train_size:]
    
    # Create datasets/dataloaders
    train_dataset = LoadDataset(train_df)
    val_dataset = LoadDataset(val_df)
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    
    # Initialize model
    num_respondents = len(df['respondent_id'].cat.categories)
    model = LoadForecaster(num_respondents)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    
    # Training loop
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        for ridx, features, targets in train_loader:
            ridx = ridx.to(device)
            features = features.to(device)
            targets = targets.to(device)
            optimizer.zero_grad()
            outputs = model(ridx, features)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * ridx.size(0)
        
        # Validation
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for ridx, features, targets in val_loader:
                ridx = ridx.to(device)
                features = features.to(device)
                targets = targets.to(device)
                outputs = model(ridx, features)
                val_loss += criterion(outputs, targets).item() * ridx.size(0)
        
        # Print statistics
        train_loss = train_loss / len(train_dataset)
        val_loss = val_loss / len(val_dataset)
        print(f'Epoch {epoch+1:2} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}')
    
    return model

In [12]:
df_prep = prepare_data(df)
df_prep = df_prep.sort_values('utc_datetime')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns=rename_dict, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['plan_date'] = pd.to_datetime(df['plan_date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['plan_date'] = df.apply(


In [13]:
model = train_model(df_prep)

Epoch  1 | Train Loss: 144180872.7178 | Val Loss: 164716475.0125
Epoch  2 | Train Loss: 144179126.6507 | Val Loss: 164712984.2106
Epoch  3 | Train Loss: 144172274.9683 | Val Loss: 164698673.2960
Epoch  4 | Train Loss: 144145532.8302 | Val Loss: 164646598.2056
Epoch  5 | Train Loss: 144053899.7229 | Val Loss: 164480464.7528
Epoch  6 | Train Loss: 143778018.4159 | Val Loss: 164015687.8396
Epoch  7 | Train Loss: 143042406.7607 | Val Loss: 162840644.1520
Epoch  8 | Train Loss: 141275213.6066 | Val Loss: 160155465.6589
Epoch  9 | Train Loss: 137460103.2774 | Val Loss: 154628098.4353
Epoch 10 | Train Loss: 130113458.0864 | Val Loss: 144501121.2649
Epoch 11 | Train Loss: 117893168.5612 | Val Loss: 128462440.3824
Epoch 12 | Train Loss: 100929617.7755 | Val Loss: 107436421.4132
Epoch 13 | Train Loss: 82393959.6603 | Val Loss: 85979357.5755
Epoch 14 | Train Loss: 66505885.7505 | Val Loss: 69083856.0508
Epoch 15 | Train Loss: 53770160.3376 | Val Loss: 56596217.6811
Epoch 16 | Train Loss: 42839246

In [None]:
# torch.save(model.state_dict(), 'load_forecaster_model.pth')