In [58]:
import pandas as pd
import os
import torch
from torch.utils.data import Dataset
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader
from torch import nn
import pytorch_lightning as L
from pytorch_lightning import seed_everything, LightningModule, Trainer
import numpy as np

In [59]:
INPUT_SIZE =50
BATCH_SIZE=100
LEARNING_RATE=0.0001
WINDOW_SIZE = 720
LOG_PATH = os.getcwd()
DEVICE = torch.device("mps") if torch.backends.mps.is_available() else  torch.device("cpu")
NUM_WORKERS = os.cpu_count()
print(f'Device:{DEVICE}, Number of workers:{NUM_WORKERS}\nCurrent Dir:{LOG_PATH}')

Device:mps, Number of workers:8
Current Dir:/Volumes/WorkSpace/PythonProjects/timeSerise


In [60]:
DATA_DIR = '/Volumes/Data/Row_Data'
LOAD_FILE = 'kpx_powerDemand_2013_2023_v4.csv'
URL = f'{DATA_DIR}/{LOAD_FILE}'

In [61]:
df_data = pd.read_csv(URL, parse_dates=['date_time'], index_col='date_time')
df_data

Unnamed: 0_level_0,demand,month_1,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,...,hr_20,hr_21,hr_22,hr_23,temp_1,temp_2,temp_3,temp_4,temp_5,temp_6
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-01 00:00:00,60178,True,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
2013-01-01 01:00:00,57862,True,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
2013-01-01 02:00:00,56165,True,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
2013-01-01 03:00:00,55135,True,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
2013-01-01 04:00:00,54450,True,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-31 19:00:00,62902,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
2023-12-31 20:00:00,62061,False,False,False,False,False,False,False,False,False,...,True,False,False,False,True,False,False,False,False,False
2023-12-31 21:00:00,61403,False,False,False,False,False,False,False,False,False,...,False,True,False,False,True,False,False,False,False,False
2023-12-31 22:00:00,61891,False,False,False,False,False,False,False,False,False,...,False,False,True,False,True,False,False,False,False,False


In [None]:
df_data.info()

In [62]:
transformer_demand = MinMaxScaler()
transformer_demand.fit( df_data[['demand']] )

In [63]:
class PowerDemandDataset(Dataset):

    def __init__(self, features, targets=[], window_size=WINDOW_SIZE):
        self.features = features
        self.targets = targets
        self.window_size= window_size
        self.x, self.y = [], []
        # 각 시간간격별 예측시  1시간뒤: x = 0 , 2시간뒤: x = 1, 3시간뒤: x = 2 ...
        #
        # for i in range(len(features) - self.window_size - X):
        #     v = features.iloc[ i: (i + window_size) ].values
        #     self.x.append(v)
        #     self.y.append( target.iloc[ i + window_size + x] )

        
        for i in range(len(features) - self.window_size ):
            v = features.iloc[ i: (i + window_size) ].values
            self.x.append(v)
            self.y.append( features.iloc[ i + window_size ]['demand'] )
            #self.y.append( target[ i + window_size ] )

        # Many to Many output_size = 6
       # for i in range(len(features) - self.window_size - 6 + 1):
       #     self.x.append( features.iloc[ i: (i + window_size) ].values )
       #     a = target[  i + window_size :  i + window_size + 6].values.reshape(-1, 1)
       #     self.y.append( a)

    
    def __getitem__(self, index):
        x = self.x[index].astype(np.float32)
        y = self.y[index].astype(np.float32)
        return x, y

    def __len__(self):
        return len(self.x) 


In [78]:
class PowerDemandDataModule(L.LightningDataModule):
    def __init__(self, data, transformer):
        super().__init__()
        self.df_data = data
        self.transformer = transformer
        
    def setup(self, stage=None):  
        self.train_data = self.df_data.loc['2013-01-01':'2020-12-31']
        self.val_data = self.df_data.loc['2020-01-01':'2022-12-31']   
        self.test_data = self.df_data.loc['2023-01-01':'2023-12-31']

    def get_dataset(self, stage=None):
        self.setup(stage=stage)
        match stage:
            case 'train':
                targets = self.transformer.transform(self.train_data[['demand']])
                return PowerDemandDataset(features=self.train_data , targets=targets) 
            case 'val':
                targets = self.transformer.transform(self.val_data[['demand']])
                return PowerDemandDataset(features=self.val_data , targets=targets)
            case 'test':
                targets = self.transformer.transform(self.test_data[['demand']])
                return PowerDemandDataset(features=self.test_data , targets=targets)
            case _:
                targets = self.transformer.transform(self.train_data[['demand']])
                return PowerDemandDataset(features=self.train_data , targets=targets)   
            
    
    def train_dataloader(self):
        return DataLoader( self.get_dataset('train'), batch_size=BATCH_SIZE )

    def val_dataloader(self):
        return DataLoader( self.get_dataset('val'), batch_size=BATCH_SIZE )

    def test_dataloader(self):
        return DataLoader( self.get_dataset('test'), batch_size=BATCH_SIZE )

    def teardown(self, stage=None):
        import gc
        
        match stage:
            case 'test':
                del self.test_data
            case 'train':
                del self.train_data
            case 'val':
                del self.val_data
            case _:
                del self.test_data
                del self.train_data
                del self.val_data
                
        gc.collect()


            

In [73]:
class PowerDemandPrediction(L.LightningModule):
    def __init__(self, input_size=INPUT_SIZE, output_size=1, hidden_dim=10, n_layers=2, lr=LEARNING_RATE, window_size=WINDOW_SIZE):
        super(PowerDemandPrediction, self).__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.lstm = nn.LSTM(input_size, hidden_dim, n_layers, bidirectional=False, batch_first=True)
        self.fc = nn.Linear(hidden_dim * window_size, output_size)
        self.loss = nn.MSELoss()
        self.flatten = nn.Flatten()
        self.learning_rate = lr
    
    def forward(self, x):
        batch_size = x.size(0)
        hidden = self.init_hidden(batch_size)    
        lstm_out, (hidden, cell) = self.lstm(x, hidden)
        out = self.flatten(lstm_out)
        out = self.fc(out)
        return out

    def init_hidden(self, batch_size):   
        hidden_state = torch.zeros(self.n_layers, batch_size, self.hidden_dim).to(DEVICE)
        cell_state = torch.zeros(self.n_layers, batch_size, self.hidden_dim).to(DEVICE)
        hidden = (hidden_state, cell_state)
        return hidden

    def configure_optimizers(self):
        return {'optimizer': torch.optim.Adam(params=self.parameters(), lr = self.learning_rate) }

    def training_step(self, train_batch, batch_idx):
        features, targets = train_batch
        output = self(features) 
        output = output.view(-1)
        loss = self.loss(output, targets)
        self.log('train_loss', loss, prog_bar=True)
        return {"loss": loss}
            
    def validation_step(self, val_batch, batch_idx):
        features, targets = val_batch
        output = self(features) 
        output = output.view(-1)
        loss = self.loss(output, targets)
        self.log('val_loss', loss, prog_bar=True)

    def predict_step(self, batch, batch_idx):
        features, targets = batch
        result = self(features)
        return result.view(-1).tolist()
        

In [9]:
trainer = L.Trainer( fast_dev_run=True, accelerator='mps', devices=1)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Running in `fast_dev_run` mode: will run the requested loop using 1 batch(es). Logging and checkpointing is suppressed.


In [74]:
trainer = L.Trainer( default_root_dir=LOG_PATH, precision="16", max_epochs=10, accelerator='mps', devices=1)

/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/lightning_fabric/connector.py:571: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [11]:
model = PowerDemandPrediction().to(DEVICE)
model

PowerDemandPrediction(
  (lstm): LSTM(50, 10, num_layers=2, batch_first=True)
  (fc): Linear(in_features=7200, out_features=1, bias=True)
  (loss): MSELoss()
  (flatten): Flatten(start_dim=1, end_dim=-1)
)

In [79]:
dm = PowerDemandDataModule(df_data,transformer_demand)


In [70]:
#let's loop it over single iteration and print the shape and also data
for i, (features,targets) in enumerate(dm.get_dataset('test')):
    print("Size of the features",features.shape)
    print("Printing features:\n", features)
    print("Size of the targets",targets.shape)
    print("Printing targets:\n", targets)
    break

Size of the features (720, 50)
Printing features:
 [[6.4322e+04 1.0000e+00 0.0000e+00 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [6.1879e+04 1.0000e+00 0.0000e+00 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [6.0133e+04 1.0000e+00 0.0000e+00 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 ...
 [7.5472e+04 1.0000e+00 0.0000e+00 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [7.5278e+04 1.0000e+00 0.0000e+00 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [7.4863e+04 1.0000e+00 0.0000e+00 ... 0.0000e+00 0.0000e+00 0.0000e+00]]
Size of the targets ()
Printing targets:
 71110.0


In [13]:
trainer.fit( model, dm)


  | Name    | Type    | Params | Mode 
--------------------------------------------
0 | lstm    | LSTM    | 3.4 K  | train
1 | fc      | Linear  | 7.2 K  | train
2 | loss    | MSELoss | 0      | train
3 | flatten | Flatten | 0      | train
--------------------------------------------
10.6 K    Trainable params
0         Non-trainable params
10.6 K    Total params
0.042     Total estimated model params size (MB)
4         Modules in train mode
0         Modules in eval mode


Sanity Checking: |                                                            | 0/? [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.val_data.demand = self.transformer.transform(self.val_data[['demand']])
/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.train_data.demand = self.transformer.transform(se

Training: |                                                                   | 0/? [00:00<?, ?it/s]



Validation: |                                                                 | 0/? [00:00<?, ?it/s]

Validation: |                                                                 | 0/? [00:00<?, ?it/s]

Validation: |                                                                 | 0/? [00:00<?, ?it/s]

Validation: |                                                                 | 0/? [00:00<?, ?it/s]

Validation: |                                                                 | 0/? [00:00<?, ?it/s]

Validation: |                                                                 | 0/? [00:00<?, ?it/s]

Validation: |                                                                 | 0/? [00:00<?, ?it/s]

Validation: |                                                                 | 0/? [00:00<?, ?it/s]

Validation: |                                                                 | 0/? [00:00<?, ?it/s]

Validation: |                                                                 | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.


In [15]:
PATH = '/Volumes/WorkSpace/PythonProjects/timeSerise/lightning_logs/version_30/checkpoints/epoch=9-step=6950.ckpt'
model = PowerDemandPrediction.load_from_checkpoint(PATH).to(DEVICE)
model.eval()

PowerDemandPrediction(
  (lstm): LSTM(50, 10, num_layers=2, batch_first=True)
  (fc): Linear(in_features=7200, out_features=1, bias=True)
  (loss): MSELoss()
  (flatten): Flatten(start_dim=1, end_dim=-1)
)

In [88]:
dm.teardown('test')

In [89]:
predictions = trainer.predict(model, dm.test_dataloader())

/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Predicting: |                                                                 | 0/? [00:00<?, ?it/s]



In [18]:
predicted_result = sum(predictions, [])

In [None]:
dm.setup('test')
actual_result = []

for i, (features,targets) in enumerate(dm.test_dataloader()):
    targets = targets.to(DEVICE)
    actual_result.extend(targets.view(-1).tolist())

actual_result

In [22]:
df_before = df_data.loc['2023-01-30':'2023-12-30']['demand']
df_before.index = pd.date_range('2023-01-31 00:00:00', '2023-12-31 23:00:00', freq='1h')
df_before

2023-01-31 00:00:00    65069
2023-01-31 01:00:00    62933
2023-01-31 02:00:00    61792
2023-01-31 03:00:00    61302
2023-01-31 04:00:00    62034
                       ...  
2023-12-31 19:00:00    64934
2023-12-31 20:00:00    63817
2023-12-31 21:00:00    62436
2023-12-31 22:00:00    63010
2023-12-31 23:00:00    62981
Freq: h, Name: demand, Length: 8040, dtype: int64

In [35]:
actual_predicted_df = pd.DataFrame(data={"actual":actual_result, "predicted": predicted_result})
inverse_transformed_values = transformer_demand.inverse_transform(actual_predicted_df)
actual_predicted_df["actual"] = inverse_transformed_values[:,[0]]
actual_predicted_df["predicted"] = inverse_transformed_values[:,[1]]

# window_size= 720 #(24 Hour  * 30Day) 을 뺀 일수 
actual_predicted_df.index = pd.date_range('2023-01-31 00:00:00', '2023-12-31 23:00:00', freq='1h')

actual_predicted_df['before'] = df_before
actual_predicted_df

Unnamed: 0,actual,predicted,before
2023-01-31 00:00:00,71109.998834,63787.463787,65069
2023-01-31 01:00:00,68704.999390,61947.727665,62933
2023-01-31 02:00:00,67449.999146,60676.360193,61792
2023-01-31 03:00:00,66689.999856,60479.074662,61302
2023-01-31 04:00:00,66998.000372,60784.379727,62034
...,...,...,...
2023-12-31 19:00:00,62901.999208,58346.138694,64934
2023-12-31 20:00:00,62060.999162,58217.924113,63817
2023-12-31 21:00:00,61402.999795,57203.069167,62436
2023-12-31 22:00:00,61890.999869,56524.867806,63010
