---
# Setup
---

**Add project_files to system path**

In [1]:
import sys
if not 'Time-Series-Forcasting-Group3' in sys.path:
    sys.path += ['Time-Series-Forcasting-Group3']
    
sys.path

['/home/jovyan/new_mohamed/Final-Time-Series-Forecasting',
 '/opt/conda/lib/python39.zip',
 '/opt/conda/lib/python3.9',
 '/opt/conda/lib/python3.9/lib-dynload',
 '',
 '/opt/conda/lib/python3.9/site-packages',
 'Time-Series-Forcasting-Group3']

**Important library**

In [2]:
import torch
from torch.utils.data import DataLoader
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np 
import os
from exp.exp_PatchTST import Exp_Main

In [4]:
class dotdict(dict):
    """dot.notation access to dictionary attributes"""
    __getattr__ = dict.get
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__

---
# Working on ETTh1 Dataset
---

## Trail 1: PatchTST, Dataset:ETTh1,  Metric: 96
### Set hyperparameters
Set some parameters (Args) for the our experiment like dictionary


In [5]:
"""
    **dotdict function**
    This function is used to convert a dictionary into
    an object whose keys can be accessed as attributes
"""

args = dotdict()

args.model = 'PatchTST'
args.random_seed = 2021
args.is_training = 1
args.model_id = f"{args.data}_{args.seq_len}_{args.pred_len}"
args.fc_dropout = 0.3
args.head_dropout = 0
args.patch_len = 16
args.stride = 8
args.batch_size = 128
args.learning_rate = 0.0001


args.use_multi_gpu = False
args.use_gpu = True if torch.cuda.is_available() else False
args.learning_rate = 0.005
args.pred_len = 96 # prediction sequence length
args.label_len = 48 # start token length of PatchTST decoder
args.use_amp = False # whether to use automatic mixed precision training
args.output_attention = False # whether to output attention in ecoder
args.features = 'M' # forecasting task, options:[M, S, MS]; M:multivariate predict multivariate, S:univariate predict univariate, MS:multivariate predict univariate
args.train_only=True
args.checkpoints = './Checkpoints/PatchTST_checkpoints' # location of model checkpoints
args.patience = 3
args.train_epochs =100# 6

args.data = 'ETTh1'  # data
args.root_path = './Datasets/' # root path of data file
args.data_path = 'ETTh1.csv' # data file
args.target = 'OT' # target feature in S or MS task
args.freq = 'h' # freq for time features encoding, options:[s:secondly, t:minutely, h:hourly, d:daily, b:business days, w:weekly, m:monthly], you can also use more detailed freq like 15min or 3h
args.seq_len = 336 # input sequence length of PatchTST encoder

# PatchTST decoder input: concat[start token series(label_len), zero padding series(pred_len)]
args.enc_in = 7 # encoder input size
args.dec_in = 7 # decoder input size
args.c_out = 7 # output size
args.factor = 5 # probsparse attn factor
args.d_model =16# 512 # dimension of model
args.n_heads = 4#8 # num of heads
args.e_layers = 3#2 # num of encoder layers
args.d_layers = 1 # num of decoder layers
args.d_ff = 128#2048 # dimension of fcn in model
args.dropout =0.3# 0.05 # dropout
args.attn = 'prob' # attention used in encoder, options:[prob, full]
args.embed = 'timeF' # time features encoding, options:[timeF, fixed, learned]
args.activation = 'gelu' # activation
args.distil = True # whether to use distilling in encoder
args.mix = True
args.padding = 0
# args.freq = 'h'
args.batch_size = 32 
args.loss = 'mse'
args.lradj = 'type1'
args.num_workers = 0
args.itr = 1
args.des = "Exp"#'exp'
args.gpu = 0
args.devices = '0,1,2,3'

In [6]:
args.use_gpu = True if torch.cuda.is_available() and args.use_gpu else False
if args.use_gpu and args.use_multi_gpu:
    args.devices = args.devices.replace(' ','')
    device_ids = args.devices.split(',')
    args.device_ids = [int(id_) for id_ in device_ids]
    args.gpu = args.device_ids[0]
    
print("Hyperparameter Combination of Trail 1: \n") 
print(args)

Hyperparameter Combination of Trail 1: 

{'model': 'PatchTST', 'random_seed': 2021, 'is_training': 1, 'model_id': 'None_None_None', 'fc_dropout': 0.3, 'head_dropout': 0, 'patch_len': 16, 'stride': 8, 'batch_size': 32, 'learning_rate': 0.005, 'use_multi_gpu': False, 'use_gpu': True, 'pred_len': 96, 'label_len': 48, 'use_amp': False, 'output_attention': False, 'features': 'M', 'train_only': True, 'checkpoints': './Checkpoints/PatchTST_checkpoints', 'patience': 3, 'train_epochs': 100, 'data': 'ETTh1', 'root_path': './Datasets/', 'data_path': 'ETTh1.csv', 'target': 'OT', 'freq': 'h', 'seq_len': 336, 'enc_in': 7, 'dec_in': 7, 'c_out': 7, 'factor': 5, 'd_model': 16, 'n_heads': 4, 'e_layers': 3, 'd_layers': 1, 'd_ff': 128, 'dropout': 0.3, 'attn': 'prob', 'embed': 'timeF', 'activation': 'gelu', 'distil': True, 'mix': True, 'padding': 0, 'loss': 'mse', 'lradj': 'type1', 'num_workers': 0, 'itr': 1, 'des': 'Exp', 'gpu': 0, 'devices': '0,1,2,3'}


### Training

In [10]:
Exp = Exp_Main
setting=f'PatchTST_train_on_{args.data}_{args.pred_len}'
# set experiments
exp = Exp(args)
exp.train(setting)

Use GPU: cuda:0
train 8209
val 2785
test 2785
	iters: 100, epoch: 1 | loss: 0.5151863
	speed: 0.0759s/iter; left time: 1936.5192s
	iters: 200, epoch: 1 | loss: 0.4441786
	speed: 0.0671s/iter; left time: 1704.7053s
Epoch: 1 cost time: 18.41019082069397
Epoch: 1, Steps: 256 | Train Loss: 0.5304114 Vali Loss: 0.7279106 Test Loss: 0.4189521
Validation loss decreased (inf --> 0.727911).  Saving model ...
Updating learning rate to 0.005
	iters: 100, epoch: 2 | loss: 0.4038369
	speed: 0.1581s/iter; left time: 3991.8206s
	iters: 200, epoch: 2 | loss: 0.4288949
	speed: 0.0710s/iter; left time: 1786.2780s
Epoch: 2 cost time: 17.306489944458008
Epoch: 2, Steps: 256 | Train Loss: 0.4001373 Vali Loss: 0.7182564 Test Loss: 0.4168549
Validation loss decreased (0.727911 --> 0.718256).  Saving model ...
Updating learning rate to 0.0025
	iters: 100, epoch: 3 | loss: 0.3390920
	speed: 0.1779s/iter; left time: 4445.9986s
	iters: 200, epoch: 3 | loss: 0.2963939
	speed: 0.0740s/iter; left time: 1842.7161s
E

Model(
  (model): PatchTST_backbone(
    (backbone): TSTiEncoder(
      (W_P): Linear(in_features=16, out_features=16, bias=True)
      (dropout): Dropout(p=0.3, inplace=False)
      (encoder): TSTEncoder(
        (layers): ModuleList(
          (0-2): 3 x TSTEncoderLayer(
            (self_attn): _MultiheadAttention(
              (W_Q): Linear(in_features=16, out_features=16, bias=True)
              (W_K): Linear(in_features=16, out_features=16, bias=True)
              (W_V): Linear(in_features=16, out_features=16, bias=True)
              (sdp_attn): _ScaledDotProductAttention(
                (attn_dropout): Dropout(p=0.0, inplace=False)
              )
              (to_out): Sequential(
                (0): Linear(in_features=16, out_features=16, bias=True)
                (1): Dropout(p=0.3, inplace=False)
              )
            )
            (dropout_attn): Dropout(p=0.3, inplace=False)
            (norm_attn): Sequential(
              (0): Transpose()
              (1)

### Testing

In [13]:
exp.test(setting)
torch.cuda.empty_cache()

test 2785
mse:0.3932727575302124, mae:0.4153849184513092, rse:0.5956631302833557


---
## Trail 2: PatchTST, Dataset:ETTh1 , Metric: 192
### Set hyperparameters
Set some parameters (Args) for the our experiment like dictionary

In [15]:
args.pred_len = 192 # prediction sequence length
print(f"Dataset: {args.data},  Prediction Length : {args.pred_len}") 
# print(args)

Dataset: ETTh1,  Prediction Length : 192


### Training

In [16]:
Exp = Exp_Main
args.pred_len = 192
setting=f'PatchTST_train_on_{args.data}_{args.pred_len}'
# set experiments
exp = Exp(args)
exp.train(setting)

Use GPU: cuda:0
train 8113
val 2689
test 2689
	iters: 100, epoch: 1 | loss: 0.4972065
	speed: 0.0748s/iter; left time: 1884.9088s
	iters: 200, epoch: 1 | loss: 0.4336420
	speed: 0.0728s/iter; left time: 1827.9767s
Epoch: 1 cost time: 18.494192838668823
Epoch: 1, Steps: 253 | Train Loss: 0.5782521 Vali Loss: 0.9129704 Test Loss: 0.4498765
Validation loss decreased (inf --> 0.912970).  Saving model ...
Updating learning rate to 0.005
	iters: 100, epoch: 2 | loss: 0.4021264
	speed: 0.1832s/iter; left time: 4569.8993s
	iters: 200, epoch: 2 | loss: 0.4054012
	speed: 0.0658s/iter; left time: 1635.9973s
Epoch: 2 cost time: 17.123370885849
Epoch: 2, Steps: 253 | Train Loss: 0.4446002 Vali Loss: 1.1366206 Test Loss: 0.4531257
EarlyStopping counter: 1 out of 3
Updating learning rate to 0.0025
	iters: 100, epoch: 3 | loss: 0.3814523
	speed: 0.1810s/iter; left time: 4469.4422s
	iters: 200, epoch: 3 | loss: 0.4221437
	speed: 0.0712s/iter; left time: 1751.7776s
Epoch: 3 cost time: 17.890690326690674

Model(
  (model): PatchTST_backbone(
    (backbone): TSTiEncoder(
      (W_P): Linear(in_features=16, out_features=16, bias=True)
      (dropout): Dropout(p=0.3, inplace=False)
      (encoder): TSTEncoder(
        (layers): ModuleList(
          (0-2): 3 x TSTEncoderLayer(
            (self_attn): _MultiheadAttention(
              (W_Q): Linear(in_features=16, out_features=16, bias=True)
              (W_K): Linear(in_features=16, out_features=16, bias=True)
              (W_V): Linear(in_features=16, out_features=16, bias=True)
              (sdp_attn): _ScaledDotProductAttention(
                (attn_dropout): Dropout(p=0.0, inplace=False)
              )
              (to_out): Sequential(
                (0): Linear(in_features=16, out_features=16, bias=True)
                (1): Dropout(p=0.3, inplace=False)
              )
            )
            (dropout_attn): Dropout(p=0.3, inplace=False)
            (norm_attn): Sequential(
              (0): Transpose()
              (1)

### Testing

In [17]:
exp.test(setting)
torch.cuda.empty_cache()

test 2689
mse:0.44987642765045166, mae:0.4546157121658325, rse:0.6369397640228271


---
## Trail 3: PatchTST, Dataset:ETTh1,  Metric: 336

### Set hyperparameters
Set some parameters (Args) for the our experiment like dictionary


In [18]:
args.pred_len = 336 # prediction sequence length
print(f"Dataset: {args.data},  Prediction Length : {args.pred_len}") 
# print(args)

Dataset: ETTh1,  Prediction Length : 336


### Training

In [19]:
Exp = Exp_Main
args.pred_len = 3
setting=f'PatchTST_train_on_{args.data}_{args.pred_len}'
# set experiments
exp = Exp(args)
exp.train(setting)

Use GPU: cuda:0
train 8302
val 2878
test 2878
	iters: 100, epoch: 1 | loss: 0.3344718
	speed: 0.0694s/iter; left time: 1789.7446s
	iters: 200, epoch: 1 | loss: 0.3247386
	speed: 0.0660s/iter; left time: 1697.1177s
Epoch: 1 cost time: 17.75670599937439
Epoch: 1, Steps: 259 | Train Loss: 0.4408747 Vali Loss: 0.3721916 Test Loss: 0.3404652
Validation loss decreased (inf --> 0.372192).  Saving model ...
Updating learning rate to 0.005
	iters: 100, epoch: 2 | loss: 0.2608819
	speed: 0.1612s/iter; left time: 4118.5023s
	iters: 200, epoch: 2 | loss: 0.2351755
	speed: 0.0708s/iter; left time: 1801.2028s
Epoch: 2 cost time: 17.503792762756348
Epoch: 2, Steps: 259 | Train Loss: 0.2291536 Vali Loss: 0.2295149 Test Loss: 0.2050908
Validation loss decreased (0.372192 --> 0.229515).  Saving model ...
Updating learning rate to 0.0025
	iters: 100, epoch: 3 | loss: 0.1918993
	speed: 0.1670s/iter; left time: 4222.5122s
	iters: 200, epoch: 3 | loss: 0.1933122
	speed: 0.0689s/iter; left time: 1736.1021s
E

Model(
  (model): PatchTST_backbone(
    (backbone): TSTiEncoder(
      (W_P): Linear(in_features=16, out_features=16, bias=True)
      (dropout): Dropout(p=0.3, inplace=False)
      (encoder): TSTEncoder(
        (layers): ModuleList(
          (0-2): 3 x TSTEncoderLayer(
            (self_attn): _MultiheadAttention(
              (W_Q): Linear(in_features=16, out_features=16, bias=True)
              (W_K): Linear(in_features=16, out_features=16, bias=True)
              (W_V): Linear(in_features=16, out_features=16, bias=True)
              (sdp_attn): _ScaledDotProductAttention(
                (attn_dropout): Dropout(p=0.0, inplace=False)
              )
              (to_out): Sequential(
                (0): Linear(in_features=16, out_features=16, bias=True)
                (1): Dropout(p=0.3, inplace=False)
              )
            )
            (dropout_attn): Dropout(p=0.3, inplace=False)
            (norm_attn): Sequential(
              (0): Transpose()
              (1)

### Testing

In [20]:
exp.test(setting)
torch.cuda.empty_cache()

test 2878
mse:0.17235438525676727, mae:0.27333182096481323, rse:0.39380258321762085


---
## Trail 4: PatchTST, Dataset:ETTh1,  Metric: 720

### Set hyperparameters
Set some parameters (Args) for the our experiment like dictionary


In [21]:
args.pred_len = 720 # prediction sequence length
print(f"Dataset: {args.data},  Prediction Length : {args.pred_len}") 
# print(args)

Dataset: ETTh1,  Prediction Length : 720


### Training

In [22]:
Exp = Exp_Main
args.pred_len = 4
setting=f'PatchTST_train_on_{args.data}_{args.pred_len}'
# set experiments
exp = Exp(args)
exp.train(setting)

Use GPU: cuda:0
train 8301
val 2877
test 2877
	iters: 100, epoch: 1 | loss: 0.5212657
	speed: 0.0661s/iter; left time: 1706.5893s
	iters: 200, epoch: 1 | loss: 0.3030730
	speed: 0.0708s/iter; left time: 1820.6244s
Epoch: 1 cost time: 17.715425729751587
Epoch: 1, Steps: 259 | Train Loss: 0.4712917 Vali Loss: 0.3448377 Test Loss: 0.3140948
Validation loss decreased (inf --> 0.344838).  Saving model ...
Updating learning rate to 0.005
	iters: 100, epoch: 2 | loss: 0.1874618
	speed: 0.1680s/iter; left time: 4291.2789s
	iters: 200, epoch: 2 | loss: 0.2577136
	speed: 0.0650s/iter; left time: 1653.7612s
Epoch: 2 cost time: 17.31331515312195
Epoch: 2, Steps: 259 | Train Loss: 0.2389915 Vali Loss: 0.2476892 Test Loss: 0.2330408
Validation loss decreased (0.344838 --> 0.247689).  Saving model ...
Updating learning rate to 0.0025
	iters: 100, epoch: 3 | loss: 0.2026190
	speed: 0.1532s/iter; left time: 3872.1060s
	iters: 200, epoch: 3 | loss: 0.1800532
	speed: 0.0689s/iter; left time: 1734.6719s
E

Model(
  (model): PatchTST_backbone(
    (backbone): TSTiEncoder(
      (W_P): Linear(in_features=16, out_features=16, bias=True)
      (dropout): Dropout(p=0.3, inplace=False)
      (encoder): TSTEncoder(
        (layers): ModuleList(
          (0-2): 3 x TSTEncoderLayer(
            (self_attn): _MultiheadAttention(
              (W_Q): Linear(in_features=16, out_features=16, bias=True)
              (W_K): Linear(in_features=16, out_features=16, bias=True)
              (W_V): Linear(in_features=16, out_features=16, bias=True)
              (sdp_attn): _ScaledDotProductAttention(
                (attn_dropout): Dropout(p=0.0, inplace=False)
              )
              (to_out): Sequential(
                (0): Linear(in_features=16, out_features=16, bias=True)
                (1): Dropout(p=0.3, inplace=False)
              )
            )
            (dropout_attn): Dropout(p=0.3, inplace=False)
            (norm_attn): Sequential(
              (0): Transpose()
              (1)

### Testing

In [23]:
exp.test(setting)
torch.cuda.empty_cache()

test 2877
mse:0.19844816625118256, mae:0.2914038896560669, rse:0.42253634333610535


### Conclusion
The training process is progressing well and the `PatchTST` model is being optimized in an effective manner. 
- The loss is steadily decreasing over the epochs, which indicates the model is learning and improving. 
- The validation loss is also decreasing at each epoch, showing the model is generalizing well.
- The training speed (secs/iter) improves over time, likely due to optimizations in the model and dropout of neurons. This indicates the model is becoming more efficient as it trains.

**The key positive signs I see are:**

- Decreasing loss and validation loss
- Improving training speed over time
- Learning rate decay schedule


---
# Working on ETTh2 Dataset
---

## Trail 1: PatchTST, Dataset:ETTh2,  Metric: 96
### Set hyperparameters
Set some parameters (Args) for the our experiment like dictionary


In [42]:
args.data_path = 'ETTh2.csv' # data file
args.data = 'ETTh2'  # data
args.pred_len = 96 # prediction sequence length

print(f"Dataset: {args.data},  Prediction Length : {args.pred_len}") 
# print(args)

Dataset: ETTh2,  Prediction Length : 96


### Training

In [25]:
Exp = Exp_Main
setting=f'PatchTST_train_on_{args.data}_{args.pred_len}'
# set experiments
exp = Exp(args)
exp.train(setting)

Use GPU: cuda:0
train 8209
val 2785
test 2785
	iters: 100, epoch: 1 | loss: 0.4108657
	speed: 0.0710s/iter; left time: 1809.9244s
	iters: 200, epoch: 1 | loss: 0.3111556
	speed: 0.0652s/iter; left time: 1656.1540s
Epoch: 1 cost time: 17.398212432861328
Epoch: 1, Steps: 256 | Train Loss: 0.5997753 Vali Loss: 0.2730317 Test Loss: 0.3700201
Validation loss decreased (inf --> 0.273032).  Saving model ...
Updating learning rate to 0.005
	iters: 100, epoch: 2 | loss: 0.5012991
	speed: 0.1698s/iter; left time: 4286.6351s
	iters: 200, epoch: 2 | loss: 0.5826704
	speed: 0.0661s/iter; left time: 1661.5996s
Epoch: 2 cost time: 17.30525517463684
Epoch: 2, Steps: 256 | Train Loss: 0.4644897 Vali Loss: 0.2412224 Test Loss: 0.3228672
Validation loss decreased (0.273032 --> 0.241222).  Saving model ...
Updating learning rate to 0.0025
	iters: 100, epoch: 3 | loss: 0.2503873
	speed: 0.1681s/iter; left time: 4200.1778s
	iters: 200, epoch: 3 | loss: 0.3579534
	speed: 0.0699s/iter; left time: 1740.4312s
E

Model(
  (model): PatchTST_backbone(
    (backbone): TSTiEncoder(
      (W_P): Linear(in_features=16, out_features=16, bias=True)
      (dropout): Dropout(p=0.3, inplace=False)
      (encoder): TSTEncoder(
        (layers): ModuleList(
          (0-2): 3 x TSTEncoderLayer(
            (self_attn): _MultiheadAttention(
              (W_Q): Linear(in_features=16, out_features=16, bias=True)
              (W_K): Linear(in_features=16, out_features=16, bias=True)
              (W_V): Linear(in_features=16, out_features=16, bias=True)
              (sdp_attn): _ScaledDotProductAttention(
                (attn_dropout): Dropout(p=0.0, inplace=False)
              )
              (to_out): Sequential(
                (0): Linear(in_features=16, out_features=16, bias=True)
                (1): Dropout(p=0.3, inplace=False)
              )
            )
            (dropout_attn): Dropout(p=0.3, inplace=False)
            (norm_attn): Sequential(
              (0): Transpose()
              (1)

### Testing

In [26]:
exp.test(setting)
torch.cuda.empty_cache()

test 2785
mse:0.3228672444820404, mae:0.3903789520263672, rse:0.4578811526298523


---
## Trail 2: PatchTST,  Metric: 192
### Set hyperparameters
Set some parameters (Args) for the our experiment like dictionary

In [27]:
args.pred_len = 192 # prediction sequence length
print(f"Dataset: {args.data},  Prediction Length : {args.pred_len}") 
# print(args)

Dataset: ETTh2,  Prediction Length : 192


### Training

In [28]:
Exp = Exp_Main
setting=f'PatchTST_train_on_{args.data}_{args.pred_len}'
# set experiments
exp = Exp(args)
exp.train(setting)

Use GPU: cuda:0
train 8113
val 2689
test 2689
	iters: 100, epoch: 1 | loss: 0.5860826
	speed: 0.0696s/iter; left time: 1753.4439s
	iters: 200, epoch: 1 | loss: 0.5593444
	speed: 0.0772s/iter; left time: 1937.7453s
Epoch: 1 cost time: 17.95907950401306
Epoch: 1, Steps: 253 | Train Loss: 0.6732758 Vali Loss: 0.3943664 Test Loss: 0.5854144
Validation loss decreased (inf --> 0.394366).  Saving model ...
Updating learning rate to 0.005
	iters: 100, epoch: 2 | loss: 0.6403229
	speed: 0.1818s/iter; left time: 4534.9218s
	iters: 200, epoch: 2 | loss: 0.3194021
	speed: 0.0781s/iter; left time: 1941.4232s
Epoch: 2 cost time: 18.829684495925903
Epoch: 2, Steps: 253 | Train Loss: 0.5332889 Vali Loss: 0.3651311 Test Loss: 0.4719868
Validation loss decreased (0.394366 --> 0.365131).  Saving model ...
Updating learning rate to 0.0025
	iters: 100, epoch: 3 | loss: 0.5341831
	speed: 0.1541s/iter; left time: 3804.8737s
	iters: 200, epoch: 3 | loss: 0.2848637
	speed: 0.0768s/iter; left time: 1888.4022s
E

Model(
  (model): PatchTST_backbone(
    (backbone): TSTiEncoder(
      (W_P): Linear(in_features=16, out_features=16, bias=True)
      (dropout): Dropout(p=0.3, inplace=False)
      (encoder): TSTEncoder(
        (layers): ModuleList(
          (0-2): 3 x TSTEncoderLayer(
            (self_attn): _MultiheadAttention(
              (W_Q): Linear(in_features=16, out_features=16, bias=True)
              (W_K): Linear(in_features=16, out_features=16, bias=True)
              (W_V): Linear(in_features=16, out_features=16, bias=True)
              (sdp_attn): _ScaledDotProductAttention(
                (attn_dropout): Dropout(p=0.0, inplace=False)
              )
              (to_out): Sequential(
                (0): Linear(in_features=16, out_features=16, bias=True)
                (1): Dropout(p=0.3, inplace=False)
              )
            )
            (dropout_attn): Dropout(p=0.3, inplace=False)
            (norm_attn): Sequential(
              (0): Transpose()
              (1)

### Testing

In [29]:
exp.test(setting)
torch.cuda.empty_cache()

test 2689
mse:0.41376355290412903, mae:0.43530601263046265, rse:0.5157955884933472


---
## Trail 3: PatchTST, Dataset:ETTh2,  Metric: 336

### Set hyperparameters
Set some parameters (Args) for the our experiment like dictionary


In [30]:
args.pred_len = 336 # prediction sequence length
print(f"Dataset: {args.data},  Prediction Length : {args.pred_len}") 
# print(args)

Dataset: ETTh2,  Prediction Length : 336


### Training

In [31]:
Exp = Exp_Main
setting=f'PatchTST_train_on_{args.data}_{args.pred_len}'
# set experiments
exp = Exp(args)
exp.train(setting)

Use GPU: cuda:0
train 7969
val 2545
test 2545
	iters: 100, epoch: 1 | loss: 0.4255295
	speed: 0.0736s/iter; left time: 1825.5453s
	iters: 200, epoch: 1 | loss: 0.8666487
	speed: 0.0592s/iter; left time: 1462.5349s
Epoch: 1 cost time: 16.672760009765625
Epoch: 1, Steps: 249 | Train Loss: 0.7144450 Vali Loss: 0.4767845 Test Loss: 0.5395549
Validation loss decreased (inf --> 0.476785).  Saving model ...
Updating learning rate to 0.005
	iters: 100, epoch: 2 | loss: 0.5763573
	speed: 0.1889s/iter; left time: 4638.8387s
	iters: 200, epoch: 2 | loss: 0.5859736
	speed: 0.0609s/iter; left time: 1488.1450s
Epoch: 2 cost time: 17.194735288619995
Epoch: 2, Steps: 249 | Train Loss: 0.6155907 Vali Loss: 0.4994032 Test Loss: 0.6496858
EarlyStopping counter: 1 out of 3
Updating learning rate to 0.0025
	iters: 100, epoch: 3 | loss: 0.6061680
	speed: 0.1642s/iter; left time: 3989.3705s
	iters: 200, epoch: 3 | loss: 0.7482429
	speed: 0.0700s/iter; left time: 1694.9301s
Epoch: 3 cost time: 17.705214262008

Model(
  (model): PatchTST_backbone(
    (backbone): TSTiEncoder(
      (W_P): Linear(in_features=16, out_features=16, bias=True)
      (dropout): Dropout(p=0.3, inplace=False)
      (encoder): TSTEncoder(
        (layers): ModuleList(
          (0-2): 3 x TSTEncoderLayer(
            (self_attn): _MultiheadAttention(
              (W_Q): Linear(in_features=16, out_features=16, bias=True)
              (W_K): Linear(in_features=16, out_features=16, bias=True)
              (W_V): Linear(in_features=16, out_features=16, bias=True)
              (sdp_attn): _ScaledDotProductAttention(
                (attn_dropout): Dropout(p=0.0, inplace=False)
              )
              (to_out): Sequential(
                (0): Linear(in_features=16, out_features=16, bias=True)
                (1): Dropout(p=0.3, inplace=False)
              )
            )
            (dropout_attn): Dropout(p=0.3, inplace=False)
            (norm_attn): Sequential(
              (0): Transpose()
              (1)

### Testing

In [32]:
exp.test(setting)
torch.cuda.empty_cache()

test 2545
mse:0.539555013179779, mae:0.5184375643730164, rse:0.5870471000671387


---
## Trail 4: PatchTST, Dataset:ETTh2,  Metric: 720

### Set hyperparameters
Set some parameters (Args) for the our experiment like dictionary


In [33]:
args.pred_len = 720 # prediction sequence length
print(f"Dataset: {args.data},  Prediction Length : {args.pred_len}") 
# print(args)

Dataset: ETTh2,  Prediction Length : 720


### Training

In [34]:
Exp = Exp_Main
setting=f'PatchTST_train_on_{args.data}_{args.pred_len}'
# set experiments
exp = Exp(args)
exp.train(setting)

Use GPU: cuda:0
train 7585
val 2161
test 2161
	iters: 100, epoch: 1 | loss: 0.7683773
	speed: 0.0726s/iter; left time: 1714.1322s
	iters: 200, epoch: 1 | loss: 0.7943283
	speed: 0.0760s/iter; left time: 1785.6630s
Epoch: 1 cost time: 17.07037663459778
Epoch: 1, Steps: 237 | Train Loss: 0.8332328 Vali Loss: 0.6838762 Test Loss: 0.7464288
Validation loss decreased (inf --> 0.683876).  Saving model ...
Updating learning rate to 0.005
	iters: 100, epoch: 2 | loss: 0.8648713
	speed: 0.1650s/iter; left time: 3855.5516s
	iters: 200, epoch: 2 | loss: 0.6552268
	speed: 0.0740s/iter; left time: 1722.4079s
Epoch: 2 cost time: 17.992148399353027
Epoch: 2, Steps: 237 | Train Loss: 0.7202576 Vali Loss: 0.6711750 Test Loss: 0.7515456
Validation loss decreased (0.683876 --> 0.671175).  Saving model ...
Updating learning rate to 0.0025
	iters: 100, epoch: 3 | loss: 0.6566624
	speed: 0.1550s/iter; left time: 3584.3560s
	iters: 200, epoch: 3 | loss: 0.5715491
	speed: 0.0791s/iter; left time: 1821.2650s
E

Model(
  (model): PatchTST_backbone(
    (backbone): TSTiEncoder(
      (W_P): Linear(in_features=16, out_features=16, bias=True)
      (dropout): Dropout(p=0.3, inplace=False)
      (encoder): TSTEncoder(
        (layers): ModuleList(
          (0-2): 3 x TSTEncoderLayer(
            (self_attn): _MultiheadAttention(
              (W_Q): Linear(in_features=16, out_features=16, bias=True)
              (W_K): Linear(in_features=16, out_features=16, bias=True)
              (W_V): Linear(in_features=16, out_features=16, bias=True)
              (sdp_attn): _ScaledDotProductAttention(
                (attn_dropout): Dropout(p=0.0, inplace=False)
              )
              (to_out): Sequential(
                (0): Linear(in_features=16, out_features=16, bias=True)
                (1): Dropout(p=0.3, inplace=False)
              )
            )
            (dropout_attn): Dropout(p=0.3, inplace=False)
            (norm_attn): Sequential(
              (0): Transpose()
              (1)

### Testing

In [35]:
exp.test(setting)
torch.cuda.empty_cache()

test 2161
mse:0.7515456080436707, mae:0.6049982309341431, rse:0.6930935382843018


---
# Working on ETTm1 Dataset
---

## Trail 1: PatchTST, Dataset:ETTm1,  Metric: 96
### Set hyperparameters
Set some parameters (Args) for the our experiment like dictionary


In [43]:
args.data_path = 'ETTm1.csv' # data file
args.data = 'ETTm1'  # data
args.pred_len = 96 # prediction sequence length
args.n_heads = 16 
args.d_model = 128 
args.d_ff = 256 
args.dropout = 0.2
args.fc_dropout = 0.2
args.patience = 20
args.lradj = 'TST'
args.pct_start = 0.4

print(f"Dataset: {args.data},  Prediction Length : {args.pred_len}") 
# print(args)

Dataset: ETTm1,  Prediction Length : 96


### Training

In [44]:
Exp = Exp_Main
setting=f'PatchTST_train_on_{args.data}_{args.pred_len}'
# set experiments
exp = Exp(args)
exp.train(setting)

Use GPU: cuda:0
train 34129
val 11425
test 11425
	iters: 100, epoch: 1 | loss: 0.3275522
	speed: 0.0736s/iter; left time: 7834.3638s
	iters: 200, epoch: 1 | loss: 0.2828250
	speed: 0.0780s/iter; left time: 8304.0682s
	iters: 300, epoch: 1 | loss: 0.2911699
	speed: 0.0772s/iter; left time: 8203.4025s
	iters: 400, epoch: 1 | loss: 0.3137389
	speed: 0.0748s/iter; left time: 7947.4658s
	iters: 500, epoch: 1 | loss: 0.2675519
	speed: 0.0760s/iter; left time: 8060.7817s
	iters: 600, epoch: 1 | loss: 0.2635594
	speed: 0.0802s/iter; left time: 8500.1411s
	iters: 700, epoch: 1 | loss: 0.2640387
	speed: 0.0719s/iter; left time: 7613.3111s
	iters: 800, epoch: 1 | loss: 0.3039788
	speed: 0.0731s/iter; left time: 7735.0784s
	iters: 900, epoch: 1 | loss: 0.3141021
	speed: 0.0739s/iter; left time: 7814.4045s
	iters: 1000, epoch: 1 | loss: 0.3125711
	speed: 0.0781s/iter; left time: 8248.5720s
Epoch: 1 cost time: 80.67248821258545
Epoch: 1, Steps: 1066 | Train Loss: 0.3115152 Vali Loss: 0.4127135 Test 

Model(
  (model): PatchTST_backbone(
    (backbone): TSTiEncoder(
      (W_P): Linear(in_features=16, out_features=128, bias=True)
      (dropout): Dropout(p=0.2, inplace=False)
      (encoder): TSTEncoder(
        (layers): ModuleList(
          (0-2): 3 x TSTEncoderLayer(
            (self_attn): _MultiheadAttention(
              (W_Q): Linear(in_features=128, out_features=128, bias=True)
              (W_K): Linear(in_features=128, out_features=128, bias=True)
              (W_V): Linear(in_features=128, out_features=128, bias=True)
              (sdp_attn): _ScaledDotProductAttention(
                (attn_dropout): Dropout(p=0.0, inplace=False)
              )
              (to_out): Sequential(
                (0): Linear(in_features=128, out_features=128, bias=True)
                (1): Dropout(p=0.2, inplace=False)
              )
            )
            (dropout_attn): Dropout(p=0.2, inplace=False)
            (norm_attn): Sequential(
              (0): Transpose()
        

### Testing

In [45]:
exp.test(setting)
torch.cuda.empty_cache()

test 11425
mse:0.3276263177394867, mae:0.37977054715156555, rse:0.5446505546569824


---
## Trail 2: PatchTST, Dataset:ETTm1 , Metric: 192
### Set hyperparameters
Set some parameters (Args) for the our experiment like dictionary

In [46]:
args.pred_len = 192 # prediction sequence length

print(f"Dataset: {args.data},  Prediction Length : {args.pred_len}") 
# print(args)

Dataset: ETTm1,  Prediction Length : 192


### Training

In [47]:
Exp = Exp_Main
setting=f'PatchTST_train_on_{args.data}_{args.pred_len}'
# set experiments
exp = Exp(args)
exp.train(setting)

Use GPU: cuda:0
train 34033
val 11329
test 11329
	iters: 100, epoch: 1 | loss: 0.3568006
	speed: 0.0786s/iter; left time: 8344.0066s
	iters: 200, epoch: 1 | loss: 0.3742522
	speed: 0.0761s/iter; left time: 8072.1618s
	iters: 300, epoch: 1 | loss: 0.3438635
	speed: 0.0757s/iter; left time: 8029.4292s
	iters: 400, epoch: 1 | loss: 0.3249428
	speed: 0.0731s/iter; left time: 7738.0330s
	iters: 500, epoch: 1 | loss: 0.3478993
	speed: 0.0720s/iter; left time: 7613.1840s
	iters: 600, epoch: 1 | loss: 0.3752334
	speed: 0.0811s/iter; left time: 8576.8257s
	iters: 700, epoch: 1 | loss: 0.3527400
	speed: 0.0789s/iter; left time: 8330.3545s
	iters: 800, epoch: 1 | loss: 0.3734523
	speed: 0.0811s/iter; left time: 8552.4106s
	iters: 900, epoch: 1 | loss: 0.2887919
	speed: 0.0719s/iter; left time: 7576.1824s
	iters: 1000, epoch: 1 | loss: 0.3661746
	speed: 0.0809s/iter; left time: 8523.2654s
Epoch: 1 cost time: 81.9584219455719
Epoch: 1, Steps: 1063 | Train Loss: 0.3471995 Vali Loss: 0.5104088 Test L

Model(
  (model): PatchTST_backbone(
    (backbone): TSTiEncoder(
      (W_P): Linear(in_features=16, out_features=128, bias=True)
      (dropout): Dropout(p=0.2, inplace=False)
      (encoder): TSTEncoder(
        (layers): ModuleList(
          (0-2): 3 x TSTEncoderLayer(
            (self_attn): _MultiheadAttention(
              (W_Q): Linear(in_features=128, out_features=128, bias=True)
              (W_K): Linear(in_features=128, out_features=128, bias=True)
              (W_V): Linear(in_features=128, out_features=128, bias=True)
              (sdp_attn): _ScaledDotProductAttention(
                (attn_dropout): Dropout(p=0.0, inplace=False)
              )
              (to_out): Sequential(
                (0): Linear(in_features=128, out_features=128, bias=True)
                (1): Dropout(p=0.2, inplace=False)
              )
            )
            (dropout_attn): Dropout(p=0.2, inplace=False)
            (norm_attn): Sequential(
              (0): Transpose()
        

### Testing

In [48]:
exp.test(setting)
torch.cuda.empty_cache()

test 11329
mse:0.36035341024398804, mae:0.3956582248210907, rse:0.571427583694458


---
## Trail 3: PatchTST, Dataset:ETTm1,  Metric: 336

### Set hyperparameters
Set some parameters (Args) for the our experiment like dictionary


In [86]:
args.pred_len = 336 # prediction sequence length
print(f"Dataset: {args.data},  Prediction Length : {args.pred_len}") 
# print(args)

Dataset: ETTm2,  Prediction Length : 336


### Training

In [87]:
Exp = Exp_Main
setting=f'PatchTST_train_on_{args.data}_{args.pred_len}'
# set experiments
exp = Exp(args)
exp.train(setting)

Use GPU: cuda:0
train 33889
val 11185
test 11185
	iters: 100, epoch: 1 | loss: 0.3282336
	speed: 0.0783s/iter; left time: 8281.5136s
	iters: 200, epoch: 1 | loss: 0.2279086
	speed: 0.0801s/iter; left time: 8462.1668s
	iters: 300, epoch: 1 | loss: 0.4095321
	speed: 0.0790s/iter; left time: 8342.7958s
	iters: 400, epoch: 1 | loss: 0.4098197
	speed: 0.0813s/iter; left time: 8571.9696s
	iters: 500, epoch: 1 | loss: 0.2569183
	speed: 0.0800s/iter; left time: 8427.6906s
	iters: 600, epoch: 1 | loss: 0.2969300
	speed: 0.0878s/iter; left time: 9243.4873s
	iters: 700, epoch: 1 | loss: 0.5525036
	speed: 0.0819s/iter; left time: 8620.2352s
	iters: 800, epoch: 1 | loss: 0.4619457
	speed: 0.0781s/iter; left time: 8206.4157s
	iters: 900, epoch: 1 | loss: 0.3152940
	speed: 0.0861s/iter; left time: 9039.8930s
	iters: 1000, epoch: 1 | loss: 0.3613250
	speed: 0.0819s/iter; left time: 8590.5206s
Epoch: 1 cost time: 86.13871335983276
Epoch: 1, Steps: 1059 | Train Loss: 0.3985891 Vali Loss: 0.2208271 Test 

Model(
  (model): PatchTST_backbone(
    (backbone): TSTiEncoder(
      (W_P): Linear(in_features=16, out_features=128, bias=True)
      (dropout): Dropout(p=0.2, inplace=False)
      (encoder): TSTEncoder(
        (layers): ModuleList(
          (0-2): 3 x TSTEncoderLayer(
            (self_attn): _MultiheadAttention(
              (W_Q): Linear(in_features=128, out_features=128, bias=True)
              (W_K): Linear(in_features=128, out_features=128, bias=True)
              (W_V): Linear(in_features=128, out_features=128, bias=True)
              (sdp_attn): _ScaledDotProductAttention(
                (attn_dropout): Dropout(p=0.0, inplace=False)
              )
              (to_out): Sequential(
                (0): Linear(in_features=128, out_features=128, bias=True)
                (1): Dropout(p=0.2, inplace=False)
              )
            )
            (dropout_attn): Dropout(p=0.2, inplace=False)
            (norm_attn): Sequential(
              (0): Transpose()
        

### Testing

In [88]:
exp.test(setting)
torch.cuda.empty_cache()

test 11185
mse:0.29942047595977783, mae:0.35299360752105713, rse:0.44180169701576233


---
## Trail 4: PatchTST, Dataset:ETTm1,  Metric: 720

### Set hyperparameters
Set some parameters (Args) for the our experiment like dictionary


In [89]:
args.pred_len = 720 # prediction sequence length
print(f"Dataset: {args.data},  Prediction Length : {args.pred_len}") 
# print(args)

Dataset: ETTm2,  Prediction Length : 720


### Training

In [90]:
Exp = Exp_Main
setting=f'PatchTST_train_on_{args.data}_{args.pred_len}'
# set experiments
exp = Exp(args)
exp.train(setting)

Use GPU: cuda:0
train 33505
val 10801
test 10801
	iters: 100, epoch: 1 | loss: 0.5727623
	speed: 0.0841s/iter; left time: 8800.4818s
	iters: 200, epoch: 1 | loss: 0.3959309
	speed: 0.0908s/iter; left time: 9492.0281s
	iters: 300, epoch: 1 | loss: 0.4451066
	speed: 0.0772s/iter; left time: 8059.9810s
	iters: 400, epoch: 1 | loss: 0.4637458
	speed: 0.0878s/iter; left time: 9161.6836s
	iters: 500, epoch: 1 | loss: 0.6901796
	speed: 0.0839s/iter; left time: 8742.3562s
	iters: 600, epoch: 1 | loss: 0.6514398
	speed: 0.0910s/iter; left time: 9474.4076s
	iters: 700, epoch: 1 | loss: 0.6205701
	speed: 0.0861s/iter; left time: 8955.4781s
	iters: 800, epoch: 1 | loss: 0.3094168
	speed: 0.0850s/iter; left time: 8828.5576s
	iters: 900, epoch: 1 | loss: 0.3135817
	speed: 0.0810s/iter; left time: 8404.1996s
	iters: 1000, epoch: 1 | loss: 0.2895051
	speed: 0.0860s/iter; left time: 8919.2385s
Epoch: 1 cost time: 89.50925374031067
Epoch: 1, Steps: 1047 | Train Loss: 0.5030201 Vali Loss: 0.3659404 Test 

Model(
  (model): PatchTST_backbone(
    (backbone): TSTiEncoder(
      (W_P): Linear(in_features=16, out_features=128, bias=True)
      (dropout): Dropout(p=0.2, inplace=False)
      (encoder): TSTEncoder(
        (layers): ModuleList(
          (0-2): 3 x TSTEncoderLayer(
            (self_attn): _MultiheadAttention(
              (W_Q): Linear(in_features=128, out_features=128, bias=True)
              (W_K): Linear(in_features=128, out_features=128, bias=True)
              (W_V): Linear(in_features=128, out_features=128, bias=True)
              (sdp_attn): _ScaledDotProductAttention(
                (attn_dropout): Dropout(p=0.0, inplace=False)
              )
              (to_out): Sequential(
                (0): Linear(in_features=128, out_features=128, bias=True)
                (1): Dropout(p=0.2, inplace=False)
              )
            )
            (dropout_attn): Dropout(p=0.2, inplace=False)
            (norm_attn): Sequential(
              (0): Transpose()
        

### Testing

In [91]:
exp.test(setting)
torch.cuda.empty_cache()

test 10801
mse:0.4332469403743744, mae:0.4294425845146179, rse:0.528854250907898


---
# Working on ETTm2 Dataset
---

## Trail 1: PatchTST, Dataset:ETTm2,  Metric: 96
### Set hyperparameters
Set some parameters (Args) for the our experiment like dictionary


In [92]:
args.data_path = 'ETTm2.csv' # data file
args.data = 'ETTm2'  # data
args.pred_len = 96 # prediction sequence length
print(f"Dataset: {args.data},  Prediction Length : {args.pred_len}") 
# print(args)

Dataset: ETTm2,  Prediction Length : 96


### Training

In [93]:
Exp = Exp_Main
setting=f'PatchTST_train_on_{args.data}_{args.pred_len}'
# set experiments
exp = Exp(args)
exp.train(setting)

Use GPU: cuda:0
train 34129
val 11425
test 11425
	iters: 100, epoch: 1 | loss: 0.2986088
	speed: 0.0775s/iter; left time: 8252.6501s
	iters: 200, epoch: 1 | loss: 0.2189520
	speed: 0.0761s/iter; left time: 8093.1558s
	iters: 300, epoch: 1 | loss: 0.2698198
	speed: 0.0727s/iter; left time: 7728.7875s
	iters: 400, epoch: 1 | loss: 0.5238720
	speed: 0.0830s/iter; left time: 8813.3957s
	iters: 500, epoch: 1 | loss: 0.2177546
	speed: 0.0730s/iter; left time: 7746.8619s
	iters: 600, epoch: 1 | loss: 0.2002172
	speed: 0.0753s/iter; left time: 7982.5597s
	iters: 700, epoch: 1 | loss: 0.2478255
	speed: 0.0777s/iter; left time: 8231.2701s
	iters: 800, epoch: 1 | loss: 0.3425599
	speed: 0.0781s/iter; left time: 8262.8083s
	iters: 900, epoch: 1 | loss: 0.3695400
	speed: 0.0839s/iter; left time: 8872.0942s
	iters: 1000, epoch: 1 | loss: 0.1490627
	speed: 0.0749s/iter; left time: 7914.3217s
Epoch: 1 cost time: 82.24881148338318
Epoch: 1, Steps: 1066 | Train Loss: 0.2849889 Vali Loss: 0.2099577 Test 

Model(
  (model): PatchTST_backbone(
    (backbone): TSTiEncoder(
      (W_P): Linear(in_features=16, out_features=128, bias=True)
      (dropout): Dropout(p=0.2, inplace=False)
      (encoder): TSTEncoder(
        (layers): ModuleList(
          (0-2): 3 x TSTEncoderLayer(
            (self_attn): _MultiheadAttention(
              (W_Q): Linear(in_features=128, out_features=128, bias=True)
              (W_K): Linear(in_features=128, out_features=128, bias=True)
              (W_V): Linear(in_features=128, out_features=128, bias=True)
              (sdp_attn): _ScaledDotProductAttention(
                (attn_dropout): Dropout(p=0.0, inplace=False)
              )
              (to_out): Sequential(
                (0): Linear(in_features=128, out_features=128, bias=True)
                (1): Dropout(p=0.2, inplace=False)
              )
            )
            (dropout_attn): Dropout(p=0.2, inplace=False)
            (norm_attn): Sequential(
              (0): Transpose()
        

### Testing

In [94]:
exp.test(setting)
torch.cuda.empty_cache()

test 11425
mse:0.19508586823940277, mae:0.2823296785354614, rse:0.3580951392650604


---
## Trail 2: PatchTST, Dataset:ETTm2,  Metric: 192
### Set hyperparameters
Set some parameters (Args) for the our experiment like dictionary

In [95]:
args.pred_len = 192 # prediction sequence length
print(f"Dataset: {args.data},  Prediction Length : {args.pred_len}") 

Dataset: ETTm2,  Prediction Length : 192


### Training

In [96]:
Exp = Exp_Main
setting=f'PatchTST_train_on_{args.data}_{args.pred_len}'
# set experiments
exp = Exp(args)
exp.train(setting)

Use GPU: cuda:0
train 34033
val 11329
test 11329
	iters: 100, epoch: 1 | loss: 0.3124763
	speed: 0.0878s/iter; left time: 9321.1883s
	iters: 200, epoch: 1 | loss: 0.3796742
	speed: 0.0962s/iter; left time: 10207.2771s
	iters: 300, epoch: 1 | loss: 0.3381073
	speed: 0.0900s/iter; left time: 9538.5721s
	iters: 400, epoch: 1 | loss: 0.4121913
	speed: 0.0940s/iter; left time: 9956.7004s
	iters: 500, epoch: 1 | loss: 0.3012684
	speed: 0.0931s/iter; left time: 9846.1644s
	iters: 600, epoch: 1 | loss: 0.2452905
	speed: 0.0860s/iter; left time: 9093.2682s
	iters: 700, epoch: 1 | loss: 0.3639145
	speed: 0.0910s/iter; left time: 9606.1875s
	iters: 800, epoch: 1 | loss: 0.2285268
	speed: 0.0852s/iter; left time: 8986.0999s
	iters: 900, epoch: 1 | loss: 0.2316541
	speed: 0.0889s/iter; left time: 9366.8485s
	iters: 1000, epoch: 1 | loss: 0.3153989
	speed: 0.0930s/iter; left time: 9797.8035s
Epoch: 1 cost time: 96.19215726852417
Epoch: 1, Steps: 1063 | Train Loss: 0.3403623 Vali Loss: 0.1967488 Test

Model(
  (model): PatchTST_backbone(
    (backbone): TSTiEncoder(
      (W_P): Linear(in_features=16, out_features=128, bias=True)
      (dropout): Dropout(p=0.2, inplace=False)
      (encoder): TSTEncoder(
        (layers): ModuleList(
          (0-2): 3 x TSTEncoderLayer(
            (self_attn): _MultiheadAttention(
              (W_Q): Linear(in_features=128, out_features=128, bias=True)
              (W_K): Linear(in_features=128, out_features=128, bias=True)
              (W_V): Linear(in_features=128, out_features=128, bias=True)
              (sdp_attn): _ScaledDotProductAttention(
                (attn_dropout): Dropout(p=0.0, inplace=False)
              )
              (to_out): Sequential(
                (0): Linear(in_features=128, out_features=128, bias=True)
                (1): Dropout(p=0.2, inplace=False)
              )
            )
            (dropout_attn): Dropout(p=0.2, inplace=False)
            (norm_attn): Sequential(
              (0): Transpose()
        

### Testing

In [None]:
exp.test(setting)
torch.cuda.empty_cache()

test 11329
mse:0.24460437893867493, mae:0.3230002522468567, rse:0.40032851696014404


---
## Trail 3: PatchTST, Dataset:ETTm2,  Metric: 336

### Set hyperparameters
Set some parameters (Args) for the our experiment like dictionary


In [104]:
args.pred_len = 336 # prediction sequence length
print(f"Dataset: {args.data},  Prediction Length : {args.pred_len}") 
# print(args)

Dataset: ETTm2,  Prediction Length : 336


### Training

In [None]:
Exp = Exp_Main
setting=f'PatchTST_train_on_{args.data}_{args.pred_len}'
# set experiments
exp = Exp(args)
exp.train(setting)

Use GPU: cuda:0
train 33889
val 11185
test 11185
	iters: 100, epoch: 1 | loss: 0.4401748
	speed: 0.1030s/iter; left time: 10898.1162s
	iters: 200, epoch: 1 | loss: 0.5422932
	speed: 0.0977s/iter; left time: 10327.0890s
	iters: 300, epoch: 1 | loss: 0.3221259
	speed: 0.1112s/iter; left time: 11747.2829s
	iters: 400, epoch: 1 | loss: 0.7222717
	speed: 0.1010s/iter; left time: 10656.1548s
	iters: 500, epoch: 1 | loss: 0.3426677
	speed: 0.1169s/iter; left time: 12325.2431s
	iters: 600, epoch: 1 | loss: 0.3434246
	speed: 0.1098s/iter; left time: 11563.6212s
	iters: 700, epoch: 1 | loss: 0.5051081
	speed: 0.1080s/iter; left time: 11361.8850s
	iters: 800, epoch: 1 | loss: 0.4058499
	speed: 0.0944s/iter; left time: 9919.6070s
	iters: 900, epoch: 1 | loss: 0.4125472
	speed: 0.1078s/iter; left time: 11315.0532s
	iters: 1000, epoch: 1 | loss: 0.4514018
	speed: 0.1049s/iter; left time: 11003.0042s
Epoch: 1 cost time: 112.20184254646301
Epoch: 1, Steps: 1059 | Train Loss: 0.4023900 Vali Loss: 0.245

Model(
  (model): PatchTST_backbone(
    (backbone): TSTiEncoder(
      (W_P): Linear(in_features=16, out_features=128, bias=True)
      (dropout): Dropout(p=0.2, inplace=False)
      (encoder): TSTEncoder(
        (layers): ModuleList(
          (0-2): 3 x TSTEncoderLayer(
            (self_attn): _MultiheadAttention(
              (W_Q): Linear(in_features=128, out_features=128, bias=True)
              (W_K): Linear(in_features=128, out_features=128, bias=True)
              (W_V): Linear(in_features=128, out_features=128, bias=True)
              (sdp_attn): _ScaledDotProductAttention(
                (attn_dropout): Dropout(p=0.0, inplace=False)
              )
              (to_out): Sequential(
                (0): Linear(in_features=128, out_features=128, bias=True)
                (1): Dropout(p=0.2, inplace=False)
              )
            )
            (dropout_attn): Dropout(p=0.2, inplace=False)
            (norm_attn): Sequential(
              (0): Transpose()
        

### Testing

In [None]:
exp.test(setting)
torch.cuda.empty_cache()

test 11185
mse:0.3013668358325958, mae:0.35843971371650696, rse:0.4432353377342224


---
## Trail 4: PatchTST, Dataset:ETTm2,  Metric: 720

### Set hyperparameters
Set some parameters (Args) for the our experiment like dictionary


In [None]:
args.pred_len = 720 # prediction sequence length
print(f"Dataset: {args.data},  Prediction Length : {args.pred_len}") 

Dataset: ETTm2,  Prediction Length : 720


### Training

In [None]:
Exp = Exp_Main
setting=f'PatchTST_train_on_{args.data}_{args.pred_len}'
# set experiments
exp = Exp(args)
exp.train(setting)

Use GPU: cuda:0
train 33505
val 10801
test 10801
	iters: 100, epoch: 1 | loss: 0.2949437
	speed: 0.3523s/iter; left time: 36853.6838s
	iters: 200, epoch: 1 | loss: 0.8657505
	speed: 0.4260s/iter; left time: 44519.8551s
	iters: 300, epoch: 1 | loss: 0.3538330
	speed: 0.4561s/iter; left time: 47612.6423s
	iters: 400, epoch: 1 | loss: 0.3703703
	speed: 0.4670s/iter; left time: 48712.9703s
	iters: 500, epoch: 1 | loss: 0.6584207
	speed: 0.4519s/iter; left time: 47086.6378s
	iters: 600, epoch: 1 | loss: 0.5931085
	speed: 0.3570s/iter; left time: 37162.7061s
	iters: 700, epoch: 1 | loss: 0.3703278
	speed: 0.3511s/iter; left time: 36516.6377s
	iters: 800, epoch: 1 | loss: 0.2711176
	speed: 0.3659s/iter; left time: 38018.1046s
	iters: 900, epoch: 1 | loss: 0.3374104
	speed: 0.3499s/iter; left time: 36324.9557s
	iters: 1000, epoch: 1 | loss: 0.5459972
	speed: 0.3690s/iter; left time: 38269.0294s
Epoch: 1 cost time: 410.51699566841125
Epoch: 1, Steps: 1047 | Train Loss: 0.5027259 Vali Loss: 0.29

Model(
  (model): PatchTST_backbone(
    (backbone): TSTiEncoder(
      (W_P): Linear(in_features=16, out_features=128, bias=True)
      (dropout): Dropout(p=0.2, inplace=False)
      (encoder): TSTEncoder(
        (layers): ModuleList(
          (0-2): 3 x TSTEncoderLayer(
            (self_attn): _MultiheadAttention(
              (W_Q): Linear(in_features=128, out_features=128, bias=True)
              (W_K): Linear(in_features=128, out_features=128, bias=True)
              (W_V): Linear(in_features=128, out_features=128, bias=True)
              (sdp_attn): _ScaledDotProductAttention(
                (attn_dropout): Dropout(p=0.0, inplace=False)
              )
              (to_out): Sequential(
                (0): Linear(in_features=128, out_features=128, bias=True)
                (1): Dropout(p=0.2, inplace=False)
              )
            )
            (dropout_attn): Dropout(p=0.2, inplace=False)
            (norm_attn): Sequential(
              (0): Transpose()
        

### Testing

In [None]:
exp.test(setting)
torch.cuda.empty_cache()

test 10801
mse:0.3930669128894806, mae:0.4152883291244507, rse:0.5037342309951782
