In [1]:
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning import Trainer
from policy import ForwardPolicy, BackwardPolicy
from gflownet.gflownet import GFlowNet
from gflownet.dataset import MatrixDataModule
import itertools
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from gflownet.gflownet import GFlowNet
from gflownet.dataset import MatrixDataModule


def run_experiment(hyperparams, matrix_dir='data/medium_ILU', batch_size=1):
    """Train one GFlowNet configuration and log the run to TensorBoard.

    Args:
        hyperparams: Mapping that must contain the keys ``lr``,
            ``number_epoch``, ``no_sampling_batch``, ``hidden_dim``,
            ``node_features``, ``input_dim``, ``max_num_actions`` and
            ``schedule_patience``. An optional ``seed`` key, when present and
            not ``None``, is passed to ``pl.seed_everything`` before any
            object is built.
        matrix_dir: Directory passed to ``MatrixDataModule`` as
            ``matrix_directory``. Defaults to the previously hard-coded path.
        batch_size: Batch size passed to ``MatrixDataModule``. Defaults to the
            previously hard-coded value of 1.

    Raises:
        KeyError: If one or more required hyperparameters are missing. All
            missing keys are reported at once, before anything is constructed.
    """
    required_keys = (
        'lr',
        'number_epoch',
        'no_sampling_batch',
        'hidden_dim',
        'node_features',
        'input_dim',
        'max_num_actions',
        'schedule_patience',
    )
    missing = [key for key in required_keys if key not in hyperparams]
    if missing:
        # Fail fast with the full list instead of a bare KeyError raised
        # part-way through building the policies/model.
        raise KeyError(f"run_experiment: missing hyperparameter(s): {', '.join(missing)}")

    # Opt-in reproducibility: only seed when the caller asked for it, so runs
    # that do not pass a seed behave exactly as before.
    seed = hyperparams.get('seed')
    if seed is not None:
        pl.seed_everything(seed, workers=True)

    data_module = MatrixDataModule(matrix_directory=matrix_dir, batch_size=batch_size)

    forward_policy = ForwardPolicy(
        node_features=hyperparams['node_features'],
        hidden_dim=hyperparams['hidden_dim'],
        max_num_actions=hyperparams['max_num_actions'],
    )
    backward_policy = BackwardPolicy(
        input_dim=hyperparams['input_dim'],
        hidden_dim=hyperparams['hidden_dim'],
        max_num_actions=hyperparams['max_num_actions'],
    )

    model = GFlowNet(
        forward_policy=forward_policy,
        backward_policy=backward_policy,
        no_sampling_batch=hyperparams['no_sampling_batch'],
        lr=hyperparams['lr'],
        schedule_patience=hyperparams['schedule_patience'],
    )

    # The run name encodes the swept values so each configuration gets its own
    # sub-directory under tb_logs/.
    logger = TensorBoardLogger(
        "tb_logs",
        name=f"gflownet_lr_{hyperparams['lr']}_epochs_{hyperparams['number_epoch']}_sampling_{hyperparams['no_sampling_batch']}_patience_{hyperparams['schedule_patience']}",
    )

    # Both callbacks track the logged "train_loss" metric (lower is better).
    # NOTE(review): with patience=10, early stopping presumably cannot fire in
    # runs of 10 epochs or fewer -- confirm this is intended for short sweeps.
    callbacks = [
        EarlyStopping(monitor="train_loss", mode="min", patience=10),
        ModelCheckpoint(monitor="train_loss", save_top_k=3, mode="min"),
    ]

    trainer = pl.Trainer(max_epochs=hyperparams['number_epoch'], logger=logger, callbacks=callbacks)
    trainer.fit(model, data_module)

if __name__ == '__main__':
    # Candidate values for each swept hyperparameter. The epoch and sampling
    # lists are intentionally tiny while the pipeline is being smoke-tested.
    learning_rates = [2e-4, 7e-5, 2e-5]
    number_epochs = [2, 3]  # TODO: change to 50, 100 after testing
    no_sampling_batches = [2, 4]  # TODO: change to 4, 8, 16 after testing
    schedule_patience = [5, 10]

    # Full Cartesian grid (3 * 2 * 2 * 2 = 24 configurations), materialised
    # up front as a list of (lr, epochs, sampling batches, patience) tuples.
    hyperparams_combinations = list(
        itertools.product(
            learning_rates,
            number_epochs,
            no_sampling_batches,
            schedule_patience,
        )
    )

    # One training run per grid point. The architecture-related settings
    # (hidden_dim, node_features, input_dim, max_num_actions) are held fixed.
    for combo in hyperparams_combinations:
        lr, number_epoch, no_sampling_batch, patience = combo
        hyperparams = dict(
            lr=lr,
            number_epoch=number_epoch,
            no_sampling_batch=no_sampling_batch,
            hidden_dim=4,
            node_features=-1,
            input_dim=1,
            max_num_actions=180000,
            schedule_patience=patience,
        )
        run_experiment(hyperparams)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/tonylizza/opt/anaconda3/envs/ML_new/lib/python3.12/site-packages/pytorch_lightning/trainer/configuration_validator.py:68: You passed in a `val_dataloader` but have no `validation_step`. Skipping val loop.
/Users/tonylizza/opt/anaconda3/envs/ML_new/lib/python3.12/site-packages/pytorch_lightning/core/optimizer.py:316: The lr scheduler dict contains the key(s) ['monitor'], but the keys will be ignored. You need to call `lr_scheduler.step()` manually in manual optimization.
/Users/tonylizza/opt/anaconda3/envs/ML_new/lib/python3.12/site-packages/pytorch_lightning/utilities/model_summary/model_summary.py:477: The total number of parameters detected may be inaccurate because the model contains an instance of `UninitializedParameter`. To get an accurate number, set `self.example_input_array` in your LightningModule.

  | Name            | Type           | Params | Mode 
-------

Epoch 0:   0%|          | 0/4 [00:00<?, ?it/s] 

  ilu_indices = torch.tensor([ilu_matrix.row, ilu_matrix.col], dtype=torch.long)
  product = torch.mm(updated_matrix, original_matrix)


Num Actions: 151
tensor([212.9842], dtype=torch.float64)
Num Actions: 151
tensor([405.3284], dtype=torch.float64)
Padded Forward Probs torch.Size([2, 75, 1, 151])
Loss: 114.50151062011719


/Users/tonylizza/opt/anaconda3/envs/ML_new/lib/python3.12/site-packages/pytorch_lightning/utilities/data.py:78: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 39. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.


Epoch 0:  25%|██▌       | 1/4 [00:25<01:16,  0.04it/s, v_num=13]Num Actions: 119
tensor([128.1882], dtype=torch.float64)
Num Actions: 119
tensor([475.9112], dtype=torch.float64)
Padded Forward Probs torch.Size([2, 59, 1, 119])
Loss: 153.8465118408203


/Users/tonylizza/opt/anaconda3/envs/ML_new/lib/python3.12/site-packages/pytorch_lightning/utilities/data.py:78: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 18. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.


Epoch 0:  50%|█████     | 2/4 [00:45<00:45,  0.04it/s, v_num=13]Num Actions: 130
tensor([13.8789], dtype=torch.float64)
Num Actions: 130
tensor([294.6423], dtype=torch.float64)
Padded Forward Probs torch.Size([2, 52, 1, 130])
Loss: 165.34445190429688


/Users/tonylizza/opt/anaconda3/envs/ML_new/lib/python3.12/site-packages/pytorch_lightning/utilities/data.py:78: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 24. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.


Epoch 0:  75%|███████▌  | 3/4 [00:59<00:19,  0.05it/s, v_num=13]Num Actions: 297
[Sampled 100 actions] CPU Memory Usage: 648.71 MB; VMS: 36067.50 MB
100
tensor([250.3973], dtype=torch.float64)
Num Actions: 297
tensor([74.4130], dtype=torch.float64)
Padded Forward Probs torch.Size([2, 148, 1, 297])
Loss: 144.34507751464844


/Users/tonylizza/opt/anaconda3/envs/ML_new/lib/python3.12/site-packages/pytorch_lightning/utilities/data.py:78: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 20. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.


Epoch 1:   0%|          | 0/4 [00:00<?, ?it/s, v_num=13]        Num Actions: 119
tensor([401.7176], dtype=torch.float64)
Num Actions: 119
tensor([378.4015], dtype=torch.float64)
Padded Forward Probs torch.Size([2, 59, 1, 119])
Loss: 34.40227127075195
Epoch 1:  25%|██▌       | 1/4 [00:37<01:52,  0.03it/s, v_num=13]Num Actions: 297
[Sampled 100 actions] CPU Memory Usage: 749.33 MB; VMS: 36024.70 MB
100
tensor([250.3690], dtype=torch.float64)
Num Actions: 297
[Sampled 100 actions] CPU Memory Usage: 761.24 MB; VMS: 36054.70 MB
100
tensor([250.3076], dtype=torch.float64)
Padded Forward Probs torch.Size([2, 148, 1, 297])
Loss: 29.8472843170166
Epoch 1:  50%|█████     | 2/4 [02:29<02:29,  0.01it/s, v_num=13]Num Actions: 130
tensor([383.6275], dtype=torch.float64)
Num Actions: 130
tensor([416.3603], dtype=torch.float64)
Padded Forward Probs torch.Size([2, 64, 1, 130])
Loss: 35.270687103271484
Epoch 1:  75%|███████▌  | 3/4 [03:31<01:10,  0.01it/s, v_num=13]Num Actions: 151
tensor([297.8815], dt

`Trainer.fit` stopped: `max_epochs=2` reached.


Epoch 1: 100%|██████████| 4/4 [04:06<00:00,  0.02it/s, v_num=13]


  0%|          | 0/1 [00:00<?, ?it/s]

Type of b_vector: <class 'numpy.ndarray'>
GMRES converged successfully.
GMRES no preconditioner
GMRES did not converge. Exit code: tensor([30])
GMRES orig preconditioner
GMRES did not converge. Exit code: tensor([30])
GMRES sparse preconditioner
GMRES did not converge. Exit code: tensor([30])
GMRES sparse preconditioner
GMRES did not converge. Exit code: tensor([30])
GMRES sparse preconditioner
GMRES did not converge. Exit code: tensor([30])
GMRES sparse preconditioner
GMRES did not converge. Exit code: tensor([30])
GMRES sparse preconditioner
GMRES did not converge. Exit code: tensor([30])
GMRES sparse preconditioner
GMRES did not converge. Exit code: tensor([30])
GMRES sparse preconditioner
GMRES did not converge. Exit code: tensor([30])
GMRES sparse preconditioner
GMRES did not converge. Exit code: tensor([30])
GMRES sparse preconditioner


100%|██████████| 1/1 [09:19<00:00, 559.20s/it]

GMRES did not converge. Exit code: tensor([30])
GMRES sparse preconditioner
Validation results saved to validation_results_20241022135859.csv
Logged validation results after training epoch 2



GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name            | Type           | Params | Mode 
-----------------------------------------------------------
0 | forward_policy  | ForwardPolicy  | 900 K  | train
1 | backward_policy | BackwardPolicy | 900 K  | train
-----------------------------------------------------------
1.8 M     Trainable params
0         Non-trainable params
1.8 M     Total params
7.201     Total estimated model params size (MB)
17        Modules in train mode
0         Modules in eval mode


Epoch 0:   0%|          | 0/4 [00:00<?, ?it/s] Num Actions: 151
tensor([366.6802], dtype=torch.float64)
Num Actions: 151
tensor([350.8768], dtype=torch.float64)
Padded Forward Probs torch.Size([2, 75, 1, 151])
Loss: 46.79605484008789
Epoch 0:  25%|██▌       | 1/4 [02:11<06:35,  0.01it/s, v_num=0]Num Actions: 119
tensor([381.9548], dtype=torch.float64)
Num Actions: 119
tensor([378.8410], dtype=torch.float64)
Padded Forward Probs torch.Size([2, 59, 1, 119])
Loss: 35.65675735473633
Epoch 0:  50%|█████     | 2/4 [03:51<03:51,  0.01it/s, v_num=0]Num Actions: 130
tensor([419.3176], dtype=torch.float64)
Num Actions: 130
tensor([293.0187], dtype=torch.float64)
Padded Forward Probs torch.Size([2, 64, 1, 130])
Loss: 92.41976165771484
Epoch 0:  75%|███████▌  | 3/4 [05:15<01:45,  0.01it/s, v_num=0]Num Actions: 297
[Sampled 100 actions] CPU Memory Usage: 666.93 MB; VMS: 36115.82 MB
100


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x1109b9790>>
Traceback (most recent call last):
  File "/Users/tonylizza/opt/anaconda3/envs/ML_new/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 
