In [None]:
import sys
import os
import pandas as pd
import numpy as numpy
import random
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split

from gymnasium.spaces import Box
from src.state_representation import DenseAutoEncoder, BaseForwardModel, CombinedModel, BaseInverseModel, BaseRewardModel
from stable_baselines3.common.buffers import ReplayBuffer

from src.ae_utils import (create_replay_buffer, populate_replay_buffer, get_action,
                        extract_data_from_buffer, prepare_dataloaders, train_combined_model)

In [None]:
# Instantiate your environment.
# NOTE: `setup_environment` is neither defined nor imported in this notebook;
# provide (or modify) it so that it returns your Gymnasium environment object
# together with the other values unpacked below.
env_name = '<ENV_NAME>'  # template placeholder: replace with your environment name
n_agent = 15
random_seed = 0

# Seed every random number generator imported by this notebook (not only the
# environment) so that buffer population, weight initialisation and training
# do not start from an arbitrary RNG state on each kernel restart.
random.seed(random_seed)
numpy.random.seed(random_seed)
torch.manual_seed(random_seed)

env, agents, simulation_start, simulation_end = setup_environment(env_name, n_agent, random_seed)

In [None]:
# Instantiate the replay buffer and add <SIZE> transitions.
# NOTE: `<SIZE>` below is a template placeholder, not valid Python: replace it
# with the number of transitions you want before running this cell.
size = <SIZE>

# The same `size` is passed both when creating the buffer and when populating it.
buffer = create_replay_buffer(env, n_agent, size)
populate_replay_buffer(env, buffer, size, n_agent)

In [None]:
# Define the type of model we want to use; here we train with every auxiliary
# model available (forward, inverse and reward). In this cell the label is only
# used to build the plot and weight directories.
type_ae = 'forward_inverse_reward'

# Dimensions are read from the environment spaces.
input_dim = env.observation_space.shape[0]
output_dim = input_dim  # output has the same dimension as the input
state_dim = 90  # size of the latent space, modify as you wish
action_dim = env.action_space.shape[0]  # number of actions from your env

# Training hyper-parameters.
batch_size = 64
learning_rate = 1e-3
num_epochs = 10
test_size = 0.2
validation_size = 0.1

use_next_states = False  # we use the prediction of the forward model

# Output locations. They are created up front so that a missing directory is
# not discovered only when something is written at the end of a training run.
plot_dir = f'./plots/{type_ae}/'
weights_dir = f'./saved_models/{type_ae}/state_dim_{state_dim}/'
os.makedirs(plot_dir, exist_ok=True)
os.makedirs(weights_dir, exist_ok=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Instantiate the auxiliary models. Each init*Net call runs BEFORE the model is
# moved to `device`, so whatever layers it creates are moved as well instead of
# being left on the CPU.
autoencoder = DenseAutoEncoder(input_dim, output_dim, state_dim).to(device)

forward_model = BaseForwardModel()
forward_model.initForwardNet(state_dim, action_dim, model_type='mlp')
forward_model = forward_model.to(device)

inverse_model = BaseInverseModel()
inverse_model.initInverseNet(state_dim, action_dim, model_type="mlp")
inverse_model = inverse_model.to(device)

reward_model = BaseRewardModel()
reward_model.initRewardNet(state_dim, model_type="mlp")
reward_model = reward_model.to(device)

combined_model = CombinedModel(autoencoder, forward_model, inverse_model, reward_model).to(device)

# Losses (one per objective) and a single optimizer over the combined model's parameters.
reconstruction_criterion = nn.MSELoss()
prediction_criterion = nn.MSELoss()
inverse_criterion = nn.MSELoss()  # Assuming continuous actions
reward_criterion = nn.MSELoss()
optimizer = optim.Adam(combined_model.parameters(), lr=learning_rate)

In [None]:
# Extract the transitions from the replay buffer and split them, using the
# split fractions defined in the configuration cell (`test_size`,
# `validation_size`) rather than repeating the literal values here.
data_splits = extract_data_from_buffer(
    buffer, input_dim, action_dim,
    test_size=test_size, validation_size=validation_size,
)

# `data_splits` is unpacked positionally into prepare_dataloaders.
train_loader, val_loader, test_loader = prepare_dataloaders(*data_splits, batch_size=batch_size)

In [None]:
# Launch training of the combined model with the loaders, loss criteria,
# optimizer and output directories built in the previous cells.
# NOTE(review): the import cell brings in `train_combined_model` from
# src.ae_utils, but this call uses `train_combined_model_offline`, which is not
# imported anywhere in this notebook. On a fresh kernel this raises NameError
# unless the name is defined elsewhere; confirm which function is intended and
# align the import with the call.
train_combined_model_offline(combined_model, state_dim, train_loader, val_loader,
        test_loader, num_epochs, reconstruction_criterion, prediction_criterion,
        inverse_criterion, reward_criterion, optimizer, use_next_states,
        plot_dir, weights_dir)
