In [None]:
# Imports
import io
import os
import glob
import torch
import base64
import stable_baselines3
from torch import nn
from torch.functional import F
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from stable_baselines3 import DQN
from stable_baselines3.dqn.policies import CnnPolicy

import gym

from gym.wrappers import Monitor
from stable_baselines3.common.callbacks import EvalCallback


import pandas as pd
from pathlib import Path

from helper_functions import wrap_env, extract_information

## Extract information from the pretrained model with random initialization

In [None]:
# Directory for evaluation logs; created up front because a later cell
# passes it to EvalCallback.
log_dir = './log/'
Path(log_dir).mkdir(exist_ok=True, parents=True)

# Load the pretrained agent straight from its checkpoint.
# `load` is a classmethod: it builds a brand-new model and restores the
# hyperparameters stored in the checkpoint. Constructing a DQN beforehand
# (on any environment, with any net_arch / learning rate) has no influence
# on the returned model, so no throw-away model is built here.
model = DQN.load('../models/laura_best_panda.zip')

# Seed the pseudo random generators explicitly so the evaluation below starts
# from a known RNG state.
model.set_random_seed(1)

In [None]:
# Evaluation environment: PandaLander with state observations and a
# randomised initial x position.
env = gym.make(
    'PandaLander-v2',
    observe_state=True,
    random_initial_x=True,
)

# Run the pretrained agent and store the collected information in ./original/.
# NOTE(review): 100 is presumably the number of evaluation episodes — confirm
# against helper_functions.extract_information.
extract_information(
    model,
    env,
    './original/',
    100,
    save_video=False,
)

# Transfer learning from the previous model with random initialization

In [None]:
# Learning rate used for fine-tuning.
# NOTE(review): 1e-10 is small enough that the weights will barely move —
# confirm this is intended.
learning_rate = 1e-10

# Training environment.
env = gym.make('PandaLander-v2',
               observe_state=True,
               random_initial_x=True)

# Dedicated evaluation environment. EvalCallback resets and steps the env it
# is given; sharing the training env instance would interrupt the episode
# being collected for training every time an evaluation runs.
eval_env = gym.make('PandaLander-v2',
                    observe_state=True,
                    random_initial_x=True)
callback = EvalCallback(eval_env, log_path=log_dir, deterministic=True)  # periodically evaluates the agent and logs the results

# NOTE(review): net_arch must match the architecture stored in the checkpoint,
# otherwise the parameter copy below cannot succeed — confirm.
policy_kwargs = dict(activation_fn=torch.nn.ReLU,
                     net_arch=[512, 512])
model = DQN("MlpPolicy",
            env,
            policy_kwargs=policy_kwargs,
            learning_rate=learning_rate,
            batch_size=32,  # number of transitions sampled per gradient step; with buffer_size=1 these are presumably all the same transition
            buffer_size=1,  # size of the experience replay buffer: only the latest transition is kept
            learning_starts=1,  # learning starts immediately
            gamma=0.99,  # discount factor, between 0 and 1
            tau=1,  # soft update coefficient for the target network (1 = hard update)
            target_update_interval=1,  # update the target network at every step
            train_freq=(1, "step"),  # train the network at every step
            max_grad_norm=10,  # maximum value for gradient clipping
            exploration_initial_eps=1,  # initial value of the random action probability
            exploration_fraction=0.5,  # fraction of the training period over which exploration is reduced
            gradient_steps=1,  # number of gradient steps per update
            seed=1,  # seed for the pseudo random generators
            verbose=1)  # print training logs

# `load` is a classmethod that returns a new model; call it on the class so it
# is clear that `model` itself is not modified by this line.
model_saved = DQN.load('../models/laura_best_panda.zip')

# Copy the pretrained parameters into the freshly configured model.
model.set_parameters(model_saved.get_parameters())

# Re-assert the fine-tuning learning rate on the model after the parameter
# copy (same value as passed to the constructor above).
model.learning_rate = learning_rate

In [None]:
# Fine-tune the agent for 100k environment steps; the evaluation callback
# runs alongside training and training stats are logged every 10 episodes.
model.learn(
    total_timesteps=100_000,
    callback=callback,
    log_interval=10,
)

In [None]:
# Fresh evaluation environment, configured like the one used for the
# pretrained agent so both result sets are comparable.
env = gym.make(
    'PandaLander-v2',
    observe_state=True,
    random_initial_x=True,
)

# Run the fine-tuned agent and store the collected information in ./transfer/.
# NOTE(review): 100 is presumably the number of evaluation episodes — confirm
# against helper_functions.extract_information.
extract_information(
    model,
    env,
    './transfer/',
    100,
    save_video=False,
)

In [None]:
# Persist the fine-tuned agent next to the other checkpoints.
model.save("../models/transfered.zip")