In [None]:
%matplotlib inline
import bisect
import copy 
import os 
from collections import deque, Counter
import random
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import editdistance
import sys
import RNA
from typing import Dict, List, Tuple

# import path 
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from utils.sequence_utils import translate_one_hot_to_string,generate_random_mutant
from utils.sequence_utils import translate_string_to_one_hot, translate_one_hot_to_string
from models.Theoretical_models import *
from models.Noise_wrapper import *
from utils.landscape_utils import *
from models.RNA_landscapes import *
from models.Multi_dimensional_model import *

import tensorflow as tf
from tf_agents.drivers import dynamic_step_driver
from tf_agents.metrics import tf_metrics
from tf_agents.agents import tf_agent
from tf_agents.policies import random_tf_policy
from tf_agents.agents.ppo import ppo_policy, ppo_agent, ppo_utils
from tf_agents.environments import py_environment, tf_py_environment
from tf_agents.environments.utils import validate_py_environment
from tf_agents.drivers import dynamic_episode_driver
from tf_agents.networks import network, normal_projection_network
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.utils import common
from tf_agents.trajectories import time_step as ts
from tf_agents.specs import array_spec

# Setup

In [None]:
# --- Experiment configuration -------------------------------------------------
RAA = "UGCA"  # alphabet
alphabet_len = len(RAA)
length = 20
noise_alpha = 1
generations = 10
experiment_batch_size = 100

# One random sequence over the alphabet becomes `wt`; the RNA landscape is built
# from it and then wrapped by the noise model.
wt = generate_random_sequences(length, 1, alphabet=RAA)[0]
landscape = RNA_landscape(wt)
noisy_landscape = Noise_wrapper(
    landscape,
    noise_alpha=noise_alpha,
    always_costly=True,
)

# Draw ten times more mutants of `wt` than needed, de-duplicate them through a
# set, and keep the first `experiment_batch_size` entries of the resulting list.
initial_genotypes = list(
    set(
        [wt]
        + [generate_random_mutant(wt, 0.05, RAA)
           for _ in range(experiment_batch_size * 10)]
    )
)[:experiment_batch_size]

# Start the wrapper from a clean slate, measure the initial genotypes, and
# switch both mode flags off.
noisy_landscape.reset()
noisy_landscape.measure_true_landscape(initial_genotypes)
noisy_landscape.natural_mode = False
noisy_landscape.local_mode = False
# Final expression of the cell: its value is shown as the cell output.
noisy_landscape.cost

In [None]:
class FitnessLandscapeEnvironment(py_environment.PyEnvironment):
    """Sequence-design environment: each action sets one base at one position.

    Observations are the current sequence as an int32 one-hot array of shape
    `(alphabet_len, seq_len)`.  Actions are a single integer in
    `[0, alphabet_len * seq_len - 1]`; action `a` selects base index
    `a // seq_len` and position `a % seq_len`.

    A step terminates the episode when
      * `max_episodes` steps have already been taken since the last reset
        (reward 0),
      * the chosen base is already present at the chosen position (reward -1), or
      * the resulting sequence is already recorded in `seen_sequences`
        (reward -1).
    Otherwise the reward is 100 times `landscape.get_fitness` of the new sequence.

    `seen_sequences` is not cleared by `_reset`, so it accumulates over the
    whole lifetime of the environment.

    Args:
        alphabet: string of symbols the sequences are made of.
        seq_len: length of every sequence handled by the environment.
        landscape: object exposing `get_fitness(sequence_string)`; a deep copy
            is stored so the caller's instance is not shared.
        max_episodes: number of steps allowed per episode (the value the
            per-episode step counter is compared against).
        start_sequence: optional sequence string used as the state before the
            first reset. When omitted a random sequence is drawn.
        verbose: when True, print every action passed to `_step`.
    """

    def __init__(self, alphabet, seq_len,
                 landscape, max_episodes,
                 start_sequence=None, verbose=False):
        # NOTE(review): the base-class initializer is not invoked here (as in the
        # original); confirm whether the installed TF-Agents version needs
        # `super().__init__()`.
        self.alphabet = alphabet
        self.alphabet_len = len(alphabet)
        # should we deepcopy here?
        self.landscape = copy.deepcopy(landscape)
        self.seq_len = seq_len
        self.verbose = verbose

        # we really shouldn't have ints, since it doesn't make
        # sense to say "28" and "29" should be closer than
        # "28" and "40", but I'm just trying to get anything
        # to work...
        self._action_spec = array_spec.BoundedArraySpec(
            shape=(),
            dtype=np.int32,
            minimum=0, maximum=self.alphabet_len*self.seq_len-1,
            name='action_x'
        )
        self._observation_spec = array_spec.BoundedArraySpec(
            shape=(self.alphabet_len, self.seq_len),
            dtype=np.int32,
            minimum=0, maximum=1,
            name='observation'
        )
        self._time_step_spec = ts.time_step_spec(
            self._observation_spec)

        # Initial state: built from the environment's own alphabet and length
        # rather than from notebook-level globals, so it always matches the spec.
        if start_sequence is None:
            self._state = self._random_one_hot_state()
        else:
            self._state = translate_string_to_one_hot(start_sequence,
                                                      self.alphabet)

        # RL housekeeping
        self._episode_ended = False
        self.counter = 0
        self.max_episodes = max_episodes
        self.seen_sequences = {}

    def _random_one_hot_state(self):
        """Return a random sequence of this environment's length, one-hot encoded."""
        sequence = generate_random_sequences(self.seq_len, 1,
                                             alphabet=self.alphabet)[0]
        return translate_string_to_one_hot(sequence, self.alphabet)

    def _current_observation(self):
        """Return the current state as an int32 array matching the observation spec."""
        return np.array(self._state, dtype=np.int32)

    def _reset(self):
        """Start a new episode from a freshly drawn random sequence."""
        self.counter = 0
        # there's no reason why we should expect the model to do different, random
        # moves for the same starting point... so start at a different one each time.
        self._state = self._random_one_hot_state()
        self._episode_ended = False
        return ts.restart(self._current_observation())

    # spec housekeeping
    def time_step_spec(self):
        return self._time_step_spec

    def action_spec(self):
        return self._action_spec

    def observation_spec(self):
        return self._observation_spec

    def _step(self, action):
        """Apply one single-site edit and return the resulting time step."""
        # Step budget for this episode is exhausted.
        if self.counter >= self.max_episodes:
            self._episode_ended = True
            return ts.termination(self._current_observation(), reward=0)
        self.counter += 1

        if self.verbose:
            print("action", action)

        # derive what base is modified in which position
        base, pos = action//self.seq_len, action%self.seq_len

        # if we're trying to make a no-op this is bad
        if self._state[base, pos] == 1:
            self._episode_ended = True
            return ts.termination(self._current_observation(), reward=-1)

        action_one_hot = np.zeros((self.alphabet_len,
                                   self.seq_len))
        action_one_hot[base, pos] = 1
        self._state = construct_mutant_from_sample(action_one_hot,
                                                   self._state)
        state_string = translate_one_hot_to_string(self._state,
                                                   self.alphabet)

        # if we've seen the new state string before, end the episode
        if state_string in self.seen_sequences:
            # Keep the flag consistent with the other terminating branches.
            self._episode_ended = True
            return ts.termination(self._current_observation(), reward=-1)
        self.seen_sequences[state_string] = 1

        reward = self.landscape.get_fitness(state_string)
        return ts.transition(self._current_observation(),
                             reward=reward*100)

In [None]:
# `max_episodes` is the bound the environment's `_step` compares its
# per-episode step counter against.
max_episodes = 10**6
fle = FitnessLandscapeEnvironment(
    alphabet=RAA,
    seq_len=length,
    landscape=landscape,
    max_episodes=max_episodes,
)
# Run one episode through the TF-Agents validator before wrapping the
# environment for use with TF drivers and agents.
validate_py_environment(fle, episodes=1)
tf_env = tf_py_environment.TFPyEnvironment(fle)

In [None]:
# Sequence dimensions, restated for the cells below.
alphabet_len = len(RAA)
seq_len = length

# Specs exposed by the TF-wrapped environment.
time_step_spec = tf_env.time_step_spec()
observation_spec = tf_env.observation_spec()
action_spec = tf_env.action_spec()

In [None]:
# # run random agent for testing purposes only 
# random_policy = random_tf_policy.RandomTFPolicy(tf_env.time_step_spec(), tf_env.action_spec())
# random_collect_policy = random_tf_policy.RandomTFPolicy(tf_env.time_step_spec(), tf_env.action_spec())
# random_agent = tf_agent.TFAgent(
#     tf_env.time_step_spec(),
#     tf_env.action_spec(),
#     random_policy,
#     random_collect_policy,
#     None
# )

# collect_driver = dynamic_step_driver.DynamicStepDriver(
#     tf_env,
#     random_agent.collect_policy,
#     num_steps=100
# )

# collect_driver.run()

In [None]:
from tf_agents.networks import actor_distribution_network, value_network

# Policy network: one fully connected hidden layer of 128 units.
actor_net = actor_distribution_network.ActorDistributionNetwork(
    tf_env.observation_spec(), tf_env.action_spec(), fc_layer_params=(128,))

# Value network: two fully connected hidden layers of 40 and 5 units.
value_net = value_network.ValueNetwork(
    tf_env.observation_spec(), fc_layer_params=(40, 5))

optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=1e-5)

# PPO agent tying the two networks and the optimizer to the environment specs.
agent = ppo_agent.PPOAgent(
    tf_env.time_step_spec(), tf_env.action_spec(), optimizer,
    actor_net=actor_net, value_net=value_net)

In [None]:
# Buffer for collected experience; it is laid out after the agent's
# collect-data spec and holds at most 100 entries per batch row.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    agent.collect_data_spec, batch_size=tf_env.batch_size, max_length=100)

def collect_training_data(num_steps=10):
    """Run the agent's collect policy in `tf_env` and record the experience.

    A new `DynamicStepDriver` is created on every call; each collected batch is
    passed to `replay_buffer.add_batch`.

    Args:
        num_steps: number of environment steps the driver is asked to take.
            Defaults to 10, the value previously hard-coded here.
    """
    driver = dynamic_step_driver.DynamicStepDriver(
        tf_env,
        agent.collect_policy,
        observers=[replay_buffer.add_batch],
        num_steps=num_steps)
    driver.run()
    
def train_agent(num_iterations=10, sample_batch_size=100, num_steps=2):
    """Train `agent` on batches sampled from `replay_buffer`.

    Args:
        num_iterations: number of sampled batches to train on (default 10).
        sample_batch_size: batch size requested from the replay buffer
            dataset (default 100).
        num_steps: number of consecutive steps per sampled item (default 2).

    Returns:
        The loss of the last training call, as returned by
        `loss_info.loss.numpy()`.

    Raises:
        ValueError: if `num_iterations` is smaller than 1, because there would
            be no loss to report.
    """
    if num_iterations < 1:
        raise ValueError("num_iterations must be at least 1")

    dataset = replay_buffer.as_dataset(
        sample_batch_size=sample_batch_size,
        num_steps=num_steps)
    batches = iter(dataset)

    loss_info = None
    for _ in range(num_iterations):
        # Each dataset element is a pair; only the experience part is used.
        experience, _buffer_info = next(batches)
        loss_info = agent.train(experience=experience)

    # Convert once and reuse for both the log line and the return value.
    loss_value = loss_info.loss.numpy()
    print('Training loss: ', loss_value)
    return loss_value

In [None]:
from tf_agents.metrics import tf_py_metric
from tf_agents.metrics import py_metric
from tf_agents.drivers import py_driver
from tf_agents.drivers import dynamic_episode_driver

class MaxEpisodeScoreMetric(py_metric.PyStepMetric):
    """Tracks the best per-episode score seen since the last `reset`.

    Rewards and discounts are accumulated step by step. When a trajectory step
    reports `is_last()`, every reward of the episode is multiplied by the
    discount recorded on the *previous* step (1.0 for the first step), the
    products are summed, and the running maximum is updated with that sum.

    Args:
        name: metric name passed to the base class.
        verbose: when True, print the rewards, the shifted discounts and the
            resulting score at the end of every episode.
    """

    def __init__(self, name='MaxEpisodeScoreMetric', verbose=False):
        # Name this class in super() so no initializer in the MRO is skipped.
        super(MaxEpisodeScoreMetric, self).__init__(name)
        self.verbose = verbose
        self.reset()

    def reset(self):
        """Forget the current episode's buffers and the best score so far."""
        self.rewards = []
        self.discounts = []
        self.max_discounted_reward = None

    def call(self, trajectory):
        """Accumulate one trajectory step; update the maximum at episode end."""
        self.rewards.extend(trajectory.reward)
        self.discounts.extend(trajectory.discount)

        if trajectory.is_last():
            # A step's reward is weighted by the discount of the step before it,
            # so shift the discounts right by one and start with 1.0 ...
            adjusted_discounts = [1.0] + self.discounts
            # ... and drop the last discount, which has no following reward.
            adjusted_discounts = adjusted_discounts[:-1]
            discounted_reward = np.sum(np.multiply(self.rewards, adjusted_discounts))
            if self.verbose:
                print(self.rewards, adjusted_discounts, discounted_reward)

            if (self.max_discounted_reward is None
                    or discounted_reward > self.max_discounted_reward):
                self.max_discounted_reward = discounted_reward

            # Start the next episode with empty buffers.
            self.rewards = []
            self.discounts = []

    def result(self):
        """Return the best episode score, or None if no episode has finished."""
        return self.max_discounted_reward
    
class TFMaxEpisodeScoreMetric(tf_py_metric.TFPyMetric):
    """TF wrapper that exposes a fresh `MaxEpisodeScoreMetric` as a TF metric."""

    def __init__(self, name='MaxEpisodeScoreMetric', dtype=tf.float32):
        # The wrapped metric is always built with its default arguments.
        wrapped_metric = MaxEpisodeScoreMetric()
        super().__init__(py_metric=wrapped_metric, name=name, dtype=dtype)

def evaluate_agent(policy=None, num_episodes=100):
    """Run whole episodes in `tf_env` and report the best episode score.

    Args:
        policy: policy to evaluate. Defaults to `agent.policy`.
        num_episodes: number of episodes the driver is asked to run
            (default 100).

    Returns:
        The value of a fresh `TFMaxEpisodeScoreMetric` after the run, converted
        with `.numpy()`.
    """
    if policy is None:
        policy = agent.policy

    max_score = TFMaxEpisodeScoreMetric()
    driver = dynamic_episode_driver.DynamicEpisodeDriver(
        tf_env, policy, [max_score], num_episodes=num_episodes)
    driver.run()

    # Read the metric once and reuse it for the log line and the return value.
    best_score = max_score.result().numpy()
    print('Max test score:', best_score)
    return best_score

In [None]:
# Smoke test: collect one round of experience and build a dataset over the
# replay buffer. `train_agent` builds its own dataset, and no later cell in
# this notebook section reads `dataset`.
collect_training_data()
# train_agent()
dataset = replay_buffer.as_dataset(
        sample_batch_size=1000,
        num_steps=10)

In [None]:
# Training loop: each of the 20 rounds collects fresh experience, trains the
# agent on samples from the replay buffer, then evaluates the current policy.
# The values returned by `train_agent` and `evaluate_agent` are not stored.
for i in range(20):
    print('Step ', i)
    collect_training_data()
    train_agent()
    evaluate_agent()

In [None]:
# See how the agent performs: roll out 50 episodes by hand and print every
# action and reward along the way.
# NOTE(review): actions come from `agent.collect_policy`, whereas
# `evaluate_agent` uses `agent.policy` -- confirm which one is intended here.
# `time_step` is assigned but not read again in this cell.
time_step = tf_env.reset()
for _ in range(50):
    # Every episode starts with an explicit reset of the environment.
    tf_env.reset()
    episode_reward = 0
    episode_steps = 0
    # Step until the environment reports the last time step of the episode.
    while not tf_env.current_time_step().is_last():
        action = agent.collect_policy.action(tf_env.current_time_step()).action
        print('predicted action', action)
        next_time_step = tf_env.step(action)
        episode_steps += 1
        print("Reward", next_time_step.reward.numpy())
        # Adds the value of `.numpy()` on the reward to the running total.
        episode_reward += next_time_step.reward.numpy()
    print("Steps:", episode_steps, "Reward:", episode_reward)

In [None]:
# Baseline: score a uniformly random policy over 1000 episodes with the same
# max-episode-score metric that is used for the trained agent.
tf_policy = random_tf_policy.RandomTFPolicy(
    action_spec=tf_env.action_spec(),
    time_step_spec=tf_env.time_step_spec(),
)

max_score = TFMaxEpisodeScoreMetric()
observers = [max_score]

driver = dynamic_episode_driver.DynamicEpisodeDriver(
    tf_env,
    tf_policy,
    observers,
    num_episodes=1000,
)
final_time_step, policy_state = driver.run()

print('Max score:', max_score.result().numpy())