In [812]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from collections import defaultdict
import random
import time
import tqdm
import pickle
from sklearn.model_selection import train_test_split

In [492]:
data = pd.read_csv('data/sample_df.csv')
sequence_df = pd.read_csv('data/preprocessed_df.csv')
embeddings_df = pd.read_csv('data/embeddings.csv')

In [518]:
def file_convert(sequence_data):
    """Seperate ratings and items"""
    for col in ['state', 'next_state', 'action_reward']:
        sequence_data[col] = [np.array([[k for k in ee.split('&')] for ee in e.split('|')]) for e in sequence_data[col]]
    for col in ['state', 'next_state']:
        sequence_data[col] = [np.array([e[0] for e in l]) for l in sequence_data[col]]

    sequence_data['action'] = [[e[0] for e in l] for l in sequence_data['action_reward']]
    sequence_data['reward'] = [tuple(e[1] for e in l) for l in sequence_data['action_reward']]
    sequence_data.drop(columns=['action_reward'], inplace=True)
    return sequence_data   
def read_embeddings(embeddings_file):
    '''Load embeddings (a vector for each item).''' 
    return np.array([[np.float64(k) for k in e.split('|')]
                   for e in embeddings_file['embedding']])
def create_item_mappings(embeddings_df):
    """since items are strings, map them with integer index"""
    item_mappings_dict = {}
    for _,row in embeddings_df.iterrows():
        item_mappings_dict[int(_)] = row['item']
    return item_mappings_dict

In [519]:
"""helper class for reading embeddings"""
class Embeddings:
    def __init__(self, item_embeddings,item_mapping_dict):
        self.item_embeddings = item_embeddings
        self.item_mapping_dict = item_mapping_dict
        self.index_mapping_dict = {}
        for key, val in self.item_mapping_dict.items():
            self.index_mapping_dict[val] = key
  
    def size(self):
        return self.item_embeddings.shape[1]

    def get_embedding_vector(self):
        return self.item_embeddings

    def get_embedding(self, idx):
        if isinstance(idx, str):
            index = index_mapping_dict[idx]
            return self.item_embeddings[index]
        else:
            return self.item_embeddings[idx]

    def embed(self, item_list):
        return np.array([self.get_embedding(idx) for idx,item in enumerate(item_list)])

### Environment/Simulator

In [520]:
import gym
from gym import Env, spaces
from gym.utils import seeding
from gym.envs.registration import register

In [521]:
import sys
sys.modules[__name__]

<module '__main__'>

In [522]:
def register_env():
    register(id="recsys-v0",entry_point = "__main__:recsys") #change module (__main__) when converting to python script

In [551]:
class recsys(Env):
    def __init__(self,data,embeddings,alpha,gamma,fixed_length):
        self.embeddings = embeddings
        self.embedded_data = pd.DataFrame()
        self.embedded_data['state'] = [np.array([embeddings.get_embedding(item_id)
                                                for item_id in row['state']]) for _,row in data.iterrows()]
        self.embedded_data['action'] = [np.array([embeddings.get_embedding(item_id) for item_id in row['action']])
                                       for _,row in data.iterrows()]
        self.embedded_data['reward'] = data['reward']
        self.alpha = alpha
        self.gamma = gamma
        self.fixed_length = fixed_length
        self.current_state = self.reset()
        self.groups = self.get_groups()
    
    def reset(self):
        self.init_state = self.embedded_data['state'].sample(1).values[0]
        return self.init_state
    def step(self,actions):
        '''
        Compute reward and update state.
        Args:
          actions: embedded chosen items.
        Returns:
          cumulated_reward: overall reward.
          current_state: updated state.
        '''
        #compute overall reward according to equation 4 in RL Listwise recommender paper
        simulated_rewards, cumulative_rewards = self.simulate_rewards(self.current_state.reshape((1, -1)), actions.reshape((1, -1)))
        #Simulator memory from algorithm 1
        for k in range(len(simulated_rewards)):
            if float(simulated_rewards[k]) > 0.0: #if positive reward then append action to the end of current state
                self.current_state = np.append(self.current_state,[actions[k]],axis = 0)
                if self.fixed_length:
                    #remove the first item from current_state to keep the simulator memory constant
                    self.current_state = np.delete(self.current_state,0,axis = 0)
        return cumulative_rewards, self.current_state,False,{}
       
    def get_groups(self):
        """calculate average state action value for each group in dataframe rewards (eqn 3)"""
        groups = []
        for rewards,group in self.embedded_data.groupby(['reward']):
            size = group.shape[0]
            states = np.array(list(group['state'].values))
            actions = np.array(list(group['action'].values))
            groups.append({
                'size':size, #Nx
                'rewards': rewards, # U_x (combination of rewards)
                'average state': (np.sum(states / np.linalg.norm(states, 2, axis=1)[:, np.newaxis], axis=0) / size).reshape((1, -1)), # s_x^-
                'average action': (np.sum(actions / np.linalg.norm(actions, 2, axis=1)[:, np.newaxis], axis=0) / size).reshape((1, -1)) # a_x^-
            })                
        return groups
    
    def simulate_rewards(self,current_state,action): 
        #we'll assume only one reward type which is grouped cosine according to the RL paper
        '''
        Calculate simulated rewards.
        Args:
          current_state: history, list of embedded items.
          action: embedded chosen item.
        Returns:
          returned_rewards: argmax of probable rewards
          cumulated_reward: probability weighted rewards.
        '''
        def cosine_state_action(s_t, a_t, s_i, a_i):
            #Calculate cosine similarity between (state,action) pair
            cosine_state = np.dot(s_t, s_i.T) / (np.linalg.norm(s_t, 2) * np.linalg.norm(s_i, 2))
            cosine_action = np.dot(a_t, a_i.T) / (np.linalg.norm(a_t, 2) * np.linalg.norm(a_i, 2))
            return (self.alpha * cosine_state + (1 - self.alpha) * cosine_action).reshape((1,))
        
        #Calculate simulated rewards by grouped cosine (equation 1 and 3)
        probabilities = [cosine_state_action(current_state, action, g['average state'], g['average action']) 
                         for g in self.groups]
        #normalize probabilities to 1
        probabilities = np.array(probabilities)/sum(probabilities)
        returned_rewards = self.groups[np.argmax(probabilities)]['rewards']
        def overall_reward(rewards,gamma):
            return np.sum([(gamma**k) * float(reward) for k, reward in enumerate(rewards)])
        # Get probability weighted cumulated reward
        cumulated_reward = np.sum([p * overall_reward(g['rewards'], self.gamma) 
                                   for p, g in zip(probabilities, self.groups)])
        
        return returned_rewards, cumulated_reward
        
    
    

In [525]:
#Further steps -> coding actor and critic network, replay memory, train functions and evaluations

### Policy function approximator --> Actor Network

In [603]:
class Actor():
    """Policy Function Approximator"""
    def __init__(self,session,state_space_size,action_space_size,batch_size,embedding_size,\
                 tau,actor_learning_rate,action_len=1,history_len=10,scope='actor'):
        self.session = session
        self.state_space_size = state_space_size
        self.action_space_size = action_space_size
        self.batch_size = batch_size
        self.embedding_size = embedding_size
        self.tau = tau
        self.actor_lr = actor_learning_rate
        self.action_len = action_len
        self.history_len = history_len
        self.scope = scope
        with tf.compat.v1.variable_scope(self.scope):
            #Build estimator actor network
            self.action_weights,self.state,self.sequence_length = self.build_net('estimate_actor')
            self.network_params = tf.compat.v1.trainable_variables()
            #Build target network
            self.target_action_weights,self.target_state,self.target_sequence_len = self.build_net('target_actor')
            #get network parameters for target_actor network
            self.target_network_params =tf.compat.v1.trainable_variables()[len(self.network_params):]

            # Initialize target network weights with network weights (θ^π′ ← θ^π)
            self.init_target_network_params = [self.target_network_params[i].assign(self.network_params[i])
            for i in range(len(self.target_network_params))]

            # Update target network weights (θ^π′ ← τθ^π + (1 − τ)θ^π′)
            self.update_target_network_params = [self.target_network_params[i].assign(
                tf.multiply(self.tau, self.network_params[i]) +
                tf.multiply(1 - self.tau, self.target_network_params[i])) 
                                                for i in range(len(self.target_network_params))]

            self.action_gradient = tf.compat.v1.placeholder(tf.float32,[None,self.action_space_size])
            gradients = tf.gradients(tf.reshape(self.action_weights,[self.batch_size,self.action_space_size]),\
                                     self.network_params,self.action_gradient)
            self.params_gradient = list(map(
                lambda x: tf.compat.v1.div(x,self.batch_size * self.action_space_size),gradients
            ))
            # Compute ∇_a.Q(s, a|θ^µ).∇_θ^π.f_θ^π(s)
            self.optimizer = tf.compat.v1.train.AdamOptimizer(self.actor_lr).apply_gradients(
                zip(self.params_gradient, self.network_params)
            )
            self.num_trainable_vars = len(self.network_params) + len(self.target_network_params)
    

    def build_net(self,scope):
        """Build Tensorflow Graph"""
        def gather_last_output(data, seq_lens):
            def cli_value(x, v):
                y = tf.constant(v, shape=x.get_shape(), dtype=tf.int64)
                x = tf.cast(x, tf.int64)
                return tf.where(tf.greater(x, y), x, y)
            batch_range = tf.range(tf.cast(tf.shape(data)[0], dtype=tf.int64), dtype=tf.int64)
            tmp_end = tf.map_fn(lambda x: cli_value(x, 0), seq_lens - 1, dtype=tf.int64)
            indices = tf.stack([batch_range, tmp_end], axis=1)
            return tf.gather_nd(data, indices)  
        
        with tf.compat.v1.variable_scope(scope):
            state = tf.compat.v1.placeholder(tf.float32,[None,self.state_space_size],"state")
            state_ = tf.reshape(state,[-1,self.history_len,self.embedding_size])
            sequence_length = tf.compat.v1.placeholder(tf.int32,[None],'sequence_length')
            cell = tf.compat.v1.nn.rnn_cell.GRUCell(self.embedding_size,
                                                   activation = tf.nn.relu,
                                                   kernel_initializer = tf.initializers.random_normal(),
                                                   bias_initializer = tf.zeros_initializer())
            outputs,_ = tf.compat.v1.nn.dynamic_rnn(cell,state_,dtype=tf.float32,sequence_length=sequence_length)
            last_output = gather_last_output(outputs,sequence_length)
            x = tf.keras.layers.Dense(self.action_len * self.embedding_size)(last_output)
            action_weights = tf.reshape(x,[-1,self.action_len,self.embedding_size])
        return action_weights, state, sequence_length
    def train(self,state,sequence_length,action_gradients):
        """
      Compute ∇_a.Q(s, a|θ^µ).∇_θ^π.f_θ^π(s)
        """
        self.session.run(self.optimizer,feed_dict={
            self.state:state,
            self.sequence_length:sequence_length,
            self.action_gradient:action_gradients
        })
        
    def predict(self,state,sequence_length):
        return self.session.run(self.action_weights,feed_dict={
            self.state:state,
            self.sequence_length:sequence_length
        })
    def predict_target(self,state,sequence_length):
        return self.session.run(self.target_action_weights,feed_dict={
            self.target_state: state,
            self.target_sequence_len:sequence_length
        })
    def init_target_network(self):
        self.session.run(self.init_target_network_params)
    def update_target_network(self):
        self.session.run(self.update_target_network_params)
    def get_num_trainable_vars(self):
        return self.num_trainable_vars
    
    def get_recommendation(self,action_len,noisy_state,embeddings,target=False):
        """
        Algorithm 2 from Listwise Recommendation Paper
        Args:
        action_len: length of recommendation list (K)
        noisy_state: environment state with noise
        embeddings: Embeddings class object
        target: boolean to indicate use of Actor network or Target Network
        
        Returns:
        Recommendation: list of embedded item as future actions
        """
        def get_score(weights,embedding,batch_size):
            return np.dot(weights,embedding.T)
        
        batch_size = noisy_state.shape[0]
        method = self.predict_target if target else self.predict
        weights = method(noisy_state,[action_len]*batch_size)
        
        scores = np.array([[[get_score(weights[i][j],embedding,batch_size)
                            for embedding in embeddings.get_embedding_vector()]
                           for j in range(action_len)]
                          for i in range(batch_size)])
        return np.array([[embeddings.get_embedding(np.argmax(scores[i][j]))
                         for j in range(action_len)]
                        for i in range(batch_size)])

    

### Value function approximator --> Critic Network

In [604]:
class Critic():
    """Value Function Approximator"""
    def __init__(self,session,state_space_size,action_space_size,embedding_size,\
                 tau,critic_learning_rate,history_len=10,scope='critic'):
        self.session = session
        self.state_space_size = state_space_size
        self.action_space_size = action_space_size
        self.embedding_size = embedding_size
        self.tau = tau
        self.learning_rate = critic_learning_rate
        self.history_len = history_len
        self.scope = scope
        with tf.compat.v1.variable_scope(self.scope):
            #Build critic Network
            self.critic_Q_value,self.state,self.action,self.sequence_length = self.build_net('estimator_critic')
            self.network_params = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES,\
                                                              scope='estimator_critic')
            
            #Build target Critic Network
            self.target_Q_value,self.target_state,self.target_action,\
            self.target_sequence_length = self.build_net('target_critic')
            self.target_network_params = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES,\
                                                           scope='target_critic')
            
            #Initialize target network weights with network weights
            self.init_target_network_params = [self.target_network_params[i].assign(self.network_params[i]) 
                                               for i in range(len(self.target_network_params))]
            #Update Target network weights
            self.update_target_network_params = [self.target_network_params[i].assign(
                tf.multiply(self.tau, self.network_params[i]) + 
                tf.multiply(1 - self.tau, self.target_network_params[i]))
                for i in range(len(self.target_network_params))]
            
            #Minimize MSE between critic's Q values and target critic's output Q values
            self.expected_reward = tf.compat.v1.placeholder(tf.float32,[None,1])
            self.loss = tf.reduce_mean(tf.math.squared_difference(self.expected_reward,self.critic_Q_value))
            self.optimizer = tf.compat.v1.train.AdamOptimizer(self.learning_rate).minimize(self.loss)
            
            #Compute action gradients  ∇_a.Q(s, a|θ^µ)
            self.action_gradients = tf.gradients(self.critic_Q_value,self.action)
    
    def build_net(self,scope):
        # Inputs: current state, current action
        # Outputs: predicted Q-value
        
        def gather_last_output(data, seq_lens):
            def cli_value(x, v):
                y = tf.constant(v, shape=x.get_shape(), dtype=tf.int64)
                return tf.where(tf.greater(x, y), x, y)
            this_range = tf.range(tf.cast(tf.shape(seq_lens)[0], dtype=tf.int64), dtype=tf.int64)
            tmp_end = tf.map_fn(lambda x: cli_value(x, 0), seq_lens - 1, dtype=tf.int64)
            indices = tf.stack([this_range, tmp_end], axis=1)
            return tf.gather_nd(data, indices)

        with tf.compat.v1.variable_scope(scope):
            state = tf.compat.v1.placeholder(tf.float32,[None,self.state_space_size],'state')
            state_ = tf.reshape(state, [-1, self.history_len, self.embedding_size])
            action = tf.compat.v1.placeholder(tf.float32, [None, self.action_space_size], 'action')
            sequence_length = tf.compat.v1.placeholder(tf.int64, [None], name='critic_sequence_length')
            cell = tf.compat.v1.nn.rnn_cell.GRUCell(self.history_len,
                                        activation=tf.nn.relu,
                                        kernel_initializer=tf.initializers.random_normal(),
                                        bias_initializer=tf.zeros_initializer())
            predicted_state, _ = tf.compat.v1.nn.dynamic_rnn(cell, state_, dtype=tf.float32, sequence_length=sequence_length)
            predicted_state = gather_last_output(predicted_state, sequence_length)
            
            inputs = tf.concat([predicted_state, action], axis=-1)
            layer1 = tf.keras.layers.Dense(32, activation=tf.nn.relu)(inputs)
            layer2 = tf.keras.layers.Dense(16, activation=tf.nn.relu)(layer1)
            critic_Q_value = tf.keras.layers.Dense(1)(layer2)
        return critic_Q_value, state, action, sequence_length            
        
    def train(self,state,action,sequence_length,expected_reward):
        """
        MINIMIZE MSE between target critic Q val and expected reward 
        """
        return self.session.run([self.critic_Q_value,self.loss,self.optimizer],
                            feed_dict={
                                self.state:state,
                                self.action:action,
                                self.sequence_length:sequence_length,
                                self.expected_reward:expected_reward
                            })
    def predict(self,state,action,sequence_length):
        """
        Return critic's predicted Q val
        """
        return self.session.run(self.critic_Q_value,
                               feed_dict={
                                   self.state:state,
                                   self.action:action,
                                   self.sequence_length: sequence_length
                               })
    def predict_target(self, state, action, sequence_length):
        """ 
        Returns target Critic's predicted Q-value. 
        """
        return self.session.run(self.target_Q_value,
                             feed_dict={
                                 self.target_state: state,
                                 self.target_action: action,
                                 self.target_sequence_length: sequence_length
                             })
    def get_action_gradients(self, state, action, sequence_length):
        """
        Returns ∇_a.Q(s,a|θ^µ)
        """
        return np.array(self.session.run(self.action_gradients,
                             feed_dict={
                                 self.state: state,
                                 self.action: action,
                                 self.sequence_length: sequence_length
                             })[0])
    
    def init_target_network(self):
        self.session.run(self.init_target_network_params)
    
    def update_target_network(self):
        self.session.run(self.update_target_network_params)
            
            

### Replay

In [605]:
class ReplayMemory():
    """
    Replay Memory D in Listwise Recommendation Paper
    """
    def __init__(self,buffer_size):
        self.buffer_size = buffer_size
        self.buffer = []
    def add(self,state,action,reward,n_state):
        self.buffer.append([state,action,reward,n_state])
        if len(self.buffer) > self.buffer_size:
            self.buffer.pop(0)
    def size(self):
        return len(self.buffer)
    def sample_batch(self,batch_size):
        return random.sample(self.buffer,batch_size)
    

In [606]:
def experience_replay(replay_memory,batch_size,actor,critic,\
                      embeddings,action_len,state_space_size,action_space_size,gamma):
    """
    Experience Replay
Args:
    replay_memory: replay_memory class object
    batch_size: sample_size
    actor: Actor Network
    critic: Critic Network
    embeddings: Embeddings class object
    state_space_size: dimension of states
    action_space_size: dimension of actions
Returns:
    Best Q-value, loss of critic network
    """
    samples = replay_memory.sample_batch(batch_size)
    states = np.array([s[0] for s in samples])
    actions = np.array([s[1] for s in samples])
    rewards = np.array([s[2] for s in samples])
    n_states = np.array([s[3] for s in samples]).reshape(-1, state_space_size)

  # '23: Generate a′ by target Actor network according to Algorithm 2'
    n_actions = actor.get_recommendation(action_len, states, embeddings, target=True).reshape(-1, action_space_size)

  # Calculate predicted Q′(s′, a′|θ^µ′) value
    target_Q_value = critic.predict_target(n_states, n_actions, [action_len] * batch_size)

  # '24: Set y = r + γQ′(s′, a′|θ^µ′)'
    expected_rewards = rewards + gamma * target_Q_value
  
  # '25: Update Critic by minimizing (y − Q(s, a|θ^µ))²'
    critic_Q_value, critic_loss, _ = critic.train(states, actions, [action_len] * batch_size, expected_rewards)
  
  # '26: Update the Actor using the sampled policy gradient'
    action_gradients = critic.get_action_gradients(states, n_actions, [action_len] * batch_size)
    actor.train(states, [action_len] * batch_size, action_gradients)

  # '27: Update the Critic target networks'
    critic.update_target_network()

  # '28: Update the Actor target network'
    actor.update_target_network()

    return np.amax(critic_Q_value), critic_loss

### NOISE

In [607]:
class OrnsteinUhlenbeckNoise:
    """Noise for Actor Predictions"""
    def __init__(self,action_space_size,mu=0,theta=0.5,sigma=0.2):
        self.action_space_size = action_space_size
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.state = np.ones(self.action_space_size) * self.mu
    def get(self):
        self.state += self.theta * (self.mu - self.state) + self.sigma*np.random.rand(self.action_space_size)
        return self.state

### Training

In [834]:
def train_network(session,env,actor,critic,embeddings,history_len,action_len,buffer_size,batch_size,\
         gamma,nb_episodes,nb_rounds,filename_summary):
    """Algorithm 3 in Listwise Recommendation Paper"""
    session.run(tf.compat.v1.global_variables_initializer())    

    actor.init_target_network()
    critic.init_target_network()
    
    replay_memory = ReplayMemory(buffer_size)
    replay = False
    
    start_time = time.time()
    summary_dict = {}
    for i_session in range(nb_episodes):
        session_reward = 0
        session_Q_value = 0
        session_critic_loss = 0
        summary_dict[i_session] = {}
        summary_dict[i_session]["session_reward"] = []
        summary_dict[i_session]["session_Q_value"] = []
        summary_dict[i_session]["session_critic_loss"] = []
        states = env.reset()#Initialize state s0 from previous sessions
        if (i_session + 1) % 10 == 0: #Update average parameters every 10 episodes
            env.groups = env.get_groups()
        
        
        for t in tqdm.trange(nb_rounds):
            #select actions according to get recommendation list
            exploration_noise = OrnsteinUhlenbeckNoise(len(states) * embeddings.size())
            actions = actor.get_recommendation(action_len,\
                                               states.reshape(1,-1) + exploration_noise.get().reshape(1,-1),
                                               embeddings
                                              ).reshape(action_len,embeddings.size())
            rewards,next_states,done,_ = env.step(actions)
            
            replay_memory.add(states.reshape(history_len * embeddings.size()),
                             actions.reshape(action_len * embeddings.size()),
                             [rewards],
                             next_states.reshape(len(next_states)*embeddings.size()))
            states = next_states
            session_reward += rewards
            
            #parameter update
            if replay_memory.size() >= batch_size:
                replay = True
                replay_Q_value, critic_loss = experience_replay(replay_memory,batch_size,
                                                               actor,critic,embeddings,action_len,\
                                                               history_len * embeddings.size(),
                                                               action_len * embeddings.size(),
                                                               gamma)
                session_Q_value += replay_Q_value
                session_critic_loss += critic_loss
                summary_dict[i_session]["session_Q_value"].append(session_Q_value)
                summary_dict[i_session]["session_critic_loss"].append(session_critic_loss)                
            summary_dict[i_session]["session_reward"].append(session_reward)

        
        
#         print("Session Reward: {}, Session_Q_value: {}, Session_critic_loss: {}".format(session_reward,
#                                                                                        session_Q_value,
#                                                                                        session_critic_loss))

        str_loss = str('Loss=%0.4f' % session_critic_loss)
        print(('Episode %d/%d Reward=%d Time=%ds ' + (str_loss if replay else 'No replay')) % (i_session + 1, nb_episodes, session_reward, time.time() - start_time))
        start_time = time.time()
    with open(filename_summary, 'wb') as f:
        pickle.dump(summary_dict, f)
    tf.compat.v1.train.Saver().save(session,'models.h5',write_meta_graph=False)

In [835]:
#Hyperparams
history_len = 10
action_len = 1
discount_factor = 0.99
actor_lr = 0.0001
critic_lr = 0.001
tau = 0.001
batch_size = 64
nb_episodes = 100
nb_rounds = 50
filename_summary = 'summary.pkl'
alpha = 0.5
gamma = 0.9
buffer_size = 1000000
fixed_len = True
state_space_size = embeddings.size() * history_len
action_space_size = embeddings.size() * action_len
data = file_convert(sequence_df.copy())
train, test = train_test_split(data, test_size=0.25,random_state=0)
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

In [836]:
#Test code
item_map = create_item_mappings(embeddings_df)
embeddings = Embeddings(read_embeddings(embeddings_df),item_map)
register_env()
env = gym.make('recsys-v0',data = data,embeddings=embeddings,alpha=alpha,gamma=gamma,fixed_length=fixed_len)

In [837]:
tf.compat.v1.reset_default_graph()
session = tf.compat.v1.Session()
tf.compat.v1.disable_eager_execution()
# Initialize actor network f_θ^π and critic network Q(s, a|θ^µ) with random weights
actor = Actor(session, state_space_size, action_space_size, batch_size, embeddings.size(),tau,actor_lr,\
              action_len, history_len)
critic = Critic(session, state_space_size, action_space_size, embeddings.size(),\
                tau, critic_lr,history_len)

2021-12-03 15:57:25.642279: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2021-12-03 15:57:25.642371: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [None]:
train_network(session, env, actor, critic, embeddings, history_len, action_len,\
      buffer_size,batch_size,discount_factor,nb_episodes,nb_rounds,filename_summary=filename_summary)

2021-12-03 15:57:27.514391: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-12-03 15:57:27.751654: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
  0%|                                                                                                                                            | 0/50 [00:00<?, ?it/s]2021-12-03 15:57:27.801665: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:02<00:00, 24.17it/s]


Episode 1/100 Reward=150 Time=2s No replay


 26%|██████████████████████████████████                                                                                                 | 13/50 [00:00<00:01, 29.49it/s]2021-12-03 15:57:30.361129: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-12-03 15:57:32.190865: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-12-03 15:57:32.565771: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-12-03 15:57:33.225180: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-12-03 15:57:33.645050: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-12-03 15:57:34.188145: I tensorflow/core/grappler/op

Episode 2/100 Reward=150 Time=101s Loss=161.8500


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [02:11<00:00,  2.62s/it]


Episode 3/100 Reward=149 Time=131s Loss=15.3950


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [02:10<00:00,  2.61s/it]


Episode 4/100 Reward=99 Time=130s Loss=197.7175


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [02:15<00:00,  2.71s/it]


Episode 5/100 Reward=150 Time=135s Loss=325.0724


 88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎               | 44/50 [01:59<00:18,  3.05s/it]

In [790]:
with open('summary.pkl', 'rb') as f:
        loaded_dict = pickle.load(f)