In [None]:
import functools
import os
from absl import app
from absl import flags

import tensorflow as tf  # pylint: disable=g-explicit-tensorflow-version-import
from tf_agents.bandits.agents import dropout_thompson_sampling_agent as dropout_ts_agent
from tf_agents.bandits.agents import lin_ucb_agent
from tf_agents.bandits.agents import linear_thompson_sampling_agent as lin_ts_agent
from tf_agents.bandits.agents import neural_epsilon_greedy_agent as eps_greedy_agent
from tf_agents.bandits.agents.examples.v2 import trainer
from tf_agents.bandits.environments import environment_utilities
#from tf_agents.bandits.environments import movielens_per_arm_py_environment
from tf_agents.bandits.environments import movielens_py_environment
from tf_agents.metrics import tf_metrics
from tf_agents.bandits.metrics import tf_metrics as tf_bandit_metrics
from tf_agents.bandits.networks import global_and_arm_feature_network
from tf_agents.environments import tf_py_environment
from tf_agents.networks import q_network
from tf_agents.drivers import dynamic_step_driver
from tf_agents.eval import metric_utils
from tf_agents.policies import policy_saver
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import time_step as ts

In [None]:
# Create today's log directory (named dd_mm_YYYY under <cwd>/logs) if it does
# not exist yet.
from datetime import date

today = date.today()
# Reuse `today` so the folder name cannot disagree with it across midnight.
fdate = today.strftime('%d_%m_%Y')

root_path = os.getcwd()
log_path = "{}/logs/{}".format(root_path, fdate)
if not os.path.exists(log_path):
    # makedirs also creates the intermediate `logs/` folder; a plain mkdir
    # raises FileNotFoundError when that parent is missing (fresh checkout).
    os.makedirs(log_path, exist_ok=True)
    print("Directory {} Created".format(fdate))
else:
    print("Directory {} already exists".format(fdate))

print("Full path is {}".format(log_path))

In [None]:
# Download the MovieLens dataset from https://grouplens.org/datasets/movielens/
# !wget https://files.grouplens.org/datasets/movielens/ml-25m.zip ./18rl/data/ml-25m.zip

In [None]:
# Build the MovieLens Python environment and wrap it as a TF environment.
NUM_ACTIONS = 20  # forwarded to the environment as `num_movies`
RANK_K = 20  # second positional argument of the environment
BATCH_SIZE = 8  # third positional argument of the environment
data_path = "./dataset/movielens.data"
# Alternative: point at the copy stored in the GCS bucket instead.
# data_path = "gs://ta-reinforecement-learning/dataset/movielens.data"
env = movielens_py_environment.MovieLensPyEnvironment(
    data_path,
    RANK_K,
    BATCH_SIZE,
    num_movies=NUM_ACTIONS,
)
environment = tf_py_environment.TFPyEnvironment(env)

In [None]:
# Hyperparameters for the network and the agent.
EPSILON = 0.05  # passed as `epsilon` to the epsilon-greedy agent
LAYERS = (50, 50, 50)  # passed as `fc_layer_params` to the Q-network
LR = 0.005  # learning rate handed to the Adam optimizer
DROPOUT_RATE = 0.2  # NOTE(review): not referenced by the cells visible here - confirm it is still needed

In [None]:
# Build the Q-network from the environment's observation and action specs;
# the hidden layer sizes come from LAYERS.
network = q_network.QNetwork(
          input_tensor_spec=environment.time_step_spec().observation,
          action_spec=environment.action_spec(),
          fc_layer_params=LAYERS)

# Create a neural epsilon-greedy agent that uses the network above as its
# reward network, an Adam optimizer with learning rate LR, and EPSILON as epsilon.
agent = eps_greedy_agent.NeuralEpsilonGreedyAgent(
  time_step_spec=environment.time_step_spec(),# time-step spec taken from the environment
  action_spec=environment.action_spec(), # action spec taken from the environment
  reward_network=network, # the Q-network built above
  # optimizer=tf.optimizers.Adam(learning_rate=LR),
  optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=LR), # TF1-compat Adam; learning rate comes from LR
  epsilon=EPSILON) # epsilon value comes from EPSILON

In [None]:
# Bind the environment once with functools.partial so the optimal-reward and
# optimal-action helpers can be called without passing it on every invocation.
optimal_reward_fn = functools.partial(
    environment_utilities.compute_optimal_reward_with_movielens_environment,
    environment=environment,
)

optimal_action_fn = functools.partial(
    environment_utilities.compute_optimal_action_with_movielens_environment,
    environment=environment,
)

In [None]:
# Regret metric built from the optimal-reward function, and suboptimal-arms
# metric built from the optimal-action function.
regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward_fn)
suboptimal_arms_metric = tf_bandit_metrics.SuboptimalArmsMetric(optimal_action_fn)

In [None]:
# Environment step counter; used later as the train_step of the summaries.
step_metric = tf_metrics.EnvironmentSteps()

# Metrics that are logged, summarised and checkpointed during training.
metrics = [
    # Episode count (equivalent to the number of steps in a bandit problem).
    tf_metrics.NumberOfEpisodes(),
    # Regret.
    regret_metric,
    # Number of times a suboptimal arm is pulled.
    suboptimal_arms_metric,
    # Average return.
    tf_metrics.AverageReturnMetric(batch_size=environment.batch_size),
]

In [None]:
# Replay buffer for the collect data: BATCH_SIZE parallel entries, each capped
# at STEPS_PER_LOOP items.
STEPS_PER_LOOP = 2
buf = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.policy.trajectory_spec,
    batch_size=BATCH_SIZE,
    max_length=STEPS_PER_LOOP,
)

In [None]:
# Observers handed to the driver: the buffer's add_batch (loads the collected
# data into the replay buffer), the step counter, and then every metric.
replay_observer = [buf.add_batch, step_metric, *metrics]

# Driver that runs the agent's collect policy in the environment for
# STEPS_PER_LOOP * batch_size steps per call.
driver = dynamic_step_driver.DynamicStepDriver(
    env=environment,
    policy=agent.collect_policy,
    num_steps=STEPS_PER_LOOP * environment.batch_size,
    observers=replay_observer,
)


In [None]:
# Names used when building the checkpoint.
AGENT_CHECKPOINT_NAME = 'agent'  # key under which the agent is tracked in the checkpoint
STEP_CHECKPOINT_NAME = 'step'  # key under which the step metric is tracked in the checkpoint
CHECKPOINT_FILE_PREFIX = 'ckpt'  # NOTE(review): not referenced by the cells visible here - confirm it is still needed

In [None]:
def restore_and_get_checkpoint_manager(root_dir, agent, metrics, step_metric):
    """Restores the latest checkpoint in `root_dir`, if any, and returns the manager.

    Args:
        root_dir: Directory the checkpoint manager reads from and writes to.
        agent: Object tracked in the checkpoint under `AGENT_CHECKPOINT_NAME`.
        metrics: Iterable of objects exposing a `name` attribute; each one is
            tracked in the checkpoint under its own name.
        step_metric: Object tracked under `STEP_CHECKPOINT_NAME`; its
            `result()` is printed after a successful restore.

    Returns:
        The `tf.train.CheckpointManager` (keeping at most 5 checkpoints) whose
        `save()` method writes new checkpoints of the tracked objects.
    """
    trackable_objects = {metric.name: metric for metric in metrics}
    trackable_objects[AGENT_CHECKPOINT_NAME] = agent
    trackable_objects[STEP_CHECKPOINT_NAME] = step_metric
    checkpoint = tf.train.Checkpoint(**trackable_objects)
    checkpoint_manager = tf.train.CheckpointManager(checkpoint=checkpoint,
                                                    directory=root_dir,
                                                    max_to_keep=5)
    latest = checkpoint_manager.latest_checkpoint

    if latest is not None:
        # print() does not do logging-style lazy %-substitution, so the values
        # must be formatted into the message explicitly.
        print('Restoring checkpoint from {}.'.format(latest))
        checkpoint.restore(latest)
        print('Successfully restored to step {}.'.format(step_metric.result()))
    else:
        print('Did not find a pre-existing checkpoint. '
              'Starting from scratch.')
    return checkpoint_manager

In [None]:
# Restore any existing checkpoint from the log directory and keep the manager
# around for saving during training.
checkpoint_manager = restore_and_get_checkpoint_manager(
    root_dir=log_path,
    agent=agent,
    metrics=metrics,
    step_metric=step_metric,
)

# Saver for exporting the agent's policy.
saver = policy_saver.PolicySaver(agent.policy)

# Write TensorFlow summaries into the same log directory by default.
summary_writer = tf.summary.create_file_writer(log_path)
summary_writer.set_as_default()

In [None]:
AGENT_ALPHA = 10.0  # NOTE(review): not referenced by the cells visible here - confirm it is still needed
TRAINING_LOOPS = 15000  # number of iterations of the training loop

In [None]:
# # strategy = tf.distribute.MirroredStrategy(devices=["/gpu:0"])
# if tf.config.list_physical_devices('GPU'):
#     strategy = tf.distribute.MirroredStrategy(devices=["/gpu:0"], 
#                                               cross_device_ops=tf.distribute.NcclAllReduce)
#     # strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0")
# else:
#     strategy = tf.distribute.get_strategy()
# print(strategy)      

In [None]:
import warnings

# NOTE: this silences *all* Python warnings for the rest of the kernel session,
# not only those raised inside the training loop.
warnings.filterwarnings('ignore')

# Training loop: collect -> sample -> train -> log -> checkpoint.
for i in range(TRAINING_LOOPS):
    # Run the collect policy; the observers fill `buf` and update the metrics.
    driver.run()

    # Read the collected data back out of the buffer in one deterministic pass.
    dataset = buf.as_dataset(
        sample_batch_size=BATCH_SIZE,
        num_steps=STEPS_PER_LOOP,
        single_deterministic_pass=True)
    experience, unused_info = next(iter(dataset))

    # Kept at notebook scope so the latest loss can be inspected after the run.
    train_loss = agent.train(experience).loss
    # Empty the buffer so the next iteration starts from fresh data only.
    buf.clear()

    metric_utils.log_metrics(metrics)
    for metric in metrics:
        metric.tf_summaries(train_step=step_metric.result())
    checkpoint_manager.save()

    # Lightweight progress indicator: one line every 1000 iterations.
    if i % 1000 == 0:
        print(f"current {i // 1000}")

# Export the trained policy once, tagged with the final environment step count.
saver.save(os.path.join(log_path, 'policy_%d' % step_metric.result()))

In [None]:
# Print the shell command for uploading this run's log directory to TensorBoard.dev.
print(
    f'tensorboard dev upload --logdir {log_path} '
    '--name "(optional) My latest experiment" '
    '--description "(optional) Agent trained"'
)

In [None]:
import numpy as np

# Take element 0 of what the environment's `_observe()` returns and reshape it
# to a single-row array of width RANK_K (previously a hard-coded 20, which
# silently breaks if RANK_K is changed).
# NOTE(review): `_observe` is a private method of the environment - confirm
# there is no public accessor that should be used instead.
feature = np.reshape(environment._observe()[0], (1, RANK_K))
feature.shape

In [None]:
## Inference
# Hand-build a single-element batch TimeStep (FIRST step, zero reward, unit
# discount) around `feature` and query the agent's policy for an action.
# The observation width follows RANK_K instead of a hard-coded 20 so that this
# cell keeps working when the environment's rank is changed.
step = ts.TimeStep(
        tf.constant(
            ts.StepType.FIRST, dtype=tf.int32, shape=[1],
            name='step_type'),
        tf.constant(0.0, dtype=tf.float32, shape=[1], name='reward'),
        tf.constant(1.0, dtype=tf.float32, shape=[1], name='discount'),
        tf.constant(feature,
                    dtype=tf.float64, shape=[1, RANK_K],
                    name='observation'))

agent.policy.action(step).action.numpy()