<a href="https://colab.research.google.com/github/swaroopkasaraneni/DatasciencePython/blob/main/ReinforcementLearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import tensorflow as tf
import gym
import numpy as np
import random

# Define a custom gym environment
class DeliveryEnv(gym.Env):
    def __init__(self, locations, distance_matrix):
        super(DeliveryEnv, self).__init__()
        self.locations = locations
        self.distance_matrix = distance_matrix
        self.num_locations = len(locations)

        # Action and observation spaces
        self.action_space = gym.spaces.Discrete(self.num_locations)  # Choose the next location
        self.observation_space = gym.spaces.Box(
            low=0, high=self.num_locations - 1, shape=(1,), dtype=np.int32
        )

        self.reset()

    def reset(self):
        self.current_location = random.randint(0, self.num_locations - 1)
        self.visited = set()
        self.visited.add(self.current_location)
        self.total_cost = 0
        return np.array([self.current_location], dtype=np.int32)

    def step(self, action):
        if action in self.visited or action == self.current_location:
            reward = -10  # Penalize revisiting or invalid actions
        else:
            cost = self.distance_matrix[self.current_location][action]
            reward = -cost
            self.visited.add(action)
            self.total_cost += cost

        self.current_location = action

        # Check if all locations have been visited
        done = len(self.visited) == self.num_locations
        return np.array([self.current_location], dtype=np.int32), reward, done, {}

    def render(self):
        print(f"Current Location: {self.current_location}, Visited: {self.visited}")

# Example distance matrix
locations = ["A", "B", "C"]
distance_matrix = [
    [0, 2, 5],
    [2, 0, 3],
    [5, 3, 0],

]

env = DeliveryEnv(locations, distance_matrix)

# Hyperparameters
learning_rate = 0.01
gamma = 0.99  # Discount factor

# Neural Network Model
class PolicyGradientModel(tf.keras.Model):
    def __init__(self, num_actions):
        super(PolicyGradientModel, self).__init__()
        self.hidden = tf.keras.layers.Dense(128, activation='relu')
        self.logits = tf.keras.layers.Dense(num_actions)

    def call(self, inputs):
        x = self.hidden(inputs)
        return self.logits(x)

model = PolicyGradientModel(num_actions=env.action_space.n)
optimizer = tf.keras.optimizers.Adam(learning_rate)

# Function to get action probabilities
def get_action_probs(model, state):
    logits = model(tf.convert_to_tensor(state, dtype=tf.float32))
    return tf.nn.softmax(logits)

# Training Loop
def train_model(env, model, episodes=1000):
    all_rewards = []
    for episode in range(episodes):
        state = env.reset()
        states, actions, rewards = [], [], []
        done = False

        while not done:
            state_onehot = tf.one_hot(state, depth=env.action_space.n)
            action_probs = get_action_probs(model, state_onehot)
            action = np.random.choice(env.action_space.n, p=action_probs.numpy()[0])

            next_state, reward, done, _ = env.step(action)

            states.append(state_onehot)
            actions.append(action)
            rewards.append(reward)

            state = next_state

        # Compute discounted rewards
        discounted_rewards = []
        cumulative_reward = 0
        for r in reversed(rewards):
            cumulative_reward = r + gamma * cumulative_reward
            discounted_rewards.insert(0, cumulative_reward)

        # Update model
        with tf.GradientTape() as tape:
            state_tensor = tf.concat(states, axis=0)
            logits = model(state_tensor)
            action_masks = tf.one_hot(actions, env.action_space.n)
            log_probs = tf.reduce_sum(action_masks * tf.nn.log_softmax(logits), axis=1)
            loss = -tf.reduce_mean(log_probs * discounted_rewards)

        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        all_rewards.append(sum(rewards))
        print(f"Episode {episode + 1}: Total Reward: {sum(rewards)}")

    return all_rewards

# Train the policy gradient model
rewards = train_model(env, model, episodes=50)

# Evaluate the trained model
state = env.reset()
done = False
path = [state[0]]

while not done:
    state_onehot = tf.one_hot(state, depth=env.action_space.n)
    action_probs = get_action_probs(model, state_onehot)
    action = np.argmax(action_probs.numpy()[0])
    state, _, done, _ = env.step(action)
    path.append(state[0])

print("Optimal Path:", [locations[i] for i in path])


Episode 1: Total Reward: -65
Episode 2: Total Reward: -18
Episode 3: Total Reward: -25
Episode 4: Total Reward: -127
Episode 5: Total Reward: -28
Episode 6: Total Reward: -5
Episode 7: Total Reward: -15
Episode 8: Total Reward: -18
Episode 9: Total Reward: -8
Episode 10: Total Reward: -48
Episode 11: Total Reward: -45
Episode 12: Total Reward: -27
Episode 13: Total Reward: -27
Episode 14: Total Reward: -35
Episode 15: Total Reward: -38
Episode 16: Total Reward: -38
Episode 17: Total Reward: -35
Episode 18: Total Reward: -68
Episode 19: Total Reward: -78
Episode 20: Total Reward: -5
Episode 21: Total Reward: -98
Episode 22: Total Reward: -8
Episode 23: Total Reward: -17
Episode 24: Total Reward: -27
Episode 25: Total Reward: -28
Episode 26: Total Reward: -47
Episode 27: Total Reward: -18
Episode 28: Total Reward: -8
Episode 29: Total Reward: -17
Episode 30: Total Reward: -7
Episode 31: Total Reward: -5
Episode 32: Total Reward: -8
Episode 33: Total Reward: -8
Episode 34: Total Reward: -

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.


KeyboardInterrupt

