In [None]:
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim

# Define the TSP environment
class TSPEnvironment:
  def __init__(self, num_cities=5):
    self.cities = np.random.rand(num_cities, 2)
    self.num_cities = len(self.cities)
    self.reset()

  def reset(self):
    self.visited = [False] * self.num_cities
    self.current_city = 0
    self.total_tour_length = 0

  def get_state(self):
    state = [int(self.visited[i]) for i in range(self.num_cities)]
    return np.array(state)

  def step(self, action):
    next_city = action
    if not self.visited[next_city]:
      distance = self._calculate_distance(self.current_city, next_city)
      self.total_tour_length += distance
      self.current_city = next_city
      self.visited[next_city] = True
    else:
      distance = 0  # Additional negative reward for revisiting the city
    done = all(self.visited)
    return self.get_state(), -distance, done

  def _calculate_distance(self, city1, city2):
    x1, y1 = self.cities[city1]
    x2, y2 = self.cities[city2]
    return np.sqrt((x1 - x2) ** 2 + (y1 - y2) ** 2)

In [None]:
class QLearningAgent:
  def __init__(self, num_states, num_actions, learning_rate=0.1, gamma=0.9, exploration_prob=1):
    self.num_states = num_states
    self.num_actions = num_actions
    self.learning_rate = learning_rate
    self.gamma = gamma
    self.exploration_prob = exploration_prob
    self.q_table = np.zeros((num_states, num_actions))

  def choose_action(self, state):
    if random.random() < self.exploration_prob:
      return random.choice(range(self.num_actions))
    else:
      return np.argmax(self.q_table[state])

  def train(self, state, action, reward, next_state, done):
    next_q_max = np.max(self.q_table[next_state])
    target_q = reward + self.gamma * next_q_max * (1 - done)
    self.q_table[state, action] = (1 - self.learning_rate) * self.q_table[state, action] + \
                                  self.learning_rate * target_q


In [None]:
# Define a function to plot the tour
import matplotlib.pyplot as plt
def plot_tour(cities, tour, title):
  x = [cities[i][0] for i in tour + [tour[0]]]
  y = [cities[i][1] for i in tour + [tour[0]]]
  plt.figure(figsize=(4, 4))
  plt.plot(x, y, marker='o', linestyle='-')
  plt.scatter(x, y, c='red', marker='x')
  for i, city in enumerate(cities):
    plt.text(city[0] + 0.1, city[1] + 0.1, f"{i}", fontsize=10, color='yellow')
  plt.title(title)
  plt.xlabel("X")
  plt.ylabel("Y")
  plt.grid(True)
  plt.show()

In [None]:
# Main training loop
def main():

  num_cities = 5
  env = TSPEnvironment(num_cities=num_cities)
  agent = QLearningAgent(num_states=num_cities, num_actions=num_cities)
  num_episodes = 10

  for episode in range(num_episodes):
    env.reset()
    state = env.get_state()
    print(state)
    done = False
    total_reward = 0
    tour = []  # Initialize an empty tour for the episode
    tour.append(state)
    while not all(env.visited[i] for i in range(num_cities)):
      action = agent.choose_action(state)
      print(action)
      next_state, reward, done = env.step(action)
      agent.train(state, action, reward, next_state, done)
      state = next_state
      total_reward = total_reward +  reward

      tour.append(env.current_city)
      #print(tour)
    print(f"Episode {episode + 1}, Total Tour Length: {total_reward:.2f}")
      #plot_tour(env.cities, tour, f"Episode {episode + 1}")
      #tour = []
    #print(tour)

if __name__ == "__main__":
    main()

[0 0 0 0 0]
2
0
0
2
0
1
2
3
0
2
4
Episode 1, Total Tour Length: -2.30
[0 0 0 0 0]
0
4
3
2
3
4
0
1
Episode 2, Total Tour Length: -2.08
[0 0 0 0 0]
0
3
0
0
3
1
4
1
0
3
2
Episode 3, Total Tour Length: -2.72
[0 0 0 0 0]
4
4
4
4
2
4
1
2
4
4
1
3
4
2
2
0
Episode 4, Total Tour Length: -2.78
[0 0 0 0 0]
1
0
0
1
4
0
0
0
0
0
2
4
4
0
2
4
1
1
1
0
0
3
Episode 5, Total Tour Length: -2.07
[0 0 0 0 0]
2
4
3
0
3
0
0
2
2
4
1
Episode 6, Total Tour Length: -2.76
[0 0 0 0 0]
0
0
0
3
3
3
2
4
0
1
Episode 7, Total Tour Length: -2.61
[0 0 0 0 0]
4
3
0
2
0
3
0
1
Episode 8, Total Tour Length: -2.64
[0 0 0 0 0]
3
0
4
2
0
0
0
0
0
0
1
Episode 9, Total Tour Length: -2.93
[0 0 0 0 0]
2
4
3
4
1
4
4
3
1
0
Episode 10, Total Tour Length: -2.60
