# IMPORT LIBRARIES & DATA

In [1]:
import random
import sys
from typing import List, Tuple, Dict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

sys.path.append('../src')
from GA import solve

In [2]:
class RLGA_mTSP:
    """Genetic algorithm for the multiple-salesman TSP (mTSP) whose crossover
    and mutation rates are chosen by a tabular reinforcement-learning agent.
    """

    def __init__(self, distance_matrix: np.ndarray, m: int = 3,
                 n_population: int = 100, max_iterations: int = 1000,
                 epsilon: float = 0.1, epsilon_decay: float = 0.995,
                 n_states: int = 10):
        """Store the problem data, build the RL action space and create the
        initial population.

        Args:
            distance_matrix: Pairwise distances, indexed as
                ``distance_matrix[a][b]``; its length is taken as the number
                of cities.
            m: Number of tours each individual is split into.
            n_population: Number of individuals in the population.
            max_iterations: Number of generations ``run`` performs.
            epsilon: Initial exploration probability of the epsilon-greedy
                policy.
            epsilon_decay: Factor ``epsilon`` is multiplied by after each
                generation.
            n_states: Size of the placeholder state list ``S``; also stored
                as ``Ns``.

        Raises:
            ValueError: If ``m`` or ``n_population`` is smaller than 1.
        """
        # Fail fast: with these values the search can only crash later on,
        # deep inside the split DP or the state computation.
        if m < 1:
            raise ValueError(f"m must be at least 1, got {m}")
        if n_population < 1:
            raise ValueError(f"n_population must be at least 1, got {n_population}")

        # --- Input data and algorithm parameters ---
        self.distance_matrix = distance_matrix
        self.n_cities = len(distance_matrix)
        self.m = m
        self.N = n_population
        self.Max = max_iterations
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay

        # --- RL action space: every (crossover rate, mutation rate) pair ---
        self.Cs = np.linspace(0.1, 0.9, 9)  # Crossover rates
        self.Ms = np.linspace(0.01, 0.1, 10)  # Mutation rates
        self.S = list(range(n_states))  # Dummy state set (not used directly)
        self.Ns = n_states
        self.Nt = 10  # Action-count threshold used to split the RL learning phases

        # --- Q-table and state history ---
        self.Q = {}
        self.state_history = []
        self.initialize_q_table()  # Hook for Q-table set-up (rows are created dynamically)

        # --- Current parameters ---
        self.current_state = 0
        self.stagnation_counter = 0
        self.Pc = random.choice(self.Cs)
        self.Pm = random.choice(self.Ms)

        # --- Initial population ---
        self.population = self._initialize_population()

In [None]:
# Local search: try to improve one individual with random pairwise swaps.
def local_search(self, route, n_swaps=10):
    """Hill-climb on ``route`` by swapping two random positions.

    A swap is kept only when it strictly lowers the length of the longest
    tour obtained from ``self.tsp_split_dp``.

    NOTE: this function sits at module level in this file although ``run``
    calls it as ``self.local_search``; it has to be bound to the class to be
    reachable that way.

    Args:
        self: Object providing ``tsp_split_dp`` and
            ``calculate_route_distance``.
        route: City order to improve; it is not modified.
        n_swaps: Number of random swaps to try (default 10, the value that
            used to be hard-coded).

    Returns:
        A new list holding the best order found.
    """
    best = route[:]
    if len(best) < 2:
        # Two distinct positions cannot be drawn; nothing to swap.
        return best

    def longest_tour(order):
        return max(self.calculate_route_distance(r) for r in self.tsp_split_dp(order))

    best_fitness = longest_tour(best)
    positions = range(len(route))

    for _ in range(n_swaps):
        i, j = random.sample(positions, 2)
        candidate = best[:]
        candidate[i], candidate[j] = candidate[j], candidate[i]
        candidate_fitness = longest_tour(candidate)
        if candidate_fitness < best_fitness:
            best, best_fitness = candidate, candidate_fitness

    return best

In [3]:
    def _initialize_population(self) -> List[List[int]]:
        # Khởi tạo quần thể các cá thể ngẫu nhiên (không bao gồm depot)
        population = []
        for _ in range(self.N):
            route = list(range(1, self.n_cities))
            random.shuffle(route)
            population.append(route)
        return population

    def calculate_route_distance(self, route: List[int]) -> float:
        # Tính tổng quãng đường của tuyến đi qua các thành phố
        return sum(self.distance_matrix[route[i]][route[i + 1]] for i in range(len(route) - 1))

    def tsp_split_dp(self, route: List[int]) -> List[List[int]]:
        # --- Tách tuyến bằng dynamic programming ---
        n = len(route)
        dp = [[float('inf')] * (self.m + 1) for _ in range(n + 1)]
        path = [[-1] * (self.m + 1) for _ in range(n + 1)]
        dp[0][0] = 0

        for i in range(1, n + 1):
            for k in range(1, self.m + 1):
                for j in range(i):
                    sub_route = [0] + route[j:i] + [0]
                    cost = self.calculate_route_distance(sub_route)
                    if max(dp[j][k - 1], cost) < dp[i][k]:
                        dp[i][k] = max(dp[j][k - 1], cost)
                        path[i][k] = j

        # --- Truy vết tuyến tối ưu ---
        routes = []
        i, k = n, self.m
        while k > 0:
            j = path[i][k]
            routes.append([0] + route[j:i] + [0])
            i, k = j, k - 1

        return routes[::-1]

    def calculate_fitness(self, individual: List[int]) -> Tuple[float, float, List[List[int]]]:
        # Tính fitness của cá thể: max_distance, total_distance, các tuyến
        routes = self.tsp_split_dp(individual)
        distances = [self.calculate_route_distance(route) for route in routes]
        return max(distances), sum(distances), routes

    def initialize_q_table(self):
        pass  # Q-table sẽ khởi tạo khi gặp state mới

    def get_state(self, fitness_scores: List[Tuple[float, float]], iteration: int) -> str:
        # --- Biểu diễn trạng thái theo đa yếu tố (diversity, improvement, convergence) ---
        max_distances = [f[0] for f in fitness_scores]
        diversity = np.std(max_distances)
        improvement_rate = 0

        if len(self.state_history) > 0:
            prev_best = min(self.state_history[-1])
            current_best = min(max_distances)
            improvement_rate = (prev_best - current_best) / (prev_best + 1e-6)

        diversity_level = min(int(diversity * 10), 9)
        improvement_level = max(0, min(int(improvement_rate * 100), 9))
        convergence_level = min(int(iteration / self.Max * 10), 9)

        state = f"div_{diversity_level}_imp_{improvement_level}_conv_{convergence_level}"
        self.state_history.append(max_distances)

        if state not in self.Q:
            self.Q[state] = {}
            for pc in self.Cs:
                for pm in self.Ms:
                    self.Q[state][(pc, pm)] = 0.0

        return state

In [6]:
    def select_action_epsilon_greedy(self, state: str) -> Tuple[float, float]:
        # --- Chính sách chọn hành động epsilon-greedy ---
        if random.random() < self.epsilon:
            return random.choice(self.Cs), random.choice(self.Ms)
        else:
            return max(self.Q[state].items(), key=lambda x: x[1])[0]

    def update_q_value(self, state: str, action: Tuple[float, float], reward: float, next_state: str, iteration: int):
        # --- Cập nhật Q-value theo Q-learning ---
        alpha, gamma = 0.1, 0.9
        current_q = self.Q[state][action]
        next_max_q = max(self.Q[next_state].values()) if next_state in self.Q else 0

        if (self.Nt - self.Ns) / 2 > iteration:
            self.Q[state][action] = current_q + alpha * (reward - current_q)
        else:
            self.Q[state][action] = current_q + alpha * (reward + gamma * next_max_q - current_q)

    def tournament_selection(self, population: List[List[int]], fitness_scores: List[Tuple[float, float]], k: int = 3) -> List[int]:
        # --- Chọn lọc theo giải đấu ---
        indices = random.sample(range(len(population)), k)
        winner_idx = min(indices, key=lambda i: fitness_scores[i][0])
        return population[winner_idx]

    def crossover(self, parent1: List[int], parent2: List[int]) -> Tuple[List[int], List[int]]:
        # --- Lai ghép OX ---
        if random.random() > self.Pc:
            return parent1.copy(), parent2.copy()

        size = len(parent1)
        a, b = sorted(random.sample(range(size), 2))

        def ox(p1, p2):
            child = [-1] * size
            child[a:b + 1] = p1[a:b + 1]
            fill = [x for x in p2 if x not in child]
            j = 0
            for i in range(size):
                if child[i] == -1:
                    child[i] = fill[j]
                    j += 1
            return child

        return ox(parent1, parent2), ox(parent2, parent1)

    def mutate(self, route: List[int]) -> List[int]:
        # --- Đột biến hoán vị 2 điểm ---
        route = route.copy()
        if random.random() < self.Pm:
            i, j = random.sample(range(len(route)), 2)
            route[i], route[j] = route[j], route[i]
        return route

    def run(self) -> Tuple[List[List[int]], float, float]:
        # --- Vòng lặp chính của RLGA ---
        child1 = self.local_search(child1)
        child2 = self.local_search(child2)

        best_solution, best_max_distance, best_total_distance = None, float('inf'), float('inf')
        t = 0

        while t < self.Max:
            # --- Đánh giá quần thể hiện tại ---
            fitness_scores = [self.calculate_fitness(ind)[:2] for ind in self.population]
            current_state = self.get_state(fitness_scores, t)
            self.Pc, self.Pm = self.select_action_epsilon_greedy(current_state)

            # --- Tạo quần thể mới ---
            new_population = []
            while len(new_population) < self.N:
                p1 = self.tournament_selection(self.population, fitness_scores)
                p2 = self.tournament_selection(self.population, fitness_scores)
                c1, c2 = self.crossover(p1, p2)
                new_population.extend([self.mutate(c1), self.mutate(c2)])

            self.population = new_population[:self.N]
            new_fitness_scores = [self.calculate_fitness(ind) for ind in self.population]

            # --- Cập nhật lời giải tốt nhất ---
            for i, (max_d, total_d, routes) in enumerate(new_fitness_scores):
                if max_d < best_max_distance:
                    best_max_distance, best_total_distance = max_d, total_d
                    best_solution = routes

            # --- Tính reward và cập nhật Q ---
            old_best = min([f[0] for f in fitness_scores])
            new_best = min([f[0] for f in new_fitness_scores])
            reward = (old_best - new_best) / (old_best + 1e-6)

            next_state = self.get_state([f[:2] for f in new_fitness_scores], t + 1)
            self.update_q_value(current_state, (self.Pc, self.Pm), reward, next_state, t)

            # --- Điều chỉnh epsilon ---
            self.epsilon *= self.epsilon_decay
            t += 1

            # --- Ghi log ---
            if t % 100 == 0:
                print(f"Iteration {t}: Best max distance = {best_max_distance:.2f}, Pc = {self.Pc:.2f}, Pm = {self.Pm:.2f}, state = {current_state}")

        return best_solution, best_max_distance, best_total_distance