In [1]:
import gymnasium as gym
import numpy as np
from qiskit import QuantumCircuit, transpile
from qiskit_aer import AerSimulator

In [2]:
# Environment: FrozenLake on the 4x4 map with slipping disabled.
env = gym.make(
    "FrozenLake-v1",
    map_name="4x4",
    is_slippery=False,
)

In [3]:
# Q-Learning parameters
# Tabular action-value function: one row per state, one column per action,
# every entry starting at zero.
n_states, n_actions = env.observation_space.n, env.action_space.n
q_table = np.zeros(shape=(n_states, n_actions))

# Update rule and run length.
learning_rate = 0.1      # weight applied to each temporal-difference correction
discount_factor = 0.95   # weight applied to the best next-state value
episodes = 500           # number of training episodes

# Epsilon-greedy exploration schedule: start at `epsilon`, multiply by
# `decay_rate` after every episode, never drop below `min_epsilon`.
epsilon = 1.0
min_epsilon = 0.01
decay_rate = 0.995

In [4]:
# Quantum backend: a single simulator instance created once and shared by
# quantum_reward_modulation(), which transpiles for it and runs circuits on it.
backend = AerSimulator()

def quantum_reward_modulation(shots=100):
    """Sample a reward modifier from a random single-qubit circuit.

    Builds a one-qubit, one-classical-bit circuit, applies a ``u`` gate whose
    three angles are each drawn uniformly from [0, pi), measures the qubit and
    runs the circuit on the module-level ``backend``.

    Args:
        shots: Number of times the circuit is executed. Must be positive.
            Defaults to 100, the value that used to be hard-coded.

    Returns:
        The fraction of shots whose measurement outcome was '0', i.e. a
        float in [0, 1].

    Raises:
        ValueError: If ``shots`` is not positive.
    """
    if shots <= 0:
        raise ValueError(f"shots must be a positive integer, got {shots!r}")

    qc = QuantumCircuit(1, 1)
    # Random angles for a single-qubit unitary.
    # NOTE(review): for the standard U gate acting on |0>, only `theta` should
    # influence the probability of measuring '0'; `phi` and `lam` are kept so
    # the circuit (and the random-number consumption) matches the original.
    theta = np.random.uniform(0, np.pi)
    phi = np.random.uniform(0, np.pi)
    lam = np.random.uniform(0, np.pi)

    qc.u(theta, phi, lam, 0)
    qc.measure(0, 0)

    # Transpile and run the circuit
    transpiled_qc = transpile(qc, backend)
    job = backend.run(transpiled_qc, shots=shots)
    counts = job.result().get_counts()

    # Normalise by the same `shots` value that was requested from the backend
    # so the two can never drift apart; a missing '0' key means zero hits.
    p0 = counts.get('0', 0) / shots
    return p0  # Returns a value in [0, 1]

In [5]:
# Training loop: tabular Q-learning with an epsilon-greedy behaviour policy.
# Each episode runs until the environment reports termination or truncation;
# epsilon is decayed once per episode.
for episode in range(episodes):
    state = env.reset()[0]  # Extract the state from the reset tuple
    done = False
    truncated = False

    while not done and not truncated:
        # Exploration vs Exploitation
        if np.random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(q_table[state, :])

        # Step in the environment
        new_state, reward, done, truncated, info = env.step(action)

        # Quantum reward modifier.
        # The modifier only ever multiplies the reward, so when the reward is
        # zero the result is zero whatever the circuit returns. Skipping the
        # call in that case avoids transpiling and simulating a circuit on
        # steps where it cannot change the update, while leaving the Q-table
        # arithmetic unchanged.
        if reward != 0:
            modifier = quantum_reward_modulation()
            modified_reward = reward * (1 + 0.1 * modifier)  # Slight reward scaling
        else:
            modified_reward = reward

        # Q-Learning Update: move Q(s, a) towards the bootstrapped target
        # r + gamma * max_a' Q(s', a').
        q_table[state, action] += learning_rate * (
            modified_reward + discount_factor * np.max(q_table[new_state, :]) - q_table[state, action]
        )

        state = new_state

    # Decay exploration, never going below the configured floor
    epsilon = max(min_epsilon, epsilon * decay_rate)

    # Optional: Print progress every 100 episodes
    if (episode + 1) % 100 == 0:
        print(f"Episode {episode + 1}/{episodes}, Epsilon: {epsilon:.4f}")

Episode 100/500, Epsilon: 0.6058
Episode 200/500, Epsilon: 0.3670


KeyboardInterrupt: 

In [None]:
# Report the end of training and dump the learned Q-table. A single print
# call with a newline separator produces the same text as three separate calls.
print("Training finished.\n", "Q-table:", q_table, sep="\n")

In [None]:
# Optional: Evaluate the learned policy
def evaluate_policy(n_eval_episodes=100):
    """Roll out the greedy policy from ``q_table`` and print its success rate.

    For each evaluation episode the environment is reset and the action with
    the highest Q-value for the current state is taken until the episode
    terminates, is truncated, or a reward of exactly 1.0 is received. An
    episode counts as a success when that reward of 1.0 is seen.

    Args:
        n_eval_episodes: Number of episodes to roll out. Defaults to 100.

    Returns:
        None. The success percentage is printed with two decimals.
    """
    goal_hits = 0
    for _ in range(n_eval_episodes):
        current = env.reset()[0]  # reset returns a tuple; the state comes first
        finished = False
        while not finished:
            greedy_action = np.argmax(q_table[current, :])
            current, reward, terminated, truncated, _ = env.step(greedy_action)
            if reward == 1.0:  # Reached the goal
                goal_hits += 1
                break
            finished = terminated or truncated
    print(f"\nEvaluation: Success rate = {goal_hits / n_eval_episodes * 100:.2f}%")

In [None]:
# Evaluate the greedy policy with the default number of episodes; the
# function prints the resulting success rate.
evaluate_policy()