In [None]:
from mastermind import Mastermind
from q_learning import QLearning

import random
import numpy as np
from itertools import combinations, combinations_with_replacement, product
import matplotlib.pyplot as plt

In [None]:
def plot_learning_curve(episode_returns, title):
    """Draw the per-episode return averaged over all agents.

    Args:
        episode_returns: 2-D array indexed as [episode, agent]; the length of
            its second axis is reported in the subplot title as the number
            of agents.
        title: Text shown as the figure-level title.
    """
    agent_count = episode_returns.shape[1]

    figure, axes = plt.subplots(1, 1)
    figure.suptitle(title)

    # Collapse the agent axis so a single averaged value per episode remains.
    mean_per_episode = np.mean(episode_returns, axis=1)
    axes.plot(mean_per_episode, color='k')

    axes.set_title('Mean undiscounted return of {} agents'.format(agent_count))
    axes.set_xlabel('Episode')
    axes.set_ylabel('Return')
    axes.grid()

    figure.tight_layout()

In [None]:
# env = Mastermind(3,4,10,None) # This works!!

In [None]:
# Build an unrestricted environment first, purely to obtain its action list.
env = Mastermind(4,6,12,None)   # n_pegs, n_colours, n_rows
k = 20
# Draw k distinct entries from the action list to serve as the candidate codes.
codes = random.sample(env.actions, k=k)
# Re-create the environment, this time handing it the sampled codes.
env = Mastermind(4,6,12,codes)

In [None]:
def test_policy(agent, env, n_episodes, codes=None, render = False):
    test_returns = []
    for _ in range(n_episodes):
            if codes:
                code = random.choice(codes)
            else:
                code = None
            cumulative_reward=0
            state = env.reset(code)
            state_hash = state.tobytes()
            if render:
                env.render()
            terminal = False
            while not terminal:
                action = agent.policy(state_hash, test=True)
                next_state, reward, terminal, _ = env.step(list(action))
                if render:
                    env.render()
                next_state_hash = next_state.tobytes()
                state = next_state
                state_hash = next_state_hash
                cumulative_reward+=reward
            test_returns.append(cumulative_reward)
    return test_returns


In [None]:
def q_control(env,gamma=0.9,eps=0.15,alpha=0.2,n_agents=20,n_episodes=150, n_tests = 1000, codes = None):
    """Train independent Q-learning agents and evaluate each one after training.

    A fresh ``QLearning`` agent is created for every agent index, trained for
    ``n_episodes`` episodes on ``env`` and then evaluated with ``test_policy``
    for ``n_tests`` episodes.

    Args:
        env: Environment exposing ``reset(code)`` and ``step(action)``; the
            states it returns must provide ``tobytes()``.
        gamma, eps, alpha: Forwarded unchanged to ``QLearning(env, gamma, eps,
            alpha)`` (presumably discount factor, exploration rate and
            learning rate -- confirm against ``q_learning.QLearning``).
        n_agents: Number of independently trained agents.
        n_episodes: Training episodes per agent.
        n_tests: Evaluation episodes per agent.
        codes: Optional collection of codes. When truthy, one entry is drawn
            with ``random.choice`` for every training episode and the same
            collection is used during evaluation; otherwise ``None`` is
            passed to ``env.reset``.

    Returns:
        tuple: ``(returns, test_returns)`` where ``returns`` has shape
        ``(n_episodes, n_agents)`` and holds the summed (undiscounted) reward
        of every training episode, and ``test_returns`` has shape
        ``(n_agents, n_tests)`` and holds the evaluation returns per agent.
    """
    returns = np.zeros((n_episodes, n_agents))
    test_returns = np.zeros((n_agents,n_tests))
    for i in range(n_agents):
        agent = QLearning(env,gamma,eps,alpha)
        for episode in range(n_episodes):
            code = random.choice(codes) if codes else None
            cumulative_reward = 0
            # States are keyed by their raw bytes when handed to the agent.
            state_hash = env.reset(code).tobytes()
            terminal = False
            while not terminal:
                action = agent.policy(state_hash)
                next_state, reward, terminal, _ = env.step(list(action))
                next_state_hash = next_state.tobytes()
                agent.update_q_table(state_hash, action, reward, next_state_hash)
                state_hash = next_state_hash
                cumulative_reward += reward
            returns[episode, i] = cumulative_reward
        # Pass the options by keyword: the fourth positional parameter of
        # test_policy is `codes`, so a bare positional False would be taken
        # as the code list instead of the render flag.
        test_returns[i] = test_policy(agent, env, n_tests, codes=codes, render=False)
    return returns, test_returns


In [None]:
# Train the agents on the sampled-code environment; no per-episode code list
# is passed, so every reset receives None.
returns, test_performance = q_control(
    env=env,
    n_agents=20,
    alpha=0.1,
    eps=0.15,
    n_episodes=100000,
    codes=None,
)
# Learning curve averaged over agents, followed by the mean evaluation return.
plot_learning_curve(returns, f'Number of codes: {len(env.actions)}')
print(f"Average performance in evaluation: {np.mean(test_performance)}")

In [None]:
# for i,code in enumerate(codes):
#     returns, test_returns = q_control(env=env,n_agents=10, alpha=0.8, eps=0.1, n_episodes = 1000, codes = [codes[i]])
#     plot_learning_curve(returns,f"Codes: {codes[i]}")

In [None]:
# for i,code in enumerate(codes):
#     returns, test_returns = q_control(env=env,n_agents=10, alpha=0.8, n_episodes = 1000, codes = [codes[i],list(reversed(codes[i]))])
#     plot_learning_curve(returns,f"Codes: {codes[i]}, {list(reversed(codes[i]))}")

In [None]:
# code_pairs = [list(code_pair) for code_pair in combinations(codes,2)]
# print(code_pairs)

In [None]:
# for i,code in enumerate(code_pairs):
#     returns, test_returns = q_control(env=env,n_agents=10, alpha=0.8, eps=0.1, n_episodes = 1000, codes = code_pairs[i])
#     plot_learning_curve(returns,f"Codes: {code_pairs[i][0]}, {code_pairs[i][1]}")

In [None]:
# code_triples = [list(code_triple) for code_triple in combinations(codes,3)]

# for i,code in enumerate(code_triples):
#     returns, test_returns = q_control(env=env,n_agents=10, alpha=0.8, eps=0.1, n_episodes = 1000, codes = code_triples[i])
#     plot_learning_curve(returns,f"Codes: {code_triples[i][0]}, {code_triples[i][1]}, {code_triples[i][2]}")
    