In [49]:
import matplotlib.pyplot as plt
import numpy as np
import random

In [50]:
# returns the probabilities of the actions  
def boltzmann_exploration(qvalues, temperature):
    # `- np.max(qvalues)` is used to prevent nan values 
    exp = np.exp((qvalues - np.max(qvalues)) / temperature)
    return exp / np.sum(exp)

In [51]:
def pd(i: bool, j: bool):
    global num_outcomes
    num_outcomes[int(i) + 2 * int(j)] += 1
    
    reward_table = [[(1, 1), (5, 0)], [(0, 5), (3, 3)]]
    return reward_table[int(i)][int(j)]

# Agent
Each agent has two Q-Tables, one for partner selection (ps) and one for prisoners dilemma (pd).

|  | Defect(0) | Cooperate(1) |
| --- | --- | --- |
| Partner Previously Defected(0) | (0, 0) | (0, 1) |
| Partner Previously Cooperated(1) | (1, 0) | (1, 1) |


In [52]:
class Agent:
    def __init__(self, learning_rate: float, temperature: float, discount_rate: float):
        self.a = learning_rate
        self.t = temperature
        self.g = discount_rate
        self.ps_s = False
        self.ps_a = False
        self.last_action = bool(random.getrandbits(1))
        self.qvalues_ps = np.zeros((2, 2))
        self.qvalues_pd = np.zeros((2, 2))
        self.rewards_ps = np.zeros((2, 2))
        self.rewards_pd = np.zeros((2, 2))
        return

    # get action for partner selection
    # returns true if agent stays and false if agent breaks ties
    def get_action_ps(self, partner_la) -> bool:
        prob = boltzmann_exploration(self.qvalues_ps[int(partner_la)], self.t)
        return bool(np.random.choice(2, p=prob))

    # get action for the prisoner's dilemma game
    # returns true if the agent cooperates and false if the agent defects
    def get_action_pd(self, partner_la) -> bool:
        prob = boltzmann_exploration(self.qvalues_pd[int(partner_la)], self.t)
        return bool(np.random.choice(2, p=prob))

    def update_reward(self, ps_s: bool, ps_a: bool, pd_s: bool, pd_a: bool, r):
        self.rewards_ps[int(ps_s), int(ps_a)] += r
        self.rewards_pd[int(pd_s), int(pd_a)] += r

    # trains the agent using the Q-Learning formula
    def train(self):
        ps_qvalues = np.zeros((2, 2))
        ps_qvalues[0, 0] = (1 - self.a) * self.qvalues_ps[0, 0] + self.a * (self.rewards_ps[0, 0] + self.g * np.max(self.qvalues_ps))
        ps_qvalues[0, 1] = (1 - self.a) * self.qvalues_ps[0, 1] + self.a * (self.rewards_ps[0, 1] + self.g * np.max(self.qvalues_ps))
        ps_qvalues[1, 0] = (1 - self.a) * self.qvalues_ps[1, 0] + self.a * (self.rewards_ps[1, 0] + self.g * np.max(self.qvalues_ps))
        ps_qvalues[1, 1] = (1 - self.a) * self.qvalues_ps[1, 1] + self.a * (self.rewards_ps[1, 1] + self.g * np.max(self.qvalues_ps))
        
        pd_qvalues = np.zeros((2, 2))
        pd_qvalues[0, 0] = (1 - self.a) * self.qvalues_pd[0, 0] + self.a * (self.rewards_pd[0, 0] + self.g * np.max(self.qvalues_pd))
        pd_qvalues[0, 1] = (1 - self.a) * self.qvalues_pd[0, 1] + self.a * (self.rewards_pd[0, 1] + self.g * np.max(self.qvalues_pd))
        pd_qvalues[1, 0] = (1 - self.a) * self.qvalues_pd[1, 0] + self.a * (self.rewards_pd[1, 0] + self.g * np.max(self.qvalues_pd))
        pd_qvalues[1, 1] = (1 - self.a) * self.qvalues_pd[1, 1] + self.a * (self.rewards_pd[1, 1] + self.g * np.max(self.qvalues_pd))

        self.qvalues_ps = ps_qvalues
        self.qvalues_pd = pd_qvalues
        self.rewards_ps = np.zeros((2, 2))
        self.rewards_pd = np.zeros((2, 2))

In [53]:
def sdoo(population: int, rounds: int, episodes: int, learning_rate: float, temperature: float, discount_rate: float):
    agents = [Agent(learning_rate, temperature, discount_rate) for _ in range(population)]
    unpaired = list(agents)
    
    pairs = []
    while unpaired:
        i = unpaired.pop(np.random.randint(len(unpaired)))
        j = unpaired.pop(np.random.randint(len(unpaired)))
        pairs.append((i, j))
    
    for episode in range(episodes):
        for round in range(rounds):
            for pair in pairs:
                i = pair[0]
                j = pair[1]
                la_i = i.last_action
                la_j = j.last_action
                i.ps_s = la_j
                j.ps_s = la_i
                a_i = i.get_action_ps(la_j)
                a_j = j.get_action_ps(la_i)
                i.ps_a = a_i
                j.ps_a = a_j

                if not a_i or not a_j:
                    unpaired.append(i)
                    unpaired.append(j)
                    pairs.remove(pair)
                
            while unpaired:
                i = unpaired.pop(np.random.randint(len(unpaired)))
                j = unpaired.pop(np.random.randint(len(unpaired)))
                pairs.append((i, j))

            for pair in pairs:
                i = pair[0]
                j = pair[1]
                la_i = i.last_action
                la_j = j.last_action
                a_i = i.get_action_pd(la_j)
                a_j = j.get_action_pd(la_i)
                r_i, r_j = pd(a_i, a_j)
                i.last_action = a_i
                j.last_action = a_j
                i.update_reward(i.ps_s, i.ps_a, la_j, a_i, r_i)
                j.update_reward(j.ps_s, j.ps_a, la_i, a_j, r_j)

        for agent in agents:
            agent.train()
    
    num_games = population * rounds * episodes / 2
    print("games: %i" % (num_games))
    print("(D, D): %i\t%f%%" % (num_outcomes[0], 100 * num_outcomes[0] / num_games))
    print("(C, D): %i\t%f%%" % (num_outcomes[1], 100 * num_outcomes[1] / num_games))
    print("(D, C): %i\t%f%%" % (num_outcomes[2], 100 * num_outcomes[2] / num_games))
    print("(C, C): %i\t%f%%" % (num_outcomes[3], 100 * num_outcomes[3] / num_games))
    
    strategies = [
        # ("Random-PS", 0, np.array([[1, 1], [1, 1]])),
        ("Always-Stay", 0, np.array([[0, 1], [0, 1]])),
        ("Out-for-Tat", 0, np.array([[1, 0], [0, 1]])),
        ("Reverse-OFT", 0, np.array([[0, 1], [1, 0]])),
        ("Always-Switch", 0, np.array([[1, 0], [1, 0]])),
        # ("Random-PD", 1, np.array([[1, 1], [1, 1]])),
        ("Always-Cooperate", 1, np.array([[0, 1], [0, 1]])),
        ("Tit-for-Tat", 1, np.array([[1, 0], [0, 1]])),
        ("Reverse-TFT", 1, np.array([[0, 1], [1, 0]])),
        ("Always-Defect", 1, np.array([[1, 0], [1, 0]])),
    ]

    # TODO: switch strategy and agent loops for better performance
    for agent in agents:
        print()
        # print(agent.qvalues_ps)
        # print(agent.qvalues_pd)
        
        temp = agent.qvalues_ps
        # temp = np.array([boltzmann_exploration(temp[0], temperature), boltzmann_exploration(temp[1], temperature)])
        print(temp)
        temp = agent.qvalues_pd
        # temp = np.array([boltzmann_exploration(temp[0], temperature), boltzmann_exploration(temp[1], temperature)])
        print(temp)

        angles = [[], []]
        for strategy in strategies:
            mat = agent.qvalues_pd if strategy[1] == 1 else agent.qvalues_ps
            # mat = np.array([boltzmann_exploration(mat[0], temperature), boltzmann_exploration(mat[1], temperature)])
            # print(mat)
            mat2 = strategy[2]
            norm = np.linalg.norm(mat)
            strategy_norm = np.linalg.norm(mat2)
            elm = np.multiply(mat, mat2)
            prod = np.sum(elm)
            angle = np.rad2deg(np.arccos(prod / (norm * strategy_norm)))
            angles[strategy[1]].append(angle)
        
        min_ps = np.argmin(angles[0])
        min_pd = np.argmin(angles[1])
        print("PS-Strategy = %s (%9fdeg)\t\tPD-Strategy = %s (%9fdeg)" % (strategies[min_ps][0], angles[0][min_ps], strategies[min_pd + len(angles[0])][0], angles[1][min_pd]))

In [54]:
num_outcomes = [0, 0, 0, 0]

sdoo(20, 20, 3000, 0.05, 1, 1)

games: 600000
(D, D): 599354	99.892333%
(C, D): 239	0.039833%
(D, C): 263	0.043833%
(C, C): 144	0.024000%

[[2979.00704375 2999.00704375]
 [2979.00704375 2979.00704375]]
[[3001.45 2981.45]
 [2981.45 2981.45]]
PS-Strategy = Always-Stay (44.904157deg)		PD-Strategy = Tit-for-Tat (44.904235deg)

[[2977.79553844 2997.79553844]
 [2977.79553844 2977.79553844]]
[[2999.94063048 2979.94063048]
 [2979.94063048 2979.94063048]]
PS-Strategy = Always-Stay (44.904118deg)		PD-Strategy = Tit-for-Tat (44.904187deg)

[[2999.386275 2979.386275]
 [2979.386275 2979.386275]]
[[3000.30251875 2980.30251875]
 [2980.30251875 2980.30251875]]
PS-Strategy = Out-for-Tat (44.904169deg)		PD-Strategy = Tit-for-Tat (44.904198deg)

[[2980.40728125 3000.40728125]
 [2980.40728125 2980.40728125]]
[[3000.24139375 2980.24139375]
 [2980.24139375 2980.24139375]]
PS-Strategy = Always-Stay (44.904202deg)		PD-Strategy = Tit-for-Tat (44.904196deg)

[[2995.50215625 2975.50215625]
 [2975.50215625 2975.50215625]]
[[2997.421875 2977.421