In [2]:
import matplotlib.pyplot as plt
import numpy as np
import random

In [3]:
def boltzmann_exploration(qvalues, temperature):
    """Turn Q-values into Boltzmann (softmax) action probabilities.

    Args:
        qvalues: array of Q-values, one entry per action.
        temperature: divisor applied to the Q-values before exponentiation.

    Returns:
        Array of the same shape as `qvalues` whose entries sum to 1.
    """
    # Shifting by the maximum makes the largest exponent exactly 0, which
    # keeps np.exp from overflowing and producing nan values.
    shifted = qvalues - np.max(qvalues)
    weights = np.exp(shifted / temperature)
    total = np.sum(weights)
    return weights / total

In [4]:
def pd(i: bool, j: bool):
    """Play one prisoner's dilemma game and record its outcome.

    Args:
        i: action of the first player (True = cooperate, False = defect).
        j: action of the second player (True = cooperate, False = defect).

    Returns:
        Tuple `(reward_i, reward_j)`.

    Side effect: increments the module-level `num_outcomes` counter at index
    `int(i) + 2 * int(j)`.
    """
    first, second = int(i), int(j)

    # The list is mutated in place, so no `global` declaration is required.
    num_outcomes[first + 2 * second] += 1

    # Keyed by (first player's action, second player's action); 0 = defect, 1 = cooperate.
    payoffs = {
        (0, 0): (1, 1),
        (0, 1): (5, 0),
        (1, 0): (0, 5),
        (1, 1): (3, 3),
    }
    return payoffs[(first, second)]

# Agent
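Each agent has two Q-tables, one for partner selection (ps) and one for the prisoner's dilemma (pd). Both are 2×2 and indexed as (partner's previous action, agent's own action); the table below shows the index layout for the prisoner's dilemma table. For the partner-selection table the columns are instead "break ties" (0) and "stay" (1).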

|  | Defect(0) | Cooperate(1) |
| --- | --- | --- |
| Partner Previously Defected(0) | (0, 0) | (0, 1) |
| Partner Previously Cooperated(1) | (1, 0) | (1, 1) |


In [5]:
class Agent:
    """Q-learning agent with one 2x2 Q-table per game stage.

    Both tables are indexed as `[partner's previous action, own action]`:
    `qvalues_ps` drives partner selection and `qvalues_pd` drives the
    prisoner's dilemma. `rewards` has the same layout and buffers the most
    recent reward per cell until the next call to `train`.
    """

    def __init__(self, learning_rate: float, temperature: float, discount_rate: float):
        self.a = learning_rate
        self.t = temperature
        self.g = discount_rate
        # No game has been played yet, so the "previous" action is a coin flip.
        self.last_action = bool(random.getrandbits(1))
        self.qvalues_ps = np.zeros((2, 2))
        self.qvalues_pd = np.zeros((2, 2))
        self.rewards = np.zeros((2, 2))

    def _sample_action(self, qtable, partner_la) -> bool:
        """Draw action 0 or 1 from the Boltzmann distribution of one table row."""
        row = qtable[int(partner_la)]
        probabilities = boltzmann_exploration(row, self.t)
        return bool(np.random.choice(2, p=probabilities))

    def get_action_ps(self, partner_la) -> bool:
        """Partner-selection action: True if the agent stays, False if it breaks ties."""
        return self._sample_action(self.qvalues_ps, partner_la)

    def get_action_pd(self, partner_la) -> bool:
        """Prisoner's dilemma action: True if the agent cooperates, False if it defects."""
        return self._sample_action(self.qvalues_pd, partner_la)

    def update_reward(self, r, pd_a: bool, pla: bool):
        """Store reward `r` for own action `pd_a` given the partner's previous action `pla`.

        A later reward for the same cell overwrites the earlier one.
        """
        self.rewards[int(pla), int(pd_a)] = r

    def train(self):
        """Apply one Q-learning update to every cell of both tables, then clear `rewards`.

        Each cell becomes `(1 - a) * Q + a * (reward + g * max(Q))`, where the
        maximum is taken over the whole table of that stage.
        """
        keep = 1 - self.a

        # NOTE: no reward is given in the partner selection stage as indicated
        # in the paper (page 1113, 5th line of 2nd paragraph).
        # NOTE(review): this table starts at zero and its target is
        # g * max(table), so with this update it stays all-zero.
        ps_target = 0 + self.g * np.max(self.qvalues_ps)
        updated_ps = keep * self.qvalues_ps + self.a * ps_target

        pd_target = self.rewards + self.g * np.max(self.qvalues_pd)
        updated_pd = keep * self.qvalues_pd + self.a * pd_target

        self.qvalues_ps = updated_ps
        self.qvalues_pd = updated_pd
        self.rewards = np.zeros((2, 2))

In [6]:
def _pair_up(unpaired: list, pairs: list) -> None:
    """Draw agents from `unpaired` two at a time, at random, and append each couple to `pairs`."""
    while unpaired:
        first = unpaired.pop(np.random.randint(len(unpaired)))
        second = unpaired.pop(np.random.randint(len(unpaired)))
        pairs.append((first, second))


def _policy(qvalues, temperature: float):
    """Return the row-wise Boltzmann action probabilities of a Q-table."""
    return np.array([boltzmann_exploration(row, temperature) for row in qvalues])


def sdoo(population: int, rounds: int, episodes: int, learning_rate: float, temperature: float, discount_rate: float):
    """Run the partner-selection / prisoner's dilemma simulation and print a report.

    Every round consists of a partner-selection stage (a pair splits as soon
    as one of its members chooses to leave; all split agents are re-paired at
    random) followed by one prisoner's dilemma game per pair. All agents are
    trained once at the end of each episode.

    Args:
        population: number of agents; must be even so everyone has a partner.
        rounds: rounds played per episode.
        episodes: number of episodes.
        learning_rate: learning rate handed to every `Agent`.
        temperature: Boltzmann temperature handed to every `Agent` and used
            for the policies printed in the report.
        discount_rate: discount rate handed to every `Agent`.

    Raises:
        ValueError: if `population` is odd.

    Prints the outcome counts of the games played during this call, then, for
    each agent, its two policy matrices and the closest reference strategy
    (smallest angle between the matrices) per stage.
    """
    if population % 2:
        raise ValueError("population must be even so that every agent can be paired, got %r" % (population,))

    # `pd` accumulates into the global counters; remember where they stood so
    # the report covers only the games of this call.
    outcomes_before = list(num_outcomes)

    agents = [Agent(learning_rate, temperature, discount_rate) for _ in range(population)]
    unpaired = list(agents)
    pairs = []
    _pair_up(unpaired, pairs)

    for _episode in range(episodes):
        for _round in range(rounds):
            # Partner selection. Surviving pairs are collected in a new list:
            # removing from `pairs` while iterating over it would skip the
            # pair that follows every removed one.
            staying = []
            for pair in pairs:
                i, j = pair
                la_i = i.last_action
                la_j = j.last_action
                a_i = i.get_action_ps(la_j)
                a_j = j.get_action_ps(la_i)

                if a_i and a_j:
                    staying.append(pair)
                else:
                    unpaired.extend(pair)
            pairs = staying
            _pair_up(unpaired, pairs)

            # Prisoner's dilemma. Both agents condition on the partner's
            # action from *before* this game, so read both before overwriting.
            for i, j in pairs:
                la_i = i.last_action
                la_j = j.last_action
                a_i = i.get_action_pd(la_j)
                a_j = j.get_action_pd(la_i)
                r_i, r_j = pd(a_i, a_j)
                i.last_action = a_i
                j.last_action = a_j
                i.update_reward(r_i, a_i, la_j)
                j.update_reward(r_j, a_j, la_i)
        for agent in agents:
            agent.train()

    num_games = population * rounds * episodes / 2
    counts = [after - before for after, before in zip(num_outcomes, outcomes_before)]
    print("games: %i" % (num_games))
    for label, count in zip(("(D, D)", "(C, D)", "(D, C)", "(C, C)"), counts):
        # Guard the percentage so an empty run does not divide by zero.
        percentage = 100 * count / num_games if num_games else 0.0
        print("%s: %i\t%f%%" % (label, count, percentage))

    # (name, stage, pattern): stage 0 = partner selection, 1 = prisoner's
    # dilemma. Patterns use the Q-table layout [partner's previous action, own action].
    strategies = [
        # ("Random-PS", 0, np.array([[1, 1], [1, 1]])),
        ("Always-Stay", 0, np.array([[0, 1], [0, 1]])),
        ("Out-for-Tat", 0, np.array([[1, 0], [0, 1]])),
        ("Reverse-OFT", 0, np.array([[0, 1], [1, 0]])),
        ("Always-Switch", 0, np.array([[1, 0], [1, 0]])),
        # ("Random-PD", 1, np.array([[1, 1], [1, 1]])),
        ("Always-Cooperate", 1, np.array([[0, 1], [0, 1]])),
        ("Tit-for-Tat", 1, np.array([[1, 0], [0, 1]])),
        ("Reverse-TFT", 1, np.array([[0, 1], [1, 0]])),
        ("Always-Defect", 1, np.array([[1, 0], [1, 0]])),
    ]

    for agent in agents:
        print()

        # Computed once per agent instead of once per strategy.
        policies = (
            _policy(agent.qvalues_ps, temperature),
            _policy(agent.qvalues_pd, temperature),
        )
        print(policies[0])
        print(policies[1])

        names = [[], []]
        angles = [[], []]
        for name, stage, pattern in strategies:
            policy = policies[stage]
            cosine = np.sum(np.multiply(policy, pattern)) / (np.linalg.norm(policy) * np.linalg.norm(pattern))
            # Rounding can push the cosine marginally past 1, which would make arccos return nan.
            angle = np.rad2deg(np.arccos(np.clip(cosine, -1.0, 1.0)))
            names[stage].append(name)
            angles[stage].append(angle)

        min_ps = np.argmin(angles[0])
        min_pd = np.argmin(angles[1])
        print("PS-Strategy = %s (%9fdeg)\t\tPD-Strategy = %s (%9fdeg)" % (names[0][min_ps], angles[0][min_ps], names[1][min_pd], angles[1][min_pd]))

In [7]:
# Reset the global outcome counters that `pd` increments. Slot k = int(i) + 2 * int(j),
# i.e. [(D, D), (C, D), (D, C), (C, C)] as labelled in the report printed by `sdoo`.
num_outcomes = [0, 0, 0, 0]

# population=20, rounds=20, episodes=3000, learning_rate=0.05, temperature=1, discount_rate=1
sdoo(20, 20, 3000, 0.05, 1, 1)

games: 600000
(D, D): 378905	63.150833%
(C, D): 97618	16.269667%
(D, C): 97713	16.285500%
(C, C): 25764	4.294000%

[[0.5 0.5]
 [0.5 0.5]]
[[0.81306622 0.18693378]
 [0.57744365 0.42255635]]
PS-Strategy = Always-Stay (45.000000deg)		PD-Strategy = Always-Defect (26.544527deg)

[[0.5 0.5]
 [0.5 0.5]]
[[0.82301278 0.17698722]
 [0.73742537 0.26257463]]
PS-Strategy = Always-Stay (45.000000deg)		PD-Strategy = Always-Defect (16.287082deg)

[[0.5 0.5]
 [0.5 0.5]]
[[0.84818014 0.15181986]
 [0.70723591 0.29276409]]
PS-Strategy = Always-Stay (45.000000deg)		PD-Strategy = Always-Defect (17.392766deg)

[[0.5 0.5]
 [0.5 0.5]]
[[0.86916047 0.13083953]
 [0.78527841 0.21472159]]
PS-Strategy = Always-Stay (45.000000deg)		PD-Strategy = Always-Defect (12.452972deg)

[[0.5 0.5]
 [0.5 0.5]]
[[0.77916107 0.22083893]
 [0.81720612 0.18279388]]
PS-Strategy = Always-Stay (45.000000deg)		PD-Strategy = Always-Defect (14.309912deg)

[[0.5 0.5]
 [0.5 0.5]]
[[0.75694725 0.24305275]
 [0.7220771  0.2779229 ]]
PS-Strategy