# Task 1:


One application of a Markov Decision Process would be learning to drive a car. The state space would be the car’s position, its velocity, and the environment. The action would be what acceleration (direction and magnitude) to apply. The rewards would be determined by whether the car drives cleanly, avoids all obstacles, stays within the lines of the road, etc.

# Task 2:

One task where Reinforcement Learning is used is stock trading. RL models can be trained to determine when to buy or when to sell a stock. The abundance of available data makes this a good task for RL. There are tens of thousands of public stocks. There exists easily accessible data for all stocks listed on the major exchanges going back to the 1980s at a variety of resolutions. Additionally, stock price changes are traditionally modeled as being independent of previous changes (a random walk), so the next price depends only on the current price and not on the path that led to it. This is the Markov property, which makes the problem a good fit for a Markov Decision Process.

Trading-bot is a project from pskrunner14 that uses Deep Q-Learning to train a model to make money in the stock market. The model is fairly simple. The state space is the stock price over the last n days. The action space is whether to buy, sell, or hold the stock. The reward is simply the change in the value of the portfolio resulting from the action taken.


# Task 3:

In [1]:
#import gym
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple, deque
from itertools import count
from PIL import Image
from torch import Tensor
import gymnasium as gym
from tqdm.notebook import tqdm
from tabulate import tabulate

In [2]:
class TicTacToe:
    """Tic-tac-toe game state kept on a 3x3 tensor.

    A cell holds 1 for X, -1 for O and 0 when empty. ``turnX`` records whose
    mark ``place`` puts down next, and ``moves`` counts accepted placements.
    """

    def __init__(self):
        self.moves = 0
        self.turnX = True
        self.board = torch.zeros((3,3))

    def _claim(self, x, y, mark):
        """Write ``mark`` into cell (x, y) if it is empty; return success."""
        if self.board[x,y] != 0:
            return False
        self.board[x,y] = mark
        # After X (mark 1) it is O's turn, after O (mark -1) it is X's turn.
        self.turnX = mark < 0
        self.moves += 1
        return True

    def placeX(self, x, y):
        """Put an X at (x, y); return False if the cell is already taken."""
        return self._claim(x, y, 1)

    def placeO(self, x, y):
        """Put an O at (x, y); return False if the cell is already taken."""
        return self._claim(x, y, -1)

    def place(self, x, y):
        """Place the mark of whichever side is due to move."""
        mover = self.placeX if self.turnX else self.placeO
        return mover(x, y)

    def checkWin(self):
        """Return 1 if X has a line, -1 if O has one, 0 for a full board, else None."""
        grid = self.board
        diagonals = torch.stack([
            grid[0,0] + grid[1,1] + grid[2,2],
            grid[0,2] + grid[1,1] + grid[2,0],
        ])
        # Every complete line sums to +3 (all X) or -3 (all O).
        totals = torch.cat([grid.sum(axis=0), grid.sum(axis=1), diagonals])

        if bool((totals == 3).any()):
            return 1
        if bool((totals == -3).any()):
            return -1
        if self.moves == 9:
            return 0
        return None

    def __str__(self):
        """Render the board as a 3x3 array of 'X', 'O' and '-'."""
        o_or_blank = np.where(self.board<0,'O','-')
        return str(np.where(self.board>0,'X',o_or_blank))

In [243]:
class Agent:
    """One side of a game, addressing board cells by flat index 0-8.

    ``game`` must expose a 3x3 ``board`` tensor (0 meaning empty) and a
    ``place(x, y)`` method. ``player`` is ``'X'``/``1`` or ``'O'``/``-1`` and
    is stored on ``self.player`` as 1 or -1.

    Raises:
        ValueError: if ``player`` is not one of the accepted values.
    """

    def __init__(self, game,player):
        self.game = game
        if player in ['X', 1]:
            self.player = 1
        elif player in ['O', -1]:
            self.player = -1
        else:
            # A bare assert would vanish under ``python -O`` and leave
            # ``self.player`` unset, so fail explicitly instead.
            raise ValueError(f"player must be 'X', 'O', 1 or -1, got {player!r}")

    def make_random_move(self):
        """Play a uniformly random empty cell; return what ``game.place`` returns."""
        return self.move(self.random_move())

    def random_move(self):
        """Return the flat index (a plain ``int``) of a random empty cell.

        Raises ``ValueError`` (from NumPy) when the board has no empty cell.
        """
        open_cells = torch.where(self.game.board.flatten() == 0)[0].tolist()
        # Cast so random and greedy actions share one Python type.
        return int(np.random.choice(open_cells))

    def move(self,action):
        """Place at flat index ``action`` (row = action // 3, col = action % 3)."""
        x,y = divmod(action,3)
        return self.game.place(x,y)

    def illegal(self,action):
        """Return True if the targeted cell is already occupied.

        ``action`` may be a ``(row, col)`` pair, as before, or a flat index
        0-8 like the one ``move`` and ``random_move`` use.
        """
        if np.ndim(action) == 0:
            row, col = divmod(action, 3)
        else:
            row, col = action[0], action[1]
        return bool(self.game.board[row, col] != 0)

    def Flatten(self):
        """Return the board as a flat 9-element tensor."""
        return self.game.board.flatten()

    def state(self):
        """Return a detached copy of the board shaped [1, 3, 3]."""
        return self.game.board.reshape([1,3,3]).clone()


In [4]:
# One step of experience: the state acted in, the action taken, the state
# that followed and the reward received.
Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))


class ReplayMemory:
    """Bounded FIFO buffer of ``Transition`` records.

    Once ``capacity`` entries are stored, each push drops the oldest one.
    """

    def __init__(self, capacity):
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        """Store one transition built from (state, action, next_state, reward)."""
        record = Transition(*args)
        self.memory.append(record)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored transitions chosen at random."""
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Number of transitions currently held."""
        return len(self.memory)

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T
class DQN(nn.Module):
    """Small network that scores the nine board cells for a given position.

    The board is fed to a 3-channel ``Conv1d`` and ``forward`` also reshapes
    it to 9 cells per sample, so the input is expected to be shaped
    (batch, 3, 3). ``h`` and ``w`` are accepted but not used. ``outputs`` is
    the size of the final layer; it has to line up with the 9-cell mask
    applied in ``forward``.
    """

    def __init__(self, h,w, outputs):
        super().__init__()

        feature_layers = [
            nn.Conv1d(in_channels=3,out_channels=20,kernel_size=2),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2),
            nn.Flatten(),
        ]
        head_layers = [
            nn.Linear(20, 32),
            nn.ReLU(),
            nn.Linear(32, 64),
            nn.ReLU(),
            nn.Linear(64,32),
            nn.ReLU(),
            nn.Linear(32,outputs),
            # The head ends in a softmax, so every raw score is positive.
            nn.Softmax(dim=1),
        ]
        self.stack = nn.Sequential(*feature_layers, *head_layers)

    def forward(self, x):
        """Return per-cell scores with occupied cells forced to zero."""
        batch_size = x.shape[0]
        flat_board = x.reshape([batch_size,9])
        scores = self.stack(x)
        # Cells hold -1, 0 or 1, so (cell**2 - 1) * -1 is 1 for an empty cell
        # and 0 for an occupied one. Because the softmax scores are positive,
        # an argmax over the result lands on an empty cell whenever one exists.
        empty_mask = (flat_board**2-1)*-1
        return empty_mask*scores

In [24]:
def select_action(agent, model, EPS_END, EPS_START, steps_done, EPS_DECAY):
    """Pick an action epsilon-greedily.

    The exploration threshold decays exponentially from ``EPS_START`` towards
    ``EPS_END`` as ``steps_done`` grows, at a rate set by ``EPS_DECAY``. One
    uniform draw decides the branch: above the threshold the argmax of
    ``model(agent.state())`` is returned, otherwise ``agent.random_move()``.
    """
    decay = math.exp(-1. * steps_done / EPS_DECAY)
    eps_threshold = EPS_END + (EPS_START - EPS_END) * decay

    exploit = random.random() > eps_threshold
    if not exploit:
        return agent.random_move()

    # Inference only: no autograd graph is needed to choose a move.
    with torch.no_grad():
        scores = model(agent.state())
    return scores.argmax().item()

In [240]:
def _optimize_model(policy_net, target_net, optimizer, criterion, memory,
                    BATCH_SIZE, GAMMA, device):
    """Run one DQN gradient step on a minibatch drawn from ``memory``.

    Returns without doing anything until ``memory`` holds at least
    ``BATCH_SIZE`` transitions.
    """
    if len(memory) < BATCH_SIZE:
        return

    transitions = memory.sample(BATCH_SIZE)
    batch = Transition(*zip(*transitions))

    # Terminal transitions carry next_state=None; their future value stays 0.
    non_final_mask = torch.tensor([s is not None for s in batch.next_state],
                                  device=device, dtype=torch.bool)

    state_batch = torch.cat(batch.state)
    action_batch = torch.tensor(batch.action, dtype=torch.long).reshape([len(batch.action),1])
    reward_batch = torch.tensor(batch.reward, dtype=torch.float32)

    # Value the policy net assigns to the action that was actually taken.
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    if non_final_mask.any():
        # Guarded: torch.cat fails on an empty list when every sample is terminal.
        non_final_next_states = torch.cat([s for s in batch.next_state if s is not None])
        next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0].detach()
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

    optimizer.zero_grad()
    loss.backward()
    # Clip the gradients, in place. The previous ``param.data.clamp(-1,1)``
    # was not in-place and targeted the weights, so it had no effect.
    for param in policy_net.parameters():
        if param.grad is not None:
            param.grad.clamp_(-1, 1)
    optimizer.step()


def train_model(policy_net, target_net, trainer='random',memory=None, num_games=10000,
                loss_reward=-1,win_reward=1,move_reward=.1,tie_reward=0,
               BATCH_SIZE=128, GAMMA=0.999, EPS_START=.95, EPS_END=.05,EPS_DECAY=200, TARGET_UPDATE=10,
               device='cpu', mem_len=10000, player='X', illegal_reward=-10):
    """Train ``policy_net`` with DQN by playing tic-tac-toe against ``trainer``.

    Args:
        policy_net: network being optimised; also used to choose moves.
        target_net: network used for bootstrapped targets. It is overwritten
            with ``policy_net``'s weights at the start and then every
            ``TARGET_UPDATE`` games.
        trainer: ``'random'`` for a random opponent, or a ``DQN`` whose
            argmax move is played.
        memory: existing ``ReplayMemory`` to keep filling; a new one of size
            ``mem_len`` is created when None.
        num_games: number of games to play.
        loss_reward, win_reward, move_reward, tie_reward: rewards for a loss,
            a win, a legal move that does not end the game, and a tie.
        BATCH_SIZE: minibatch size; no optimisation happens before the
            memory holds this many transitions.
        GAMMA: discount factor.
        EPS_START, EPS_END, EPS_DECAY: epsilon-greedy schedule handed to
            ``select_action``; the step counter advances once per game.
        TARGET_UPDATE: number of games between target-network syncs.
        device: device for the mask and next-state value tensors.
        mem_len: capacity of the replay memory created when ``memory`` is None.
        player: side being trained, ``'X'``/``1`` or ``'O'``/``-1``.
        illegal_reward: reward stored when the chosen cell is occupied.

    Returns:
        The tuple ``(policy_net, target_net, memory)``.

    Raises:
        ValueError: if ``trainer`` or ``player`` is not a supported value.
    """
    # Rebound at the start of every game; the random opponent reads it lazily.
    agent_opp = None

    if isinstance(trainer, str) and trainer == 'random':
        model_opp = lambda x: agent_opp.random_move()
    elif isinstance(trainer, DQN):
        def model_opp(x):
            # The opponent is not being trained here, so skip autograd.
            with torch.no_grad():
                return trainer(x).argmax().item()
    else:
        raise ValueError(f"trainer must be 'random' or a DQN, got {trainer!r}")

    if player in ['O',-1]:
        player, opponent = -1, 1
    elif player in ['X',1]:
        player, opponent = 1, -1
    else:
        raise ValueError(f"player must be 'X', 'O', 1 or -1, got {player!r}")

    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()
    steps_done = 0

    optimizer = optim.RMSprop(policy_net.parameters())
    criterion = nn.SmoothL1Loss()
    if memory is None:
        memory = ReplayMemory(mem_len)

    for i in tqdm(range(int(num_games))):
        steps_done += 1
        game = TicTacToe()
        agent = Agent(game, player)
        agent_opp = Agent(game, opponent)

        # When training as O, the opponent (X) opens the game.
        if agent.player == -1:
            agent_opp.move(model_opp(agent_opp.state()))

        while True:
            state = agent.state()
            action = select_action(agent, policy_net, EPS_END, EPS_START, steps_done, EPS_DECAY)

            if not agent.move(action):
                # Occupied cell: penalise, leave the board as is, try again.
                reward, done = illegal_reward, False
            else:
                w = game.checkWin()
                if w is None:
                    # Game still open after our move, so the opponent replies.
                    agent_opp.move(model_opp(agent_opp.state()))
                    w = game.checkWin()

                if w is None:
                    reward, done = move_reward, False
                elif w == 0:
                    reward, done = tie_reward, True
                elif w == agent.player:
                    reward, done = win_reward, True
                else:
                    reward, done = loss_reward, True

            # Mark finished games with None so they are not bootstrapped.
            next_state = None if done else agent.state()
            memory.push(state, action, next_state, reward)

            _optimize_model(policy_net, target_net, optimizer, criterion, memory,
                            BATCH_SIZE, GAMMA, device)

            if done:
                break

        # Sync on the game index. The within-game step only ever matched at
        # step 0, which synced the target on every single game.
        if i % TARGET_UPDATE == 0:
            target_net.load_state_dict(policy_net.state_dict())

    return policy_net, target_net, memory

In [244]:
# Two separately initialised DQN(3,3,9) networks on the CPU for the X side:
# the one to train and its target copy.
model_X, model_Xtarget = DQN(3,3,9).to('cpu'), DQN(3,3,9).to('cpu')

In [238]:
# Two separately initialised DQN(3,3,9) networks on the CPU for the O side:
# the one to train and its target copy.
model_O, model_Otarget = DQN(3,3,9).to('cpu'), DQN(3,3,9).to('cpu')

In [245]:
# Train model_X as X for 500 games against the default random opponent.
# The returned (policy_net, target_net, memory) tuple is not assigned, so the
# notebook echoes it below.
train_model(model_X, model_Xtarget, num_games=500, player='X')

  0%|          | 0/500 [00:00<?, ?it/s]

(DQN(
   (stack): Sequential(
     (0): Conv1d(3, 20, kernel_size=(2,), stride=(1,))
     (1): ReLU()
     (2): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
     (3): Flatten(start_dim=1, end_dim=-1)
     (4): Linear(in_features=20, out_features=32, bias=True)
     (5): ReLU()
     (6): Linear(in_features=32, out_features=64, bias=True)
     (7): ReLU()
     (8): Linear(in_features=64, out_features=32, bias=True)
     (9): ReLU()
     (10): Linear(in_features=32, out_features=9, bias=True)
     (11): Softmax(dim=1)
   )
 ),
 DQN(
   (stack): Sequential(
     (0): Conv1d(3, 20, kernel_size=(2,), stride=(1,))
     (1): ReLU()
     (2): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
     (3): Flatten(start_dim=1, end_dim=-1)
     (4): Linear(in_features=20, out_features=32, bias=True)
     (5): ReLU()
     (6): Linear(in_features=32, out_features=64, bias=True)
     (7): ReLU()
     (8): Linear(in_features=64, out_features=32, bias

In [246]:
# Train model_O as O for 500 games against the default random opponent.
# The returned (policy_net, target_net, memory) tuple is not assigned, so the
# notebook echoes it below.
train_model(model_O, model_Otarget, num_games=500, player='O')

  0%|          | 0/500 [00:00<?, ?it/s]

(DQN(
   (stack): Sequential(
     (0): Conv1d(3, 20, kernel_size=(2,), stride=(1,))
     (1): ReLU()
     (2): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
     (3): Flatten(start_dim=1, end_dim=-1)
     (4): Linear(in_features=20, out_features=32, bias=True)
     (5): ReLU()
     (6): Linear(in_features=32, out_features=64, bias=True)
     (7): ReLU()
     (8): Linear(in_features=64, out_features=32, bias=True)
     (9): ReLU()
     (10): Linear(in_features=32, out_features=9, bias=True)
     (11): Softmax(dim=1)
   )
 ),
 DQN(
   (stack): Sequential(
     (0): Conv1d(3, 20, kernel_size=(2,), stride=(1,))
     (1): ReLU()
     (2): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
     (3): Flatten(start_dim=1, end_dim=-1)
     (4): Linear(in_features=20, out_features=32, bias=True)
     (5): ReLU()
     (6): Linear(in_features=32, out_features=64, bias=True)
     (7): ReLU()
     (8): Linear(in_features=64, out_features=32, bias

In [254]:
def sim(game, modelX, agentX, modelO, agentO,verbose=False ):
    """
    Play ``game`` to the end, X moving first, and return the result:
            1 if X wins
            -1 if O wins
            0 if a tie

    Each turn the side to move maps its agent's state to an action with its
    model and plays it through the agent. With ``verbose`` the board is
    printed after every move. A game that is already decided is returned
    as-is without any move being played.
    """
    sides = ((modelX, agentX), (modelO, agentO))
    outcome = game.checkWin()
    ply = 0
    while outcome is None:
        # Even plies belong to X, odd plies to O.
        model, agent = sides[ply % 2]
        agent.move(model(agent.state()))
        outcome = game.checkWin()
        if verbose:
            print(game)
            print()
        ply += 1
    return outcome

In [266]:
def validate(model, opp_model, player='X',N=1000, verbose=False):
    """
    Play ``N`` games of ``model`` against ``opp_model`` and return the share
    of results from ``model``'s point of view:
    [Loss, Tie, Win]

    Args:
        model: ``'random'`` or a ``DQN``; the strategy being evaluated.
        opp_model: ``'random'`` or a ``DQN``; the opposing strategy.
        player: side ``model`` plays, ``'X'``/``1`` or ``'O'``/``-1``.
        N: number of games to play.
        verbose: forwarded to ``sim``, which prints the board after each move.

    Raises:
        ValueError: if ``player`` or either strategy is not a supported value.
    """
    if player in ('X', 1):
        model_side, opp_side, sign = 'X', 'O', 1
    elif player in ('O', -1):
        model_side, opp_side, sign = 'O', 'X', -1
    else:
        raise ValueError(f"player must be 'X', 'O', 1 or -1, got {player!r}")

    # Refilled for every game; the random strategies look their agent up lazily.
    agents = {}

    def make_strategy(spec, side):
        """Turn 'random' or a DQN into a state -> flat-action callable."""
        if isinstance(spec, str) and spec == 'random':
            return lambda _state: agents[side].random_move()
        if isinstance(spec, DQN):
            def greedy(state):
                # Evaluation only, so no autograd graph is needed.
                with torch.no_grad():
                    return spec(state).argmax().item()
            return greedy
        raise ValueError(f"strategy must be 'random' or a DQN, got {spec!r}")

    strategies = {
        model_side: make_strategy(model, model_side),
        opp_side: make_strategy(opp_model, opp_side),
    }

    stats = np.zeros([3])
    for _ in range(N):
        game = TicTacToe()
        agents['X'] = Agent(game, 'X')
        agents['O'] = Agent(game, 'O')
        outcome = sim(game, strategies['X'], agents['X'],
                      strategies['O'], agents['O'], verbose=verbose)
        # sim reports from X's side; flip the sign when the model plays O,
        # then shift -1/0/1 onto the Loss/Tie/Win slots.
        stats[int(outcome * sign) + 1] += 1
    return stats /N

In [191]:
# Baseline: a random player as X against a random opponent.
# validate returns the shares as [Loss, Tie, Win].
stats0 = validate('random', opp_model='random', player='X')
stats0

array([0.253, 0.126, 0.621])

In [192]:
# model_X playing X against a random opponent.
stats1 = validate(model_X, opp_model='random', player='X')
stats1

array([0.102, 0.034, 0.864])

In [193]:
# model_O playing O against a random opponent.
stats2 = validate(model_O, opp_model='random', player='O')
stats2

array([0.215, 0.072, 0.713])

### The models playing as the other side

In [194]:
# model_X playing O against a random opponent.
stats3 = validate(model_X, opp_model='random', player='O')
stats3

array([0.576, 0.135, 0.289])

In [195]:
# model_O playing X against a random opponent.
stats4 = validate(model_O, opp_model='random', player='X')
stats4

array([0.179, 0.129, 0.692])

### The models playing against each other

In [261]:
# A single game (N=1) of model_X as X against model_O, with verbose=True so
# the board is printed after every move.
stats5 = validate(model_X, opp_model=model_O, player='X',N=1, verbose=True)
stats5

[['-' '-' '-']
 ['X' '-' '-']
 ['-' '-' '-']]

[['-' 'O' '-']
 ['X' '-' '-']
 ['-' '-' '-']]

[['-' 'O' '-']
 ['X' '-' 'X']
 ['-' '-' '-']]

[['-' 'O' 'O']
 ['X' '-' 'X']
 ['-' '-' '-']]

[['-' 'O' 'O']
 ['X' '-' 'X']
 ['X' '-' '-']]

[['-' 'O' 'O']
 ['X' 'O' 'X']
 ['X' '-' '-']]

[['-' 'O' 'O']
 ['X' 'O' 'X']
 ['X' '-' 'X']]

[['O' 'O' 'O']
 ['X' 'O' 'X']
 ['X' '-' 'X']]



array([1., 0., 0.])

In [137]:
# Tabulate the five result vectors, one row per experiment (stats1..stats5);
# the columns follow validate's [Loss, Tie, Win] order.
print(tabulate([stats1,stats2,stats3,stats4,stats5], showindex=[1,2,3,4,5], headers=['Loss','Tie','Win']))

      Loss    Tie    Win
--  ------  -----  -----
 1   0.27   0.091  0.639
 2   0.212  0.066  0.722
 3   0.531  0.187  0.282
 4   0.159  0.19   0.651
 5   0      0      1


## Train Models against each other

In [247]:
# Continue training model_X as X for 5000 games, now with model_O's argmax
# move as the opponent instead of a random one.
train_model(model_X, model_Xtarget, trainer=model_O, num_games=5000, player='X')

  0%|          | 0/5000 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [250]:
# Continue training model_O as O for 500 games, now with model_X's argmax
# move as the opponent instead of a random one.
train_model(model_O, model_Otarget, trainer=model_X, num_games=500, player='O')

  0%|          | 0/500 [00:00<?, ?it/s]

(DQN(
   (stack): Sequential(
     (0): Conv1d(3, 20, kernel_size=(2,), stride=(1,))
     (1): ReLU()
     (2): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
     (3): Flatten(start_dim=1, end_dim=-1)
     (4): Linear(in_features=20, out_features=32, bias=True)
     (5): ReLU()
     (6): Linear(in_features=32, out_features=64, bias=True)
     (7): ReLU()
     (8): Linear(in_features=64, out_features=32, bias=True)
     (9): ReLU()
     (10): Linear(in_features=32, out_features=9, bias=True)
     (11): Softmax(dim=1)
   )
 ),
 DQN(
   (stack): Sequential(
     (0): Conv1d(3, 20, kernel_size=(2,), stride=(1,))
     (1): ReLU()
     (2): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
     (3): Flatten(start_dim=1, end_dim=-1)
     (4): Linear(in_features=20, out_features=32, bias=True)
     (5): ReLU()
     (6): Linear(in_features=32, out_features=64, bias=True)
     (7): ReLU()
     (8): Linear(in_features=64, out_features=32, bias