<a href="https://colab.research.google.com/github/varuncs2011/rl/blob/master/Welcome_To_Colaboratory.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<p><img alt="Colaboratory logo" height="45px" src="/img/colab_favicon.ico" align="left" hspace="10px" vspace="0px"></p>

<h1>What is Colaboratory?</h1>

Colaboratory, or "Colab" for short, allows you to write and execute Python in your browser, with 
- Zero configuration required
- Free access to GPUs
- Easy sharing

Whether you're a **student**, a **data scientist** or an **AI researcher**, Colab can make your work easier. Watch [Introduction to Colab](https://www.youtube.com/watch?v=inN8seMm7UI) to learn more, or just get started below!

In [0]:
import requests
import itertools
import numpy as np
from collections import defaultdict

new_game = 'http://codingforfun.pmdx.me/v2/new'
make_move = 'http://codingforfun.pmdx.me/v2/play'

class Game:

    def __init__(self):
        self.action_space = {}
        self.num_actions = 9
        self.game_code = ""
        self.robot_says = 0
        self.epsilon = 0.1

    def restart_game(self):
        resp = requests.post(url=new_game)
        self.game_code = resp.json()['game_code']
        self.robot_says = resp.json()['robot_says']

    def createEpsilonGreedyPolicy(self, Q, state, prev_action):


        action_probabilities = np.ones(9, dtype=float) * self.epsilon / self.num_actions

        best_action = np.argmax(Q[state][prev_action])
        action_probabilities[best_action] += (1.0 - self.epsilon)
        return action_probabilities


    def qlearning(self, num_episodes, discount_factor=0.91, alpha=0.6):


        # state -> (action -> action-value).
        Q = defaultdict(lambda: np.zeros((self.num_actions, self.num_actions)))


        # For every episode
        for i_episode in range(num_episodes):
            if i_episode > 1200:
                self.epsilon = 0

            # Reset the environment and pick the first action
            self.restart_game()
            prev_action = 0

            for _ in itertools.count():


                state = self.robot_says
                # get probabilities of all actions from current state
                action_probabilities = self.createEpsilonGreedyPolicy(Q, state, prev_action)

                # choose action according to
                # the probability distribution
                action = np.random.choice(np.arange(len(action_probabilities)), p=action_probabilities)

                # take action and get reward, transit to next state
                next_state, reward, done = self.step(action)

                # TD Update
                best_next_action = np.argmax(Q[next_state][action])
                td_target = reward + discount_factor * Q[next_state][action][best_next_action]
                td_delta = td_target - Q[state][prev_action][action]
                Q[state][prev_action][action] += alpha * td_delta


                if done:
                    break
                prev_action = action
        return Q

    def step(self, action):
        params = {
            "game_code": self.game_code,
            "agent_says": int(action+1)
        }
        resp = requests.post(url=make_move, json=params)
        agent_rounds = resp.json()['agent_rounds']
        robot_rounds = resp.json()['robot_rounds']

        current_round = str(resp.json()['outcome']).split(" ")[0]
        done = False

        if agent_rounds == 5 or robot_rounds == 5:
            done = True
            robot_says = 0
        else:
            robot_says = resp.json()['robot_says']

        if current_round == 'Agent':
            return int(robot_says), 1, done
        else:
            return int(robot_says), -1, done


run = Game()
Q  = run.qlearning(1500)


In [26]:
class Game1:

    def __init__(self):
        self.action_space = {}
        self.num_actions = 9
        self.game_code = ""
        self.robot_says = 0
        self.epsilon = 0.1
        self.agent_wins = 0
        self.robot_wins = 0

    def restart_gamet(self):
        resp = requests.post(url=new_game)
        self.game_code = resp.json()['game_code']
        self.robot_says = resp.json()['robot_says']

    def createEpsilonGreedyPolicyt(self, Q, state, prev_action):

        action_probabilities = np.ones(9, dtype=float) * self.epsilon / self.num_actions

        best_action = np.argmax(Q[state][prev_action])
        action_probabilities[best_action] += (1.0 - self.epsilon)
        return action_probabilities

    def qlearningt(self, Q_Value, num_episodes, discount_factor=0.91, alpha=0.6):


        # state -> (action -> action-value).
        Q = Q_Value


        arr_reward = []
        # For every episode
        for i_episode in range(num_episodes):
            self.epsilon = 0

            # Reset the environment and pick the first action
            self.restart_gamet()
            prev_action = 0

            for _ in itertools.count():


                state = self.robot_says
                # get probabilities of all actions from current state
                action_probabilities = self.createEpsilonGreedyPolicyt(Q, state, prev_action)

                # choose action according to
                # the probability distribution
                action = np.random.choice(np.arange(len(action_probabilities)), p=action_probabilities)

                # take action and get reward, transit to next state
                next_state, reward, done = self.stept(action)

                arr_reward.append(reward)

                # TD Update
                best_next_action = np.argmax(Q[next_state][action])
                td_target = reward + discount_factor * Q[next_state][action][best_next_action]
                td_delta = td_target - Q[state][prev_action][action]
                Q[state][prev_action][action] += alpha * td_delta


                if done:
                    break
                prev_action = action
        print("Agent "+ str(self.agent_wins))
        print("Robot "+ str(self.robot_wins))
        
    def stept(self, action):
        params = {
            "game_code": self.game_code,
            "agent_says": int(action+1)
        }
        resp = requests.post(url=make_move, json=params)
        agent_rounds = resp.json()['agent_rounds']
        robot_rounds = resp.json()['robot_rounds']

        current_round = str(resp.json()['outcome']).split(" ")[0]
        done = False

        if agent_rounds == 5 or robot_rounds == 5:
            done = True
            robot_says = 0
            if current_round=='Robot':
              self.robot_wins += 1
            else:
              self.agent_wins += 1
        else:
            robot_says = resp.json()['robot_says']

        if current_round == 'Agent':
            return int(robot_says), 1, done
        else:
            return int(robot_says), -1, done

run = Game1()
run.qlearningt(Q, 200)

Agent 200
Robot 0


In [55]:
import pandas

temp = dict(Q)
row_labels = ['1', '2', '3', '4','5', '6', '7', '8','9']
column_labels = ['1', '2', '3', '4','5', '6', '7', '8','9']

for key, value in temp.items():
  print(key)
  print("***")
  for t in itertools.count():
    print(str(t) + ":" + str(value[t].argmax()))
    if t==8:
      break
    

  


7
***
0:8
1:6
2:1
3:8
4:0
5:8
6:8
7:0
8:0
4
***
0:8
1:7
2:6
3:5
4:5
5:4
6:0
7:0
8:0
3
***
0:8
1:7
2:4
3:2
4:2
5:0
6:3
7:2
8:0
0
***
0:0
1:0
2:0
3:0
4:0
5:0
6:0
7:0
8:0
2
***
0:8
1:8
2:5
3:6
4:5
5:3
6:2
7:2
8:0
5
***
0:8
1:5
2:0
3:3
4:7
5:4
6:8
7:0
8:0
6
***
0:8
1:7
2:4
3:4
4:8
5:1
6:1
7:3
8:0
8
***
0:8
1:6
2:8
3:8
4:7
5:8
6:4
7:0
8:0
