In [1]:
import numpy as np

In [2]:
class Game:
    def __init__(self):
        raise NotImplementedError
    def reset(self):
        raise NotImplementedError
    def current_state(self):
        raise NotImplementedError
        
    def possible_actions(self):
        raise NotImplementedError
    def play(self,action):
        #change state of the game
        #return reward
        raise NotImplementedError

In [4]:
from random import random, choice
class RPS(Game):
    def __init__(self):
        self.cs = None
        self.reset()
        
    def reset(self):
        self.cs = choice(["r", "p", "s"])
        
    def current_state(self):
        return self.cs
        
    def possible_actions(self):
        return ("r", "p", "s")
        
    def play(self,action):
        #rewards[state][action] = score for player
        rewards ={
            'r': {"r": 0, "p": 1, "s": -1},
            "p": {"r":-1, "p": 0, "s":  1},
            "s": {"r": 1, "p": -1, "s": 0}
        }
        reward = rewards[self.cs][action]
        self.cs = None
        return reward

In [15]:
class Bot:
    def __init__(self):
        raise NotImplementedError
    def decide(self, game):
        #look through all possible actions
        #pick 1
        #return action
        raise NotImplementedError
    def learn(self,old_state, reward):
        #update your q
        raise NotImplementedError

In [10]:
mygame = RPS()

In [11]:
mygame.play('p')

1

In [66]:
class RPSBot(Bot):
    def __init__(self):
        self.Q = {}
        self.default_value = 0.5
        
    def decide(self, game, epsilon = 0.1):        
        pa = game.possible_actions()
        cs = game.current_state()
        epsilon = 0 # for winning all the time 
        
        if cs not in self.Q:
            self.Q[cs] = {k: self.default_value for k in pa}
        
        should_greedy = random() > epsilon
        
        if should_greedy:
            #do greedy
            return max(self.Q[cs], key = self.Q[cs].get)
        else:
            # random
            return choice(pa)
        
    def learn(self,old_state,action,reward, learning_rate = 0.1):
        #update your q
        oldQ = self.Q[old_state][action]
        newQ = oldQ + learning_rate *(reward - oldQ)
        self.Q[old_state][action] = newQ

In [67]:
class GameShop:
    def __init__(self,game,player):
        self.game = game
        self.player = player
    
    def gym(self, epsilon, n = 10000):
        # force the bot sit there the play the game
        # slap if it fails give yoyos otherwise
        
        for i in xrange(n):
            self.game.reset()
            game_end = False
            while not game_end:
                old_state = self.game.current_state()
                action = self.player.decide(self.game,epsilon)
                reward = self.game.play(action)
                self.player.learn(old_state,action,reward)
                game_end = game.current_state() is None

In [70]:
bot = RPSBot()
game = RPS()

trainer = GameShop(game, bot)
trainer.gym(0.1)

score = 0
n = 1000
for i in xrange(n):
    game.reset()
    score+= game.play(trainer.player.decide(game))
print   score

1000


In [69]:
from pprint import pprint

pprint(bot.Q)

{'p': {'p': 0.45, 'r': 0.5, 's': 0.9999999999999996},
 'r': {'p': 0.9999999999999996, 'r': 0.5, 's': 0.5},
 's': {'p': 0.35, 'r': 0.9999999999999996, 's': 0.45}}
