# In [None]:
import gym
import numpy
import random
import pandas
import matplotlib
from matplotlib import pyplot as plt

class QLearn:
    """Tabular Q-learning agent with noise-based exploration.

    Q-values live in a dict keyed by ``(state, action)``. Pairs that have
    never been updated read as ``0.0`` through :meth:`getQ`.
    """

    def __init__(self, actions, epsilon, alpha, gamma, epsilon_decay):
        """Store the hyper-parameters and start with an empty Q-table.

        Args:
            actions: Indexable collection of the available actions.
            epsilon: Probability of taking the exploration branch in
                :meth:`chooseAction`.
            alpha: Learning rate applied to each Q-value update.
            gamma: Discount factor applied to the next state's best Q-value.
            epsilon_decay: Factor ``epsilon`` is multiplied by every time
                the exploration branch is taken.
        """
        self.q = {}
        self.epsilon = epsilon  # exploration probability
        self.alpha = alpha      # learning rate
        self.epsilon_decay = epsilon_decay
        self.gamma = gamma      # discount factor
        self.actions = actions

    def getQ(self, state, action):
        """Return the stored Q-value for ``(state, action)``, or 0.0 if unseen."""
        return self.q.get((state, action), 0.0)

    def learnQ(self, state, action, reward, value):
        """Move ``Q(state, action)`` towards the target ``value``.

        Q-learning update:
            Q(s, a) += alpha * (reward(s, a) + gamma * max_a' Q(s', a') - Q(s, a))

        ``value`` is the already-computed target (the reward plus the
        discounted best next Q-value). The first time a pair is seen its
        Q-value is initialised to ``reward`` alone rather than to ``value``.
        """
        key = (state, action)
        oldq = self.q.get(key)
        if oldq is None:
            self.q[key] = reward
        else:
            self.q[key] = oldq + self.alpha * (value - oldq)

    def chooseAction(self, state, return_q=False):
        """Pick an action for ``state``.

        With probability ``epsilon`` every Q-value is perturbed by uniform
        noise in ``[-mag / 2, mag / 2)``, where ``mag`` is the largest
        absolute Q-value for this state, and ``epsilon`` is decayed. The
        action with the highest (possibly perturbed) value is then chosen,
        with ties broken uniformly at random.

        Args:
            state: Key identifying the current state.
            return_q: When true, also return the list of values the choice
                was based on (including any exploration noise).

        Returns:
            The chosen action, or ``(action, q_values)`` if ``return_q``.
        """
        q = [self.getQ(state, a) for a in self.actions]
        maxQ = max(q)

        if random.random() < self.epsilon:
            # Epsilon only decays on steps that actually explore.
            self.epsilon *= self.epsilon_decay
            mag = max(abs(min(q)), abs(maxQ))
            # Add random values to all the actions, then recalculate maxQ.
            q = [value + random.random() * mag - .5 * mag for value in q]
            maxQ = max(q)

        # Several actions may share the maximum value; pick one of them at
        # random in that case.
        if q.count(maxQ) > 1:
            best = [i for i, value in enumerate(q) if value == maxQ]
            i = random.choice(best)
        else:
            i = q.index(maxQ)

        action = self.actions[i]
        if return_q:
            return action, q
        return action

    def learn(self, state1, action1, reward, state2):
        """Update ``Q(state1, action1)`` from one observed transition.

        The target is ``reward + gamma * max_a Q(state2, a)``; taking the
        maximum over all actions in ``state2`` makes this the off-policy
        Q-learning target.
        """
        maxqnew = max(self.getQ(state2, a) for a in self.actions)
        self.learnQ(state1, action1, reward, reward + self.gamma * maxqnew)

def build_state(features):
    """Fuse the features into one integer state id.

    Each feature is truncated to an int, rendered as decimal text, and the
    pieces are concatenated in order before being parsed back as an int.
    """
    digits = [str(int(value)) for value in features]
    return int("".join(digits))

def to_bin(value, bins):
    """Return the index of the bin that ``value`` falls into.

    ``bins`` holds the bin edges passed to ``numpy.digitize``; the single
    value is wrapped in a list and the first (only) index is returned.
    """
    bin_indices = numpy.digitize(x=[value], bins=bins)
    return bin_indices[0]


if __name__ == '__main__':
    # Train a tabular Q-learning agent on CartPole, print per-episode
    # progress, then dump the episode rewards and lengths to CSV files.
    import os

    env = gym.make('CartPole-v0')

    env = gym.wrappers.Monitor(env, r'C:\Users\zarat\OneDrive\Msc Data Science\Software Agents\Logs', force=True)

    max_number_of_steps = 200
    n_bins = 8
    n_bins_angle = 10

    # The continuous observation space is far too large for a table, so each
    # feature is discretised. Only the interior bin edges are kept, so a
    # feature cut into n bins maps to an index in 0..n-1; with 8/10/8/10 bins
    # that is at most 8 * 10 * 8 * 10 = 6400 distinct states.
    cart_position_bins = pandas.cut([-2.4, 2.4], bins=n_bins, retbins=True)[1][1:-1]
    pole_angle_bins = pandas.cut([-2, 2], bins=n_bins_angle, retbins=True)[1][1:-1]
    cart_velocity_bins = pandas.cut([-1, 1], bins=n_bins, retbins=True)[1][1:-1]
    angle_rate_bins = pandas.cut([-3.5, 3.5], bins=n_bins_angle, retbins=True)[1][1:-1]

    def observation_to_state(observation):
        """Discretise a four-element observation into an integer state id."""
        # NOTE(review): Gym documents the CartPole observation as
        # [cart position, cart velocity, pole angle, pole angular velocity].
        # If that holds here, the second and third names below are swapped,
        # i.e. velocity is binned with the [-2, 2] edges and angle with the
        # [-1, 1] edges. Left unchanged so the discretisation stays
        # identical -- confirm and fix deliberately.
        cart_position, pole_angle, cart_velocity, angle_rate_of_change = observation
        return build_state([to_bin(cart_position, cart_position_bins),
                            to_bin(pole_angle, pole_angle_bins),
                            to_bin(cart_velocity, cart_velocity_bins),
                            to_bin(angle_rate_of_change, angle_rate_bins)])

    # The Q-learning algorithm
    qlearn = QLearn(actions=range(env.action_space.n),
                    alpha=0.5, gamma=0.90, epsilon=1, epsilon_decay=0.999)

    episode_lengths = []   # steps survived in each episode that ended
    episode_rewards = []   # cumulated reward of every episode

    try:
        for i_episode in range(10000):
            state = observation_to_state(env.reset())
            cumulated_reward = 0

            for t in range(max_number_of_steps):
                # env.render()

                # Pick an action based on the current state
                action = qlearn.chooseAction(state)
                # Execute the action and get feedback
                observation, reward, done, info = env.step(action)

                # Digitize the observation to get a state
                nextState = observation_to_state(observation)

                if done:
                    # Episode over: learn from a large penalty and stop.
                    # NOTE(review): `done` is presumably also set when the
                    # environment's own step limit is reached, in which case
                    # a full-length episode is penalised too -- confirm.
                    reward = -500
                    qlearn.learn(state, action, reward, nextState)
                    episode_lengths.append(t + 1)
                    cumulated_reward += reward
                    break

                qlearn.learn(state, action, reward, nextState)
                state = nextState
                cumulated_reward += reward

            print("Overall score: {:0.2f}".format(numpy.mean(episode_lengths)))

            episode_rewards.append(int(cumulated_reward))
            print("Episode {:d} reward score: {:0.2f}".format(i_episode, cumulated_reward))
    finally:
        # Close the environment (and its monitor) even if training fails.
        env.close()

    # Persist the results as float columns. The step counts are written in
    # ascending order, not in episode order.
    rewards_frame = pandas.DataFrame(numpy.asarray(episode_rewards, dtype=float))
    steps_frame = pandas.DataFrame(sorted(float(steps) for steps in episode_lengths))
    rewards_frame.to_csv("default_Rewards.csv")
    steps_frame.to_csv("default_Steps.csv")

    # Hard-exit the interpreter once the CSV files are written; os._exit
    # terminates immediately without running normal interpreter cleanup.
    os._exit(0)
