# Balancing Cart Pole

An alternative implementation of the Cart Pole balancing algorithm using PyTorch. The main ideas come from the great book Deep Reinforcement Learning Hands-On by Maxim Lapan.

In [1]:
import gym
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim

HIDDEN_SIZE = 128  # number of units in the policy network's hidden layer
BATCH_SIZE = 20    # number of episodes to complete before each agent update
PERCENTILE = 0.8   # reward quantile (0-1) used as the cutoff when selecting the episodes to train on

In [2]:
# Create the CartPole-v0 environment the agent interacts with.
env = gym.make("CartPole-v0")

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [4]:
# Policy network: maps the 4-value observation to one raw score (logit) for
# each of the 2 actions.
agent = nn.Sequential(
    nn.Linear(4, HIDDEN_SIZE),
    nn.ReLU(),
    nn.Linear(HIDDEN_SIZE, 2),
)
# Pass the dimension explicitly; the implicit choice made by a bare
# nn.Softmax() is deprecated.
sm = nn.Softmax(dim=-1)

# CrossEntropyLoss works on raw logits, so the softmax above is used only
# for sampling actions and never inside the loss.
objective = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=agent.parameters(), lr=0.01)

episodes = []          # finished episodes collected for the current batch
state_history = []     # observations seen in the episode in progress
action_history = []    # actions taken in the episode in progress
episode_reward = 0.0   # total reward of the episode in progress
batch_episode_count = 0
state = env.reset()

n_episodes = 0         # total number of episodes played so far

while True:
    # Sample an action from the current policy. No gradients are needed
    # while collecting experience, so skip building the autograd graph.
    with torch.no_grad():
        action_prob = sm(agent(torch.FloatTensor(state)))
    action = np.random.choice([0, 1], p=action_prob.numpy())

    # Record the observation the action was chosen for, then act.
    state_history.append(state)
    action_history.append(action)
    state, reward, done, info = env.step(action)
    episode_reward += reward

    if not done:
        continue

    n_episodes += 1
    episodes.append([episode_reward,
                     np.array(state_history),
                     np.array(action_history)])

    # Train once BATCH_SIZE episodes have been collected.
    if len(episodes) == BATCH_SIZE:
        df = pd.DataFrame(episodes, columns=['reward', 'state', 'action'])
        p = df['reward'].quantile(PERCENTILE)
        # Print some stats
        print(n_episodes, p)
        if p == 200.0:
            print('WIN!!!')
            break

        # Keep the episodes at or above the reward cutoff. The comparison
        # is inclusive on purpose: with a strict ">" the selection is empty
        # whenever the best episodes tie at the cutoff, and np.concatenate
        # then fails on an empty sequence.
        elite = df[df['reward'] >= p]

        # One supervised step: push the policy towards the actions that
        # were taken in the selected episodes.
        optimizer.zero_grad()
        action_scores = agent(torch.FloatTensor(
            np.concatenate(elite['state'].values)))
        taken_actions = torch.LongTensor(
            np.concatenate(elite['action'].values))
        loss = objective(action_scores, taken_actions)
        loss.backward()
        optimizer.step()

        # Start a new batch.
        episodes = []

    # Reset the per-episode holders and start the next episode.
    episode_reward = 0.0
    state_history = []
    action_history = []
    state = env.reset()



20 36.400000000000006
40 22.800000000000008
60 27.200000000000003
80 37.2
100 44.400000000000006
120 55.20000000000001
140 31.400000000000006
160 48.00000000000001
180 38.40000000000002
200 48.400000000000006
220 89.60000000000002
240 89.4
260 95.4
280 130.6
300 93.40000000000003
320 86.20000000000002
340 95.2
360 117.4
380 133.8
400 117.2
420 145.60000000000002
440 148.0
460 174.60000000000002
480 136.0
500 149.4
520 176.6
540 175.0
560 179.20000000000005
580 200.0
WIN!!!
