# Cart Pole

Source code: https://github.com/seungeunrho/minimalRL/blob/7095e0f9ffa408959842528714ba687743661c14/REINFORCE.py

A pole is attached by an un-actuated joint to a cart, which moves along a frictionless track. The pendulum starts upright, and the goal is to prevent it from falling over by increasing or reducing the cart's velocity.

Libraries
- OpenAI Gym
- PyTorch



In [2]:
!pip3 install torch



In [10]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

# Hyperparameters
learning_rate = 0.0002  # step size handed to the Adam optimizer built in Policy
gamma         = 0.98  # discount factor used when accumulating the return in Policy.train_net

# Policy network trained with the REINFORCE policy-gradient method
class Policy(nn.Module):
    """Two-layer softmax policy network trained with REINFORCE.

    The network maps a 4-dimensional observation to a probability
    distribution over 2 actions (4 -> 128 -> 2, ReLU then softmax).
    Per-step ``(reward, probability_of_chosen_action)`` pairs are buffered
    with :meth:`put_data` and consumed by :meth:`train_net`, which performs
    a single optimizer step and clears the buffer.

    Args:
        lr: Learning rate for the Adam optimizer. When ``None`` (default)
            the module-level ``learning_rate`` is used.
        discount: Discount factor applied to the return. When ``None``
            (default) the module-level ``gamma`` is used.
    """

    def __init__(self, lr=None, discount=None):
        super().__init__()
        # Buffer of (reward, probability of the chosen action) tuples.
        self.data = []
        # Fall back to the module-level hyperparameters so that a plain
        # ``Policy()`` call behaves exactly as before.
        self.discount = gamma if discount is None else discount

        self.fc1 = nn.Linear(4, 128)  # observation (4) -> hidden (128)
        self.fc2 = nn.Linear(128, 2)  # hidden (128) -> action logits (2)
        self.optimizer = optim.Adam(
            self.parameters(),
            lr=learning_rate if lr is None else lr,
        )

    def forward(self, x):
        """Return action probabilities for the observation(s) ``x``.

        Softmax is taken over the last axis, so the result is a proper
        distribution for a single observation of shape ``(4,)`` and,
        row-wise, for a batch of shape ``(N, 4)``.
        """
        hidden = F.relu(self.fc1(x))
        return F.softmax(self.fc2(hidden), dim=-1)

    def put_data(self, item):
        """Append one ``(reward, probability_of_chosen_action)`` tuple."""
        self.data.append(item)

    def train_net(self):
        """Run one REINFORCE update from the buffered steps, then clear them.

        The buffer is walked backwards so the discounted return can be
        accumulated incrementally (``R = r + discount * R``). The per-step
        losses ``-log(prob) * R`` are summed and back-propagated once, which
        yields the same gradient as calling ``backward()`` on each term.
        Does nothing when the buffer is empty.
        """
        if not self.data:
            return

        self.optimizer.zero_grad()
        discounted_return = 0.0
        total_loss = 0.0
        for reward, prob in reversed(self.data):
            discounted_return = reward + self.discount * discounted_return
            # Minimising -log(pi(a|s)) * R performs gradient ascent on the
            # expected return.
            total_loss = total_loss - torch.log(prob) * discounted_return
        total_loss.backward()
        self.optimizer.step()
        self.data = []

def _reset_env(env):
    """Reset ``env`` and return only the initial observation."""
    result = env.reset()
    # gym >= 0.26 returns (observation, info); older releases return the
    # observation on its own.
    return result[0] if isinstance(result, tuple) else result


def _step_env(env, action):
    """Step ``env`` and return ``(observation, reward, done)``."""
    result = env.step(action)
    if len(result) == 5:
        # gym >= 0.26: (obs, reward, terminated, truncated, info)
        obs, reward, terminated, truncated, _info = result
        return obs, reward, terminated or truncated
    # Older gym: (obs, reward, done, info)
    obs, reward, done, _info = result
    return obs, reward, done


def main():
    """Train a ``Policy`` on CartPole-v1 with REINFORCE for 10000 episodes.

    After each episode the policy is updated once from that episode's
    buffered steps. Every ``print_interval`` episodes the average score of
    the episodes since the previous report is printed.
    """
    env = gym.make('CartPole-v1')
    pi = Policy()
    score = 0.0
    print_interval = 20
    # Number of episodes contributing to ``score``; the first reporting
    # window spans episodes 0..print_interval inclusive, so dividing by
    # ``print_interval`` would overstate the first average.
    episodes_in_window = 0

    try:
        for n_epi in range(10000):
            s = _reset_env(env)
            done = False

            while not done:
                # Probabilities of the two actions for the current state.
                prob = pi(torch.from_numpy(s).float())
                m = Categorical(prob)
                a = m.sample()
                s_prime, r, done = _step_env(env, a.item())
                # Store the reward with the probability of the chosen action.
                pi.put_data((r, prob[a]))
                s = s_prime
                score += r

            pi.train_net()
            episodes_in_window += 1

            if n_epi % print_interval == 0 and n_epi != 0:
                print("# of episode :{}, avg score : {}".format(
                    n_epi, score / episodes_in_window))
                score = 0.0
                episodes_in_window = 0
    finally:
        # Release the environment even if training is interrupted.
        env.close()
    
if __name__ == '__main__':
    main()

# of episode :20, avg score : 17.45
# of episode :40, avg score : 18.0
# of episode :60, avg score : 22.65
# of episode :80, avg score : 25.5
# of episode :100, avg score : 19.8
# of episode :120, avg score : 21.9
# of episode :140, avg score : 20.5
# of episode :160, avg score : 25.1
# of episode :180, avg score : 22.9
# of episode :200, avg score : 24.45
# of episode :220, avg score : 36.45
# of episode :240, avg score : 22.6
# of episode :260, avg score : 35.65
# of episode :280, avg score : 32.9
# of episode :300, avg score : 36.75
# of episode :320, avg score : 29.5
# of episode :340, avg score : 37.4
# of episode :360, avg score : 36.25
# of episode :380, avg score : 33.35
# of episode :400, avg score : 37.25
# of episode :420, avg score : 43.3
# of episode :440, avg score : 32.2
# of episode :460, avg score : 32.6
# of episode :480, avg score : 45.4
# of episode :500, avg score : 50.55
# of episode :520, avg score : 49.15
# of episode :540, avg score : 43.75
# of episode :560, a

# of episode :4340, avg score : 447.8
# of episode :4360, avg score : 428.5
# of episode :4380, avg score : 432.7
# of episode :4400, avg score : 424.3
# of episode :4420, avg score : 457.7
# of episode :4440, avg score : 462.15
# of episode :4460, avg score : 407.2
# of episode :4480, avg score : 432.95
# of episode :4500, avg score : 441.1
# of episode :4520, avg score : 458.95
# of episode :4540, avg score : 454.2
# of episode :4560, avg score : 450.2
# of episode :4580, avg score : 425.95
# of episode :4600, avg score : 462.8
# of episode :4620, avg score : 435.0
# of episode :4640, avg score : 443.35
# of episode :4660, avg score : 426.0
# of episode :4680, avg score : 420.9
# of episode :4700, avg score : 460.3
# of episode :4720, avg score : 431.15
# of episode :4740, avg score : 459.25
# of episode :4760, avg score : 419.1
# of episode :4780, avg score : 368.0
# of episode :4800, avg score : 340.7
# of episode :4820, avg score : 407.0
# of episode :4840, avg score : 423.45
# of

# of episode :8600, avg score : 491.5
# of episode :8620, avg score : 490.4
# of episode :8640, avg score : 476.7
# of episode :8660, avg score : 467.2
# of episode :8680, avg score : 431.85
# of episode :8700, avg score : 452.45
# of episode :8720, avg score : 463.85
# of episode :8740, avg score : 474.35
# of episode :8760, avg score : 481.95
# of episode :8780, avg score : 474.9
# of episode :8800, avg score : 500.0
# of episode :8820, avg score : 452.7
# of episode :8840, avg score : 488.4
# of episode :8860, avg score : 500.0
# of episode :8880, avg score : 491.95
# of episode :8900, avg score : 489.35
# of episode :8920, avg score : 488.4
# of episode :8940, avg score : 457.0
# of episode :8960, avg score : 488.15
# of episode :8980, avg score : 483.5
# of episode :9000, avg score : 497.3
# of episode :9020, avg score : 466.6
# of episode :9040, avg score : 491.1
# of episode :9060, avg score : 470.55
# of episode :9080, avg score : 500.0
# of episode :9100, avg score : 451.05
# 