In [1]:
import time

import gym
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical
from tqdm import tqdm

In [2]:
# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs end-to-end on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
class Policy(nn.Module):
    """Three-layer MLP policy over a discrete action set.

    Maps a state vector with ``N_s`` features to log-probabilities over
    ``N_a`` actions.

    Args:
        N_s: Number of input features (size of the last dimension of a state).
        N_a: Number of discrete actions (size of the output distribution).
    """

    def __init__(self,N_s,N_a):
        super().__init__()
        self.layer_1 = nn.Linear(N_s,200)
        self.layer_2 = nn.Linear(200,200)
        self.layer_3 = nn.Linear(200,N_a)
        self.N_s = N_s
        self.N_a = N_a

    def forward(self, s):
        """Return action log-probabilities for the state(s) ``s``.

        Args:
            s: A ``torch.Tensor`` (or subclass) whose last dimension is ``N_s``.

        Returns:
            Tensor with the same leading dimensions as ``s`` and a last
            dimension of size ``N_a``, holding log-probabilities that
            exponentiate to a distribution along that last dimension.

        Raises:
            AssertionError: If ``s`` is not a tensor.
        """
        # isinstance (rather than an exact type comparison) also accepts
        # Tensor subclasses such as nn.Parameter.
        assert isinstance(s, torch.Tensor)

        h = F.leaky_relu(self.layer_1(s))
        h = F.leaky_relu(self.layer_2(h))
        # No activation on the output layer: log_softmax needs unconstrained
        # logits. A leaky ReLU here would shrink every negative logit by the
        # negative slope (0.01) and distort the resulting distribution.
        logits = self.layer_3(h)
        return F.log_softmax(logits,dim=-1)

    def sample_action(self,s):
        """Sample one action index for a single state ``s``.

        Args:
            s: A 1-D state tensor of size ``N_s`` (``np.random.choice``
                requires a 1-D probability vector, so batches are not
                supported here).

        Returns:
            An integer action index in ``[0, N_a)`` drawn with NumPy's global
            random generator according to the policy's probabilities.
        """
        with torch.no_grad():
            P = torch.exp(self.forward(s)).cpu().numpy()
        # Renormalise in float64 so float32 rounding cannot trip
        # np.random.choice's "probabilities do not sum to 1" check.
        P = P.astype(np.float64)
        P /= P.sum()
        return np.random.choice(self.N_a,p=P)
            

In [4]:
# Policy network with 4 state inputs and 2 action outputs, placed on `device`.
policy = Policy(N_s=4, N_a=2)
policy.to(device)  # nn.Module.to moves the parameters in place and returns the module
# Adam over all policy parameters with a fixed learning rate.
optimizer = torch.optim.Adam(params=policy.parameters(), lr=3e-4)

In [5]:
# Environment the policy is trained on; the training cell resets, steps and
# finally closes this instance.
env = gym.make('CartPole-v0')

In [6]:
# Monte Carlo policy-gradient training: roll out one full episode with the
# current policy, then take one optimizer step on
#     -sum_t G_t * log pi(a_t | s_t)
# where G_t is the undiscounted sum of rewards from step t to the episode end.
render = False   # set True to draw the environment at every step
log_every = 1    # print the episode return every `log_every` episodes
start = time.time()

# Per-episode buffers (states, actions, rewards); rebuilt at each episode start.
Ss = []
As = []
Rs = []

try:
    for episode in range(1000):
        #Initialize
        Ss = []
        As = []
        Rs = []
        s = env.reset()
        if render:
            env.render()

        #Run Episode
        done = False
        while not done:
            Ss.append(s)
            s_tensor = torch.tensor(s,dtype=torch.float32,device=device)

            a = policy.sample_action(s_tensor)
            As.append(a)

            # Unpacks the 4-tuple step result (observation, reward, done, info).
            ns, r, done, info = env.step(a)
            Rs.append(r)
            s = ns

            if render and not done:
                env.render()

        #When Episode ends
        # Stack the states into one ndarray first: building a tensor straight
        # from a list of ndarrays is the slow path and triggers a warning.
        Ss = torch.tensor(np.asarray(Ss),dtype=torch.float32).to(device)
        T = len(Rs)

        #Train
        optimizer.zero_grad()
        log_probs = policy(Ss)

        # Reward-to-go: G[t] = Rs[t] + Rs[t+1] + ... + Rs[T-1] (no discounting).
        G = [0] * T
        g = 0
        for t in reversed(range(T)):
            g = g + Rs[t]
            G[t] = g

        # Pick log pi(a_t | s_t) for every step with a single gather instead of
        # a Python loop over timesteps.
        actions = torch.tensor(np.asarray(As),dtype=torch.long,device=device)
        returns = torch.tensor(G,dtype=torch.float32,device=device)
        taken_log_probs = log_probs.gather(1, actions.unsqueeze(1)).squeeze(1)
        objective = -(returns * taken_log_probs).sum()

        objective.backward()
        optimizer.step()
        if episode % log_every == 0:
            # G[0] is the total reward collected over the episode.
            print(G[0])
finally:
    # Release the environment even if training is interrupted or raises.
    env.close()

end = time.time()

print(f"done in {end - start}s")

26.0
41.0
26.0
21.0
22.0
23.0
18.0
15.0
45.0
14.0
34.0
19.0
39.0
44.0
16.0
23.0
10.0
17.0
15.0
11.0
11.0
10.0
15.0
18.0
29.0
19.0
16.0
20.0
30.0
67.0
51.0
20.0
18.0
19.0
14.0
14.0
14.0
12.0
23.0
12.0
27.0
15.0
11.0
44.0
19.0
16.0
20.0
22.0
10.0
21.0
30.0
31.0
59.0
51.0
26.0
40.0
40.0
37.0
27.0
40.0
18.0
49.0
13.0
23.0
43.0
18.0
27.0
48.0
26.0
13.0
51.0
21.0
23.0
11.0
26.0
24.0
42.0
12.0
20.0
36.0
24.0
17.0
9.0
30.0
18.0
15.0
15.0
40.0
25.0
27.0
40.0
40.0
12.0
29.0
17.0
35.0
16.0
40.0
16.0
15.0
13.0
43.0
63.0
28.0
27.0
31.0
18.0
16.0
62.0
20.0
21.0
17.0
45.0
33.0
38.0
23.0
18.0
46.0
45.0
28.0
11.0
56.0
37.0
16.0
57.0
31.0
28.0
10.0
24.0
31.0
40.0
62.0
16.0
37.0
31.0
15.0
39.0
24.0
14.0
91.0
21.0
62.0
17.0
37.0
29.0
56.0
38.0
58.0
118.0
37.0
67.0
24.0
29.0
25.0
30.0
31.0
26.0
58.0
44.0
16.0
13.0
39.0
34.0
98.0
25.0
38.0
38.0
89.0
99.0
29.0
46.0
17.0
34.0
31.0
49.0
61.0
51.0
59.0
30.0
72.0
110.0
69.0
95.0
45.0
59.0
105.0
29.0
43.0
22.0
11.0
62.0
81.0
12.0
17.0
48.0
39.0
43.0
58.0
45.0
36.