In [4]:
import gym
import torch
from torch import nn, optim
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
from copy import deepcopy

In [5]:
print(torch.cuda.is_available())

True


In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [7]:
env = gym.make('LunarLander-v2')
env.seed(5)
print('State shape: ', env.observation_space)
print('Number of Actions: ', env.action_space)

State shape:  Box(-inf, inf, (8,), float32)
Number of Actions:  Discrete(4)


In [8]:
#Code to check if env is working correctly 
env.reset()
for _ in range(1000):
    env.render()
    env.step(env.action_space.sample()) # take a random action
env.close()

In [9]:
next_state, reward, done, data = env.step(2)
reward
reward = torch.from_numpy(np.array([reward])).to(device)
print(reward)
next_state = torch.from_numpy(next_state).to(device)
print(next_state)

tensor([-100], device='cuda:0', dtype=torch.int32)
tensor([-9.2983e-02, -1.4844e-03, -1.0857e-02,  5.6890e-02, -9.9618e-04,
         1.8996e-01,  1.0000e+00,  1.0000e+00], device='cuda:0')


In [12]:
class Network(nn.Module):
    def __init__(self,seed):
        super(Network, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(8, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 4)
    
    def forward(self, state):
        h = F.relu(self.fc1(state))
        h = F.relu(self.fc2(h))
        y = self.fc3(h)
        return y

TypeError: cuda() missing 1 required positional argument: 'self'

In [15]:
a = torch.from_numpy(env.reset())
a

tensor([ 6.6900e-04,  1.4214e+00,  6.7750e-02,  4.6374e-01, -7.6845e-04,
        -1.5346e-02,  0.0000e+00,  0.0000e+00])

In [21]:
m = Network(5)
m.cuda()
c = m(next_state)
c


tensor([-0.0775, -0.1121,  0.0074, -0.0513], device='cuda:0',
       grad_fn=<AddBackward0>)

In [23]:
class QLearningAgent(object):
    def __init__(self,alpha,gamma,epsilon,n_eps,N,C,M,seed):
        self.memory = []
        self.memory_max = N
        self.target_update = C
        self.Q_t = Network(seed).to(device)
        self.Q = Network(seed).to(device)
        self.alpha = alpha
        self.optimizer = optim.SGD(self.Q.parameters(), lr=self.alpha)
        self.gamma = gamma
        self.epsilon = epsilon
        self.seed = seed
        self.C = C
        self.n_eps = n_eps
        self.mini_batch_size = M
        self.env = gym.make('LunarLander-v2')
        env.seed(seed)
    
    def store_memory(state,action,reward,next_state,done = 0):
        reward = np.array([reward],dtype = float)
        action = np.array([action],dtype = int)
        done = np.array([done],dtype = int)
        self.memory.append((state,action,reward,next_state,done))
    
    def sample_memory(M):
        batch = np.array(random.sample(self.memory, k=M),dtype = object)
        batch = batch.T
        batch = batch.tolist()
        return (torch.tensor(batch[0]).to(device),torch.tensor(batch[1]).to(device),torch.tensor(batch[2],dtype = torch.float).to(device),torch.tensor(batch[3]).to(device),torch.tensor(batch[4]).to(device))
    
    def solve(self):
        states = self.env.observation_space
        actions = self.env.action_space
        np.random.seed(self.seed)
        count = 0
        scores = []
        for eps in range(self.n_eps):
            state = torch.from_numpy(self.env.reset()).to(device)
            score = 0
            for i in range(1000000):
                greed = np.random.random()
                #Feed Forward once to predict the best action for current state
                self.Q.eval()
                with torch.no_grad():
                    actions = self.Q(state)
                self.Q.train()
                if greed < self.epsilon:
                    action = np.random.randint(0, 4)
                else:
                    action = np.argmax(actions.detach().numpy())
                next_state, reward, done, data = self.env.step(action)
                score+=reward
                self.store_memory(state,action,reward,next_state,done)
                
                if len(self.memory)<self.mini_batch_size:
                    break
                else:
                    transitions = self.sample_memory(self.mini_batch_size)
                
                state,action,reward,next_state,done = transitions
                Q_t = self.Q_t(next_states).detach()
                Q_tmax = Q_t.max(1)[0].unsqueeze(1)
                case2 = rewards + self.gamma * Q_tmax
                case1 = rewards
#                 y_j = rewards + (gamma * Q_tmax * (1-done))  
                y = torch.where(done<1,case2,case1)    
                Q = self.Q(states).gather(1, actions)
                loss = F.mse_loss(Q, y)
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
                    
                if count == self.C:
                    self.Q_t = deepcopy(self.Q)
#                     self.Q_t.load_state_dict(self.Q.state_dict())
                
                state = deepcopy(next_state)
                if done:
                    break
            scores.append(score)
            print(scores)


In [33]:
# model = TargetPolicy()
# criterion = nn.NLLLoss()
# optimizer = optim.SGD(model.parameters(), lr=0.003)
# print(model.parameters())

<generator object Module.parameters at 0x000002110B77D2E0>


In [35]:
s = torch.from_numpy(env.reset())
s

tensor([-0.0029,  1.4095, -0.2904, -0.0634,  0.0033,  0.0658,  0.0000,  0.0000])