In [None]:
# Notebook by Visarg, Avil and Aryan

In [None]:
#Training code for task3

In [None]:
# Output: the score of every episode (printed with its episode number) and the training log (a plot of episodes vs. scores)

In [None]:
# --versions--
# numpy==1.18.2
# gym==0.17.1
# Keras==2.3.1
# Keras-Applications==1.0.8
# Keras-Preprocessing==1.1.0
# tensorboard==2.1.1
# tensorflow==2.1.0
# tensorflow-estimator==2.1.0
# matplotlib==3.2.1

In [None]:
import numpy as np 
import gym 
from collections import deque
from keras.models import Model
from keras.layers import Input,Dense
from keras.optimizers import RMSprop 
import random
import matplotlib.pyplot as plt
from statistics import mean

In [None]:
class Agent():
    """Deep Q-Network agent for gym's ``CartPole-v1`` environment.

    The agent keeps a bounded replay memory of transitions, picks actions
    epsilon-greedily from a small fully connected Q-network, and fits that
    network on random minibatches of remembered transitions.
    """

    def __init__(self):
        self.env = gym.make('CartPole-v1')
        self.state_size = self.env.observation_space.shape[0]
        self.action_size = self.env.action_space.n

        self.gamma = 0.95  # discount factor applied to the next-state value
        self.learning_rate = 0.00025
        self.batch_size = 40

        # Epsilon-greedy exploration schedule (multiplicative decay with a floor).
        self.epsilon = 1
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.001

        self.memory = deque(maxlen = 2000)
        # Number of stored transitions required before learning (and epsilon
        # decay) starts. Shared by replay() and exploration().
        self.train_start = 850

        self.model = self.build_model()

    def build_model(self):
        """Build and compile the Q-network.

        Architecture: state -> Dense(512) -> Dense(256) -> Dense(64) ->
        Dense(action_size, linear); trained with MSE loss and RMSprop.

        Returns:
            The compiled Keras ``Model``.
        """
        input_shape=(self.state_size,)
        action_space=self.action_size
        X_in = Input(input_shape)
        X = Dense(512, input_shape=input_shape, activation="relu", kernel_initializer='he_uniform')(X_in)
        X = Dense(256, activation="relu", kernel_initializer='he_uniform')(X)
        X = Dense(64, activation="relu", kernel_initializer='he_uniform')(X)
        X = Dense(action_space, activation="linear", kernel_initializer='he_uniform')(X)
        model = Model(inputs = X_in, outputs = X, name='CartPole DQN model')
        model.compile(loss="mse", optimizer=RMSprop(lr=self.learning_rate, rho=0.95, epsilon=0.01), metrics=["accuracy"])
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory (oldest entries are evicted)."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action epsilon-greedily.

        Args:
            state: Observation shaped ``(1, state_size)`` as fed to the network.

        Returns:
            A random action with probability ``epsilon``, otherwise the action
            with the highest predicted Q-value.
        """
        if random.uniform(0,1) <= self.epsilon:
            return self.env.action_space.sample()
        else:
            return np.argmax(self.model.predict(state)[0])

    def replay(self):
        """Fit the Q-network on one random minibatch from the replay memory.

        Does nothing until at least ``train_start`` transitions are stored.
        Targets follow the one-step Q-learning rule, using the same network
        for the next-state estimate (there is no separate target network).
        """
        if len(self.memory) < self.train_start:
            return
        minibatch = random.sample(self.memory, min(len(self.memory), self.batch_size))

        # Size every buffer by the number of transitions actually sampled; it
        # is smaller than batch_size when the memory holds fewer entries.
        n_samples = len(minibatch)
        state = np.zeros((n_samples, self.state_size))
        next_state = np.zeros((n_samples, self.state_size))
        action, reward, done = [], [], []

        for i, (s, a, r, s_next, d) in enumerate(minibatch):
            state[i] = s
            action.append(a)
            reward.append(r)
            next_state[i] = s_next
            done.append(d)

        target = self.model.predict(state)
        target_next = self.model.predict(next_state)

        for i in range(n_samples):
            if done[i]:
                # Terminal transition: no future value to bootstrap from.
                target[i][action[i]] = reward[i]
            else:
                target[i][action[i]] = reward[i] + self.gamma * (np.amax(target_next[i]))
        self.model.fit(state, target, batch_size=self.batch_size, verbose=0)

    def exploration(self):
        """Decay epsilon by one step once the memory has passed the warm-up size.

        Epsilon is clamped so it never drops below ``epsilon_min``.
        """
        if len(self.memory) > self.train_start:
            if self.epsilon > self.epsilon_min:
                self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def _plot_training_log(self, episodes, final_scores):
        """Show the training log: final score of each episode vs. episode number."""
        plt.figure(1)
        plt.plot(episodes,final_scores)
        plt.xlabel('Episodes')
        plt.ylabel('Scores')
        plt.title('Training')
        plt.show()

    def train(self, max_episodes=500, target_score=500, model_path="model_t3.h5"):
        """Run the training loop.

        Training stops as soon as an episode reaches ``target_score``; the
        model is then saved to ``model_path``. The training log is plotted
        when training ends, whether or not the target was reached.

        Args:
            max_episodes: Maximum number of episodes to train for.
            target_score: Episode score (number of steps survived) at which
                training stops and the model is saved.
            model_path: File the trained model is saved to on success.
        """
        episodes=[]
        final_scores=[]
        for e in range(max_episodes):

            state = self.env.reset() # resetting env for each episode
            state = np.reshape(state, [1, self.state_size])

            score = 0 # score of the current episode
            done = False

            while not done:
                action = self.act(state) # choose an action for the current state
                next_state, reward, done, _ = self.env.step(action)
                next_state = np.reshape(next_state, [1, self.state_size])
                self.remember(state, action, reward, next_state, done)
                state = next_state
                self.exploration() # update epsilon
                score+=1 # the reward for surviving one step is 1

                if done: # episode is over
                    final_scores.append(score)
                    episodes.append(e+1)
                    print("Episode: {}, Score: {}".format(e+1, score))
                    if score >= target_score: # stop at the first episode that reaches the target
                        print("Number of episodes to train : {}".format(e+1))
                        print("--Saving trained model--")
                        self.model.save(model_path)
                        self._plot_training_log(episodes, final_scores)
                        return
                self.replay() # update the network weights

        # Target never reached: still show the training log so the run can be inspected.
        print("Target score {} not reached in {} episodes".format(target_score, max_episodes))
        self._plot_training_log(episodes, final_scores)

In [None]:
# Create the agent: this makes the CartPole-v1 environment and builds the Q-network.
agent = Agent()
# Run training; each episode's score is printed, and training stops early
# once an episode scores 500.
agent.train()