## PREPARE

Before running, upload (import) the following files into the working directory: gameLogic.py, enumerateOptions, goForwardGame, PPONetwork.py, actionIndexTable.pkl

### Set up tensorflow_version

The TensorFlow version used must be 1.x.

In [1]:
%tensorflow_version 1.4

`%tensorflow_version` only switches the major version: 1.x or 2.x.
You set: `1.4`. This will be interpreted as: `1.x`.


TensorFlow 1.x selected.


In [2]:
import tensorflow
# Show the concrete TensorFlow release that the runtime actually loaded.
print(tensorflow.__version__)

1.15.2


### Install baselines

In [3]:
# Fetch the OpenAI baselines source; the clone is created as ./baselines.
!git clone https://github.com/openai/baselines.git

Cloning into 'baselines'...
remote: Enumerating objects: 3627, done.[K
remote: Total 3627 (delta 0), reused 0 (delta 0), pack-reused 3627[K
Receiving objects: 100% (3627/3627), 6.46 MiB | 5.14 MiB/s, done.
Resolving deltas: 100% (2429/2429), done.


In [4]:
# Print the current working directory before changing into the clone.
!pwd

/content


In [5]:
# Change the notebook's working directory to the cloned repository.
%cd baselines/

/content/baselines


In [6]:
# Confirm the working directory (IPython automagic form of %pwd).
pwd

'/content/baselines'

In [7]:
# Editable (develop-mode) install of the package found in the current directory.
!pip install -e .


Obtaining file:///content/baselines
Collecting gym<0.16.0,>=0.15.4
[?25l  Downloading https://files.pythonhosted.org/packages/e0/01/8771e8f914a627022296dab694092a11a7d417b6c8364f0a44a8debca734/gym-0.15.7.tar.gz (1.6MB)
[K     |████████████████████████████████| 1.6MB 7.4MB/s 
Building wheels for collected packages: gym
  Building wheel for gym (setup.py) ... [?25l[?25hdone
  Created wheel for gym: filename=gym-0.15.7-cp37-none-any.whl size=1648842 sha256=eedc7a7bfabaa1bba55b999f0cb7fb818f461aae315c955a55280744267d454a
  Stored in directory: /root/.cache/pip/wheels/35/60/6a/f9c27ae133abaf5a5687ed2fa8ed19627d7fac5d843a27572b
Successfully built gym
[31mERROR: gym 0.15.7 has requirement cloudpickle~=1.2.0, but you'll have cloudpickle 1.3.0 which is incompatible.[0m
Installing collected packages: gym, baselines
  Found existing installation: gym 0.17.3
    Uninstalling gym-0.17.3:
      Successfully uninstalled gym-0.17.3
  Running setup.py develop for baselines
Successfully installed 

In [8]:
# Pin cloudpickle to 1.2.0: pip reported that gym 0.15.7 requires cloudpickle~=1.2.0.
!pip install cloudpickle==1.2.0


Collecting cloudpickle==1.2.0
  Downloading https://files.pythonhosted.org/packages/f1/95/d329d89318b85f29936e6b121a662d5cf276a37aac7920ad1acc29de9757/cloudpickle-1.2.0-py2.py3-none-any.whl
Installing collected packages: cloudpickle
  Found existing installation: cloudpickle 1.3.0
    Uninstalling cloudpickle-1.3.0:
      Successfully uninstalled cloudpickle-1.3.0
Successfully installed cloudpickle-1.2.0


In [9]:
# Step back up to the parent directory.
%cd ..

/content


In [10]:
# NOTE(review): re-enters the clone right after leaving it; looks like leftover exploration - confirm it is needed.
%cd baselines/

/content/baselines


In [11]:
# List the repository contents to check that setup.py is present.
!ls

baselines		 benchmarks_mujoco1M.htm  docs	     setup.cfg
baselines.egg-info	 data			  LICENSE    setup.py
benchmarks_atari10M.htm  Dockerfile		  README.md


In [12]:
# NOTE(review): second editable install of the same directory; appears redundant with the earlier install - confirm.
!pip install -e .

Obtaining file:///content/baselines
Installing collected packages: baselines
  Found existing installation: baselines 0.1.6
    Can't uninstall 'baselines'. No files were found to uninstall.
  Running setup.py develop for baselines
Successfully installed baselines


In [13]:
# Dump every installed package and version (long output; useful only for recording the environment).
!pip list

Package                       Version             Location          
----------------------------- ------------------- ------------------
absl-py                       0.12.0              
alabaster                     0.7.12              
albumentations                0.1.12              
altair                        4.1.0               
appdirs                       1.4.4               
argon2-cffi                   20.1.0              
arviz                         0.11.2              
astor                         0.8.1               
astropy                       4.2.1               
astunparse                    1.6.3               
async-generator               1.10                
atari-py                      0.2.9               
atomicwrites                  1.4.0               
attrs                         21.2.0              
audioread                     2.1.9               
autograd                      1.3                 
Babel                         2.9.1           

In [14]:
# Return to the parent directory before importing the project modules.
# Presumably this is where the uploaded .py/.pkl files live - confirm.
%cd ..

/content


## RUN


### Set up Neural Network

In [15]:
import numpy as np
from PPONetwork import PPONetwork, PPOModel
from goForwardGame import vectorizedGoForwardGames
import tensorflow as tf
import joblib
import copy
import time

In [16]:
def sf01(arr):
    """
    Merge the first two axes of ``arr`` after exchanging them.

    An array of shape (A, B, ...) is returned with shape (A * B, ...),
    ordered so that the original axis 1 varies slowest.
    """
    leading, second = arr.shape[0], arr.shape[1]
    trailing = tuple(arr.shape[2:])
    swapped = arr.swapaxes(0, 1)
    return swapped.reshape((leading * second,) + trailing)


In [17]:
class GoForwardPPOSimulation(object):
    """
    PPO training driver for a batch of vectorized go-forward games.

    One shared network (``trainingNetwork``) picks the action for whichever
    player is to move in every game. ``run`` collects a rollout and turns it
    into training targets with generalized advantage estimation; ``train``
    repeatedly calls ``run`` and optimizes ``trainingModel`` on the result.
    """

    # Seats per game. A rollout step belongs to one player's turn, so a
    # player's consecutive decisions are this many steps apart.
    nPlayers = 4
    # Size of the action space passed to PPONetwork / PPOModel.
    nActions = 8032

    def __init__(self, sess, *, inpDim = 732, nGames = 8, nSteps = 20, nMiniBatches = 4, nOptEpochs = 5, lam = 0.95, gamma = 0.995, ent_coef = 0.01, vf_coef = 0.5, max_grad_norm = 0.5, minLearningRate = 0.000001, learningRate, clipRange, saveEvery = 500):
        """
        Build the network, the PPO model and the vectorized environment.

        Args:
            sess: TensorFlow session used by the network and the model.
            inpDim: length of the state vector fed to the network.
            nGames: number of games played in parallel.
            nSteps: environment steps collected per call to ``run``.
            nMiniBatches: number of mini-batches each rollout is split into.
            nOptEpochs: optimization passes over each rollout.
            lam: GAE lambda.
            gamma: discount factor.
            ent_coef, vf_coef, max_grad_norm: forwarded to ``PPOModel``.
            minLearningRate: floor for the annealed learning rate.
            learningRate: initial learning rate (annealed linearly in ``train``).
            clipRange: initial PPO clip range (annealed linearly in ``train``).
            saveEvery: parameters, losses and episode infos are written every
                ``saveEvery`` updates.

        Raises:
            ValueError: if ``nSteps`` is not larger than the number of players;
                the first rollout would then have no steps left to train on.
        """
        if nSteps <= self.nPlayers:
            raise ValueError("nSteps must be greater than %d, got %r" % (self.nPlayers, nSteps))

        #network/model for training
        self.trainingNetwork = PPONetwork(sess, inpDim, self.nActions, "trainNet")
        self.trainingModel = PPOModel(sess, self.trainingNetwork, inpDim, self.nActions, ent_coef, vf_coef, max_grad_norm)

        #player networks which choose decisions - allowing for later on experimenting with playing against older versions of the network (so decisions they make are not trained on).
        #for now each player uses the same (up to date) network to make its decisions.
        self.playerNetworks = {}
        for player in range(1, self.nPlayers + 1):
            self.playerNetworks[player] = self.trainingNetwork
        self.trainOnPlayer = [True] * self.nPlayers

        tf.global_variables_initializer().run(session=sess)

        #environment
        self.vectorizedGame = vectorizedGoForwardGames(nGames)

        #params
        self.nGames = nGames
        self.inpDim = inpDim
        self.nSteps = nSteps
        self.nMiniBatches = nMiniBatches
        self.nOptEpochs = nOptEpochs
        self.lam = lam
        self.gamma = gamma
        self.learningRate = learningRate
        self.minLearningRate = minLearningRate
        self.clipRange = clipRange
        self.saveEvery = saveEvery

        self.rewardNormalization = 5.0 #divide rewards by this number (so reward ranges from -1.0 to 3.0)

        #test networks - keep network saved periodically and run test games against current network
        self.testNetworks = {}

        # final 4 observations need to be carried over (for value estimation and propagating rewards back)
        self.prevObs = []
        self.prevGos = []
        self.prevAvailAcs = []
        self.prevRewards = []
        self.prevActions = []
        self.prevValues = []
        self.prevDones = []
        self.prevNeglogpacs = []

        #episode/training information
        self.totTrainingSteps = 0
        self.epInfos = []
        self.gamesDone = 0
        self.losses = []

    def run(self):
        """
        Play ``nSteps`` steps in every game and build one training batch.

        The last ``nPlayers`` steps of each rollout are held back and prepended
        to the next one: their value estimates are needed to bootstrap the
        steps that are trained on, and their rewards may still be filled in
        when a game ends shortly afterwards.

        Returns:
            An iterator (``map``) yielding, in order, the flattened states,
            available-action masks, returns, actions, values and
            negative log-probabilities.
        """
        nP = self.nPlayers

        # Start from the steps carried over by the previous call. The copies are
        # shallow, so the per-step reward arrays / done lists are the same
        # objects and can still be updated in place below.
        nCarried = len(self.prevObs)
        mb_obs = list(self.prevObs)
        mb_pGos = list(self.prevGos)
        mb_actions = list(self.prevActions)
        mb_values = list(self.prevValues)
        mb_neglogpacs = list(self.prevNeglogpacs)
        mb_rewards = list(self.prevRewards)
        mb_dones = list(self.prevDones)
        mb_availAcs = list(self.prevAvailAcs)

        # The very first rollout has nothing carried over, so it trains on
        # nPlayers fewer steps; afterwards every rollout trains on nSteps steps.
        if nCarried == nP:
            endLength = self.nSteps
        else:
            endLength = self.nSteps - nP

        for _ in range(self.nSteps):
            currGos, currStates, currAvailAcs = self.vectorizedGame.getCurrStates()
            currStates = np.squeeze(currStates)
            currAvailAcs = np.squeeze(currAvailAcs)
            currGos = np.squeeze(currGos)
            actions, values, neglogpacs = self.trainingNetwork.step(currStates, currAvailAcs)
            rewards, dones, infos = self.vectorizedGame.step(actions)
            mb_obs.append(currStates.copy())
            mb_pGos.append(currGos)
            mb_availAcs.append(currAvailAcs.copy())
            mb_actions.append(actions)
            mb_values.append(values)
            mb_neglogpacs.append(neglogpacs)
            mb_dones.append(list(dones))
            mb_rewards.append(np.zeros((self.nGames,)))
            #now back assign rewards if state is terminal
            for i in range(self.nGames):
                if dones[i]:
                    reward = rewards[i]
                    # Each of the last nPlayers steps was taken by a different
                    # seat; give every one its own share of the final reward
                    # and mark it terminal. mb_pGos holds 1-based seat numbers.
                    for back in range(1, nP + 1):
                        mb_rewards[-back][i] = reward[mb_pGos[-back][i]-1] / self.rewardNormalization
                        mb_dones[-back][i] = True
                    self.epInfos.append(infos[i])
                    self.gamesDone += 1
                    print("Game %d finished. Lasted %d turns" % (self.gamesDone, infos[i]['numTurns']))

        # Hold back the tail for the next call. This must happen before the
        # lists are converted to arrays so the carried entries stay mutable
        # per-step objects.
        self.prevObs = mb_obs[endLength:]
        self.prevGos = mb_pGos[endLength:]
        self.prevRewards = mb_rewards[endLength:]
        self.prevActions = mb_actions[endLength:]
        self.prevValues = mb_values[endLength:]
        self.prevDones = mb_dones[endLength:]
        self.prevNeglogpacs = mb_neglogpacs[endLength:]
        self.prevAvailAcs = mb_availAcs[endLength:]

        mb_obs = np.asarray(mb_obs, dtype=np.float32)[:endLength]
        mb_availAcs = np.asarray(mb_availAcs, dtype=np.float32)[:endLength]
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32)[:endLength]
        mb_actions = np.asarray(mb_actions, dtype=np.float32)[:endLength]
        # Values keep the held-back tail for now: step t bootstraps from t + nP.
        mb_values = np.asarray(mb_values, dtype=np.float32)
        mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32)[:endLength]
        # Builtin bool: the np.bool alias was deprecated in NumPy 1.20 and later removed.
        mb_dones = np.asarray(mb_dones, dtype=bool)

        #discount/bootstrap value function with generalized advantage estimation:
        # one independent backward pass per offset k, stepping nPlayers at a time.
        mb_advs = np.zeros_like(mb_rewards)
        for k in range(nP):
            lastgaelam = 0
            for t in reversed(range(k, endLength, nP)):
                nextNonTerminal = 1.0 - mb_dones[t]
                nextValues = mb_values[t + nP]
                delta = mb_rewards[t] + self.gamma * nextValues * nextNonTerminal - mb_values[t]
                mb_advs[t] = lastgaelam = delta + self.gamma * self.lam * nextNonTerminal * lastgaelam

        mb_values = mb_values[:endLength]
        mb_returns = mb_advs + mb_values

        return map(sf01, (mb_obs, mb_availAcs, mb_returns, mb_actions, mb_values, mb_neglogpacs))

    def train(self, nTotalSteps):
        """
        Run PPO updates until roughly ``nTotalSteps`` environment steps are used.

        The number of updates is ``nTotalSteps // (nGames * nSteps)``. Learning
        rate and clip range decay linearly with the update index (the learning
        rate is floored at ``minLearningRate``). If an update leaves NaNs in the
        network parameters, the parameters from before that update are restored.
        Every ``saveEvery`` updates the parameters, losses and episode infos
        are written to disk.
        """
        nUpdates = nTotalSteps // (self.nGames * self.nSteps)

        for update in range(nUpdates):

            alpha = 1.0 - update/nUpdates
            lrnow = max(self.learningRate * alpha, self.minLearningRate)
            cliprangenow = self.clipRange * alpha

            states, availAcs, returns, actions, values, neglogpacs = self.run()

            batchSize = states.shape[0]
            self.totTrainingSteps += batchSize

            # Never let the stride drop to 0 (range() rejects a zero step).
            nTrainingBatch = max(1, batchSize // self.nMiniBatches)

            # Snapshot taken before optimizing, used to roll back on NaNs.
            currParams = self.trainingNetwork.getParams()

            mb_lossvals = []
            inds = np.arange(batchSize)
            for _ in range(self.nOptEpochs):
                np.random.shuffle(inds)
                for start in range(0, batchSize, nTrainingBatch):
                    end = start + nTrainingBatch
                    mb_inds = inds[start:end]
                    mb_lossvals.append(self.trainingModel.train(lrnow, cliprangenow, states[mb_inds], availAcs[mb_inds], returns[mb_inds], actions[mb_inds], values[mb_inds], neglogpacs[mb_inds]))
            lossvals = np.mean(mb_lossvals, axis=0)
            self.losses.append(lossvals)

            newParams = self.trainingNetwork.getParams()
            if any(np.isnan(param).any() for param in newParams):
                self.trainingNetwork.loadParams(currParams)

            if update % self.saveEvery == 0:
                name = "modelParameters" + str(update)
                self.trainingNetwork.saveParams(name)
                joblib.dump(self.losses,"losses.pkl")
                joblib.dump(self.epInfos, "epInfos.pkl")

In [None]:
# Re-check the TensorFlow version through the `tf` alias used by the training code.
print(tf.__version__)

1.15.2


### Main run

In [None]:
  # Open a TF1 session for the whole training run; the context manager closes it on exit.
  with tf.Session() as sess:
      # 64 parallel games with 20 environment steps per rollout.
      mainSim = GoForwardPPOSimulation(sess, nGames=64, nSteps=20, learningRate = 0.00025, clipRange = 0.2)
      start = time.time()
      # Total step budget; train() divides it by nGames * nSteps to get the number of updates.
      mainSim.train(1000000000)
      end = time.time()
      print("Time Taken: %f" % (end-start))
      






Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

