# QL Agent #

Toy environment: a square field in which maize, bean, and squash plants are sown over a growing season.

In [None]:
from tqdm import tqdm
import gym
import itertools
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sys
from gym import error, spaces, utils
from gym.utils import seeding
from enum import Enum
import plotting
import collections
from itertools import combinations
  
matplotlib.style.use('default')

In [None]:
class Plant:
    """A single plant growing in the field.

    Attributes:
        species: Name of the plant kind (e.g. 'Maize', 'Bean', 'Squash').
        maturity: Age at which the plant counts as mature.
        age: Current age; starts at 0 and is advanced by the environment.
    """

    def __init__(self, species, maturity=10):
        self.age = 0
        self.maturity = maturity         # consider 'days_to_maturity'
        self.species = species

    def __repr__(self):
        # The representation is just the species name.
        return f"{self.species}"
    
def truncate(n):
    """Drop everything past the second decimal place of ``n`` (toward zero).

    The scaled value is rounded to 9 decimals before truncating so that binary
    floating-point error does not lose a whole hundredth: ``0.29 * 100`` is
    ``28.999999999999996``, which a bare ``int()`` would turn into 28.

    Args:
        n: A real number (Python or NumPy scalar).

    Returns:
        ``n`` truncated to two decimal places, as a float.
    """
    hundredths = int(round(n * 100, 9))
    return hundredths / 100
class Field(gym.Env):
    """Gym environment for sowing maize, beans and squash on a square field.

    Each call to ``step`` advances the calendar by one day and may sow one
    plant. The reward is 0 until the season is over, at which point it is the
    total yield computed by ``get_reward``.

    ``self.field`` is a 2-D object array with one row per plant:
    ``[x, y, Plant]``. It is ``None`` until ``reset()`` has been called.
    """
    #metadata = {'render.modes': ['human']}

    # Species sown for the integer part of an action's first component;
    # any other value sows nothing on that day.
    _SPECIES_BY_CHOICE = ('Maize', 'Bean', 'Squash')

    def __init__(self, size=5, sow_limit=200, season=120, calendar=0):
        """Store the field parameters and declare the action/observation spaces.

        Args:
            size: Side length of the field; sown coordinates are scaled by it.
            sow_limit: Stored on the instance; not read by any method here.
            season: Calendar value at which an episode ends.
            calendar: Initial calendar value (``reset()`` sets it back to 0).
        """
        # added to define action and observation spaces
        self.action_space = spaces.Box(np.array([0,0,0]),
                                       np.array([4,1,1]), dtype=np.float32)

        # reduced observation space to 1) standardize output and 2) remain consistent with genetic algo
        self.observation_space = spaces.Box(np.array([0,0,0]),np.array([360,360,360]),dtype=np.int64)

        # parameters for overall field character
        self.size = size
        self.sow_limit = sow_limit
        self.season = season
        self.calendar = calendar

        # constants for computing end-of-season reward---distances represent meters
        self.crowding_dist = .02
        self.maize_maize_dist = .1
        self.bean_support_dist = .1
        self.crowding_penalty = .1
        self.maize_maize_penalty = .9
        self.bean_support_bonus = .6

        # per-species count of plants sown via step(): [maize, bean, squash]
        self.observation = [0,0,0]
        # field is initialized by calling reset()
        self.field = None

    def step(self, action):
        """Advance one day, optionally sowing the plant described by ``action``.

        Args:
            action: Sequence ``(kind, x, y)``. ``int(kind)`` selects the
                species (0 maize, 1 bean, 2 squash, anything else: no sowing);
                ``x`` and ``y`` are truncated to two decimals and scaled by
                the field size.

        Returns:
            Tuple ``(field, reward, done, info)``. ``field`` is the full plant
            array (not ``self.observation``), ``reward`` is 0 until the season
            has ended, and ``info`` is an empty dict.
        """
        kind = int(action[0])
        if 0 <= kind < len(self._SPECIES_BY_CHOICE):
            seedling = [self.size * truncate(action[1]),
                        self.size * truncate(action[2]),
                        Plant(self._SPECIES_BY_CHOICE[kind])]
            self.field = np.append(self.field, [seedling], axis=0)
            self.observation[kind] += 1

        self.calendar += 1
        # every plant, including one sown this very step, ages by one day
        for row in self.field:
            row[2].age += 1

        # '>=' rather than '==' so an episode stepped past the end of the
        # season still reports done instead of running forever
        done = self.calendar >= self.season
        reward = self.get_reward() if done else 0
        return self.field, reward, done, {}

    def reset(self):
        """Start a new episode and return the initial plant array."""
        # field is initialized with one random corn plant in order to make sowing (by np.append) work
        self.field = np.array([[self.size * np.random.random(),
                                self.size * np.random.random(),
                                Plant('Maize')]])
        # timekeeping is reset
        self.calendar = 0
        self.observation = [0,0,0]
        # added to avoid returning none type
        return self.field

    def _select(self, species, mature):
        """Return the field rows of ``species`` whose maturity status is ``mature``.

        When no plant matches, a single ``[None, None]`` row is returned so the
        caller can still index columns 0 and 1.
        """
        rows = np.array([row for row in self.field
                         if row[2].species == species
                         and (row[2].age >= row[2].maturity) == mature])
        if rows.size == 0:
            rows = np.array([[None, None]])
        return rows

    def render(self, mode='human'):
        """Scatter-plot the field and print the current total yield.

        Mature plants are drawn at alpha .5, immature ones at alpha .1.
        """
        # (species, mature?, colour, marker size, alpha), in drawing order
        layers = (
            ('Maize', True, 'green', 200, .5),
            ('Bean', True, 'brown', 150, .5),
            ('Squash', True, 'orange', 400, .5),
            ('Maize', False, 'green', 200, .1),
            ('Bean', False, 'brown', 200, .1),
            ('Squash', False, 'orange', 200, .1),
        )
        plt.figure(figsize=(10, 10))
        for species, mature, colour, marker_size, alpha in layers:
            points = self._select(species, mature)
            plt.scatter(points[:,0], points[:,1], c=colour, s=marker_size,
                        marker = 'o', alpha=alpha, edgecolor='#303030')

        plt.show()

        print("Total yield in Calories is {}.\n---\n".format(round(self.get_reward(), 1)))

    def close(self):
        """Do nothing; there are no resources to release."""
        # unneeded right now? AFAICT this is only used to shut down realtime movie visualizations
        pass

    def _maize_yield(self, i, plants, distances):
        """Return the yield of the maize plant at index ``i``.

        Starts from 1 and visits every plant in field order: a bean within
        ``bean_support_dist`` adds a bonus, another maize within
        ``maize_maize_dist`` multiplies by a penalty, and any distinct plant
        within ``crowding_dist`` multiplies by the crowding penalty. The order
        matters because additions and multiplications are interleaved.
        """
        cal = 1
        for j, neighbour in enumerate(plants):
            gap = distances[i,j]
            if neighbour.species == 'Bean' and gap < self.bean_support_dist:
                cal += self.bean_support_bonus
            if (neighbour.species == 'Maize'
                    and i != j
                    and gap < self.maize_maize_dist):
                cal *= self.maize_maize_penalty
            # strictly positive gap excludes the plant itself
            if 0 < gap < self.crowding_dist:
                cal *= self.crowding_penalty
        return cal

    def get_reward(self):
        """Return the total yield of all mature plants currently in the field.

        Immature plants contribute nothing, beans .25 each, squash 3 each, and
        maize a neighbour-dependent amount (see ``_maize_yield``).
        """
        # array of plant coordinates for computing distances
        xy_array = np.array([[row[0], row[1]] for row in self.field])

        # distances[m,n] is distance from mth to nth plant in field
        distances = np.linalg.norm(xy_array - xy_array[:,None], axis=-1)

        plants = self.field[:,2]
        reward = 0
        for i, plant in enumerate(plants):
            if plant.age < plant.maturity:
                continue
            if plant.species == 'Maize':
                reward += self._maize_yield(i, plants, distances)
            elif plant.species == 'Bean':
                reward += .25
            elif plant.species == 'Squash':
                reward += 3
        return reward


In [None]:
# Build the field environment with its default settings (size=5, season=120).
env = Field()

In [None]:
# Show the Box spaces declared in Field.__init__.
print(env.action_space)
print(env.observation_space)

In [None]:
# testing the freshly reset environment: reset() seeds the field with a single
# randomly placed maize plant; render() plots the field and prints the yield
env.reset()
env.render()

Testing the environment's output using actions sampled at random from the action space

In [None]:
# Random-policy baseline: for each trial, play `episodes` episodes with
# uniformly sampled actions and keep the best end-of-season reward; print the
# average of those best rewards over all trials.
trials = 100
episodes = 50
best_reward_total = 0  # named to avoid shadowing the builtin sum()
for i in tqdm(range(trials)):
    rewardRandomList = []
    for episode in range(1, episodes + 1):
        state = env.reset()
        done = False
        score = 0
        while not done:
            action = env.action_space.sample()
            n_state, reward, done, info = env.step(action)
            score += reward
        rewardRandomList.append(env.get_reward())
    best_reward_total += max(rewardRandomList)
print(best_reward_total / trials)

In [None]:

# Play one episode with randomly sampled actions, then draw the final field.
state = env.reset()
score = 0
done = False
while True:
    action = env.action_space.sample()
    n_state, reward, done, info = env.step(action)
    score += reward
    if done:
        break
env.render()

In [None]:
# Shapes of the observation and action Box spaces (shape tuples, not counts
# of distinct states or actions).
states = env.observation_space.shape
actions = env.action_space.shape

In [None]:
# Display the two space shapes captured above.
print(states)
print(actions)

Begin Q-Learning Agent

In [None]:
# parameters
episodes = 50            # number of training episodes
# The discount factor must lie in [0, 1]. The previous value of 1.3 inflates
# the bootstrapped value at every step instead of discounting it.
discount_factor = 0.99
alpha = 0.8              # learning rate applied to each TD update

In [None]:
# Building the discrete action lookup tables: every (species, x, y) triple with
# species in 0..2 and x, y on a 0.01 grid in [0, 0.99] gets a consecutive index.
actionToIndex = {}
indexToAction = {}
grid = itertools.product(range(3), range(100), range(100))
for cnt, (species_idx, x_step, y_step) in enumerate(grid):
    action = [species_idx, x_step / 100, y_step / 100]
    actionToIndex[str(action)] = cnt
    indexToAction[cnt] = action
# total number of discrete actions
cnt = len(indexToAction)


In [None]:
# Filling Q Table: 120 rows (one per step of an episode), each holding one
# uniformly random initial value per discrete action.
Q = [[np.random.rand() for _ in range(len(actionToIndex))]
     for _ in range(120)]

In [None]:
# Defining activation function
def softmax(x):
    """Return the softmax of the scores in x, normalised over all entries.

    The maximum score is subtracted before exponentiating so large inputs
    cannot overflow.
    """
    shifted = x - np.max(x)
    exps = np.exp(shifted)
    total = exps.sum()
    return exps / total

In [None]:
# Running QL
# One Q-table row per step of an episode. For each episode, pick the action
# for step j from row j, apply it, and move Q[j][action] toward the TD target.
actionSet = {}
rewardList = []
horizon = len(Q)
for i in tqdm(range(episodes)):
    env.reset()
    temp = 0  # running sum of chosen action indices, stored per episode
    #print("iter: " + str(i))
    for j in range(horizon):
        #choose an action
        # NOTE: the argmax of a softmax is the argmax of the raw values, so
        # this selection is greedy.
        actionIndex = np.argmax(softmax(Q[j]))
        temp += actionIndex
        observation, reward, done, _ = env.step(indexToAction[actionIndex])
        if done or j + 1 >= horizon:
            # Terminal step: nothing to bootstrap from. This is the only step
            # with a non-zero reward, so it must update the table.
            td_target = reward
        else:
            best_next_action = np.argmax(Q[j+1])
            td_target = reward + discount_factor * Q[j+1][best_next_action]
        td_delta = td_target - Q[j][actionIndex]
        before = Q[j][actionIndex]
        Q[j][actionIndex] += alpha * td_delta
        print("j: " + str(j) + " actionIndex: " + str(actionIndex)+ " action: " + str(indexToAction[actionIndex]) + " before: " + str(before))
        print(Q[j][actionIndex])
        if done:
            break
    rewardList.append(env.get_reward())
    actionSet[i] = temp

    #env.render()
  
  #env.render()


In [None]:
# Visualizing reward over time
# The x-axis comes from the list itself, not from a loop variable left over
# from an earlier cell, so the two lengths always match.
plt.plot(range(len(rewardList)), rewardList)

In [None]:
# Rendering optimal actions: replay the highest-valued action of every
# Q-table row in order, then draw the resulting field.
env.reset()
for q_row in Q:
    action = np.argmax(q_row)
    env.step(indexToAction[action])
env.render()