In [1]:
!pip install -e C:\Users\asus_strix_scar\Anaconda3\Lib\site-packages\gym\envs\steering-optimizer\steering_optimizer\envs\github\steering-optimizer\

Obtaining file:///C:/Users/asus_strix_scar/Anaconda3/Lib/site-packages/gym/envs/steering-optimizer/steering_optimizer/envs/github/steering-optimizer
Installing collected packages: steering-optimizer
  Found existing installation: steering-optimizer 0.0.1
    Uninstalling steering-optimizer-0.0.1:
      Successfully uninstalled steering-optimizer-0.0.1
  Running setup.py develop for steering-optimizer
Successfully installed steering-optimizer


In [2]:
import numpy as np

In [3]:
import gym

In [4]:
for env in gym.envs.registry.env_specs.keys():
     if 'steering_optimizer' in env:
          print('Remove {} from registry".format(env)')
          del gym.envs.registry.env_specs[env]

In [5]:
from gym.envs.registration import register

register(
    id='steering_optimizer-v0',
    entry_point='steering_optimizer.envs:StrOptEnv',# kwargs={'WB': 1900, 'TW': 1200, 'KP': 150,'tr_min': 4000}
)

In [6]:
env = gym.make("steering_optimizer-v0")

In [7]:
#Fixed parameters

#Wheelbase, track width and kingpin distance

WB = 1900
TW = 1200
KP = 150

In [8]:
batch_size = 50
session_size = 150
t_max = 300
percentile = 80
hidden_size = 15
learning_rate = 0.01
completion_score = 200

In [9]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class Net(nn.Module):
        
    def __init__(self, obs_size, hidden_size, n_actions):
        super(Net, self).__init__()
        self.fc0 = nn.Linear(obs_size, hidden_size)
        self.fc1 = nn.Linear(hidden_size, n_actions)
        
    def forward(self, x):
        x = F.relu(self.fc0(x))
        return self.fc1(x)

    def generate_batch(self, env, batch_size, t_max):
        
        activation_1 = nn.Softmax(dim=1)
        batch_actions, batch_states, batch_rewards = [],[],[]
        # Reset only once in a batch
        s_0 = env.reset()
        
        for b in range(batch_size):
            states,actions = [],[]
            total_reward = 0
            
            #s = env.reset()
            # Imitating reset, with the same state for every batch member
            env.state = s_0
            
            s = s_0
            
            env.steps_beyond_done = None
            env.error = None
            env.max_r = None
            env.steps_since_reset = 0
            env.total_reward = 0
            
            #print('state shape after reset:', s.shape)
            s = s.reshape((1,4))
            
            #print('state shape after reshape:', s.shape)
            
            # step counter
            count = 0
            
            for t in range(t_max):
                count +=1
                
                # Scaling state to be within -1 and 1
                s = s/TW*2
                
                s_v = torch.FloatTensor([s]).reshape(((1,4)))
                #print('state_shape',s_v.shape)
                #s_v = torch.FloatTensor([s])
                
                #print('state: x',s_v)
                act_probs_v = activation_1(self.forward(s_v))
                act_probs = act_probs_v.data.numpy()[0]
                choice_num = len(act_probs)
                #print('num:',choice_num)
                a = np.random.choice(choice_num, p=act_probs)
                

                new_s, r, done, info = env.step(a)
                
                

                states.append(s)
                actions.append(a)
                
                #print('new state',new_s)
                
                total_reward += r
                
                #print('action:',a, 'reward: ', r, 'error:', env.error, 'total reward:', total_reward)
                
                # print('total reward:', total_reward)
                # Getting the new state               
                s = new_s/TW*2
                
                # After done, no more steps
                if done:
                    break
            
            batch_actions.append(actions)
            #print('Batch member done')
            #print('Batch actions:', batch_actions[-1])
            #print('total reward on episode:', total_reward)
            
            batch_states.append(states)
            batch_rewards.append(total_reward)
        
        #print('Batch generated')
        #print('States:', batch_states)
        #print('Episode Actions:', batch_actions)
        #print('Rewards:', batch_rewards)
        
        return batch_states, batch_actions, batch_rewards

    def filter_batch(self, states_batch, actions_batch, rewards_batch, percentile):
        
        reward_threshold = np.percentile(rewards_batch, percentile)
        
        #print('reward threshold:', reward_threshold)
        
        threshold_cnt = 0
        for i in range(len(rewards_batch)):
            if rewards_batch[i] > reward_threshold:
                threshold_cnt += 1
                
        elite_states = np.array([], dtype=float)
        elite_actions = np.array([], dtype=int)
        
        elite_indices = []
        
        for i in range(len(rewards_batch)):
            if rewards_batch[i] > reward_threshold:
                elite_indices.append(i)
        
        elite_indices = np.asarray(elite_indices)
        #print('indices', elite_indices)
        
        obs_num = np.shape(env.observation_space)[0]
        #print('obs num:', obs_num)
        
        for i in range(batch_size):
            for j in range(len(states_batch[i])):
                    
                if j == 0:
                    elite_states = np.append(elite_states, states_batch[i][0][0]) 
                    elite_actions = np.append(elite_actions, actions_batch[0][0])

                else:
                    elite_states = np.append(elite_states, states_batch[i][j])
                    elite_actions = np.append(elite_actions, actions_batch[i][j])

        return elite_states, elite_actions

In [10]:
n_states = np.shape(env.observation_space)[0]
n_actions = env.action_space.n

#print(n_states, n_actions)

#neural network
net = Net(n_states, hidden_size, n_actions)

In [None]:
#loss function
objective = nn.CrossEntropyLoss()

#optimisation function
optimizer = optim.Adam(params=net.parameters(), lr=learning_rate)

stats1 = np.array([])
stats2 = np.array([])
stats3 = np.array([])

for i in range(session_size):
    #generate new sessions
    batch_states,batch_actions,batch_rewards = net.generate_batch(env, batch_size, t_max)
    
    #print('batch_states', np.asarray(batch_states).reshape(-1,4))

    elite_states, elite_actions = net.filter_batch(batch_states,batch_actions,batch_rewards,percentile)
    
    #print('el_states', elite_states)
    elite_states = elite_states.reshape(-1,4)
    
    #print('el_states: reshaped', elite_states)
    #print('el_actions', elite_actions)
    
    optimizer.zero_grad()

    tensor_states = torch.FloatTensor(elite_states)
    tensor_actions = torch.LongTensor(elite_actions)

    action_scores_v = net(tensor_states)
    
    #print(action_scores_v)
    
    loss_v = objective(action_scores_v, tensor_actions)
    loss_v.backward()
    optimizer.step()

    #show results
    mean_reward, threshold = np.mean(batch_rewards), np.percentile(batch_rewards, percentile)
    print("%d: loss=%.5f, reward_mean=%.5f, reward_threshold=%.5f"% (i, loss_v.item(), mean_reward, threshold))
    
    stats1 = np.append(stats1, loss_v.item())
    stats2 = np.append(stats2, mean_reward)
    stats3 = np.append(stats3, threshold)
    
    #check if 
    if np.mean(batch_rewards)> completion_score:
        print("Environment has been successfullly completed!")

0: loss=1.08580, reward_mean=0.23436, reward_threshold=0.34350
1: loss=1.08238, reward_mean=0.29088, reward_threshold=0.40490
2: loss=1.07683, reward_mean=0.32390, reward_threshold=0.49200
3: loss=1.07255, reward_mean=0.38574, reward_threshold=0.49210
4: loss=1.07077, reward_mean=0.39582, reward_threshold=0.47930
5: loss=1.06514, reward_mean=0.48648, reward_threshold=0.58150
6: loss=1.05741, reward_mean=0.52194, reward_threshold=0.61600
7: loss=1.05715, reward_mean=0.54054, reward_threshold=0.63810
8: loss=1.05186, reward_mean=0.56766, reward_threshold=0.69220


In [None]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline  

plt.figure()
plt.plot(stats1)
plt.figure()
plt.plot(stats2)
plt.figure()
plt.plot(stats3)