In [1]:
# !pip install matplotlib
# !pip install gymnasium==0.28.1
# !pip install stable_baselines3==2.1.0
# !pip install tqdm

# !pip install gym
# !pip install stable_baselines
# !pip install tensorflow
# !pip install shimmy>=0.2.1

In [2]:
import numpy as np
from copy import deepcopy
import matplotlib.pyplot as plt
from tqdm import tqdm

np.set_printoptions(precision=3)

In [3]:
class Agent:
    def __init__(self):
        pass
    
    def act(self, state):
        # Simple-minded agent that always select action 1
        return 1

In [4]:
class Car:
    def __init__(self, tyre="Intermediate"):
        self.default_tyre = tyre
        self.possible_tyres = ["Ultrasoft", "Soft", "Intermediate", "Fullwet"]
        self.pitstop_time = 23
        self.reset()
    
    
    def reset(self):
        self.change_tyre(self.default_tyre)
    
    
    def degrade(self, w, r):
        if self.tyre == "Ultrasoft":
            self.condition *= (1 - 0.0050*w - (2500-r)/90000)
        elif self.tyre == "Soft":
            self.condition *= (1 - 0.0051*w - (2500-r)/93000)
        elif self.tyre == "Intermediate":
            self.condition *= (1 - 0.0052*abs(0.5-w) - (2500-r)/95000)
        elif self.tyre == "Fullwet":
            self.condition *= (1 - 0.0053*(1-w) - (2500-r)/97000)
        
        
    def change_tyre(self, new_tyre):
        assert new_tyre in self.possible_tyres
        self.tyre = new_tyre
        self.condition = 1.00
    
    
    def get_velocity(self):
        if self.tyre == "Ultrasoft":
            vel = 80.7*(0.2 + 0.8*self.condition**1.5)
        elif self.tyre == "Soft":
            vel = 80.1*(0.2 + 0.8*self.condition**1.5)
        elif self.tyre == "Intermediate":
            vel = 79.5*(0.2 + 0.8*self.condition**1.5)
        elif self.tyre == "Fullwet":
            vel = 79.0*(0.2 + 0.8*self.condition**1.5)
        return vel

    
class Track:
    def __init__(self, car=Car()):
        # self.radius and self.cur_weather are defined in self.reset()
        self.total_laps = 162
        self.car = car
        self.possible_weather = ["Dry", "20% Wet", "40% Wet", "60% Wet", "80% Wet", "100% Wet"]
        self.wetness = {
            "Dry": 0.00, "20% Wet": 0.20, "40% Wet": 0.40, "60% Wet": 0.60, "80% Wet": 0.80, "100% Wet": 1.00
        }
        self.p_transition = {
            "Dry": {
                "Dry": 0.987, "20% Wet": 0.013, "40% Wet": 0.000, "60% Wet": 0.000, "80% Wet": 0.000, "100% Wet": 0.000
            },
            "20% Wet": {
                "Dry": 0.012, "20% Wet": 0.975, "40% Wet": 0.013, "60% Wet": 0.000, "80% Wet": 0.000, "100% Wet": 0.000
            },
            "40% Wet": {
                "Dry": 0.000, "20% Wet": 0.012, "40% Wet": 0.975, "60% Wet": 0.013, "80% Wet": 0.000, "100% Wet": 0.000
            },
            "60% Wet": {
                "Dry": 0.000, "20% Wet": 0.000, "40% Wet": 0.012, "60% Wet": 0.975, "80% Wet": 0.013, "100% Wet": 0.000
            },
            "80% Wet": {
                "Dry": 0.000, "20% Wet": 0.000, "40% Wet": 0.000, "60% Wet": 0.012, "80% Wet": 0.975, "100% Wet": 0.013
            },
            "100% Wet": {
                "Dry": 0.000, "20% Wet": 0.000, "40% Wet": 0.000, "60% Wet": 0.000, "80% Wet": 0.012, "100% Wet": 0.988
            }
        }
        self.reset()
    
    
    def reset(self):
        # self.radius = np.random.randint(600,1201)
        self.radius = 900
        self.cur_weather = np.random.choice(self.possible_weather)
        self.is_done = False
        self.pitstop = False
        self.laps_cleared = 0
        self.car.reset()
        return self._get_state()
    
    
    def _get_state(self):
        return [self.car.tyre, self.car.condition, self.cur_weather, self.radius, self.laps_cleared]
        
    
    def transition(self, action=0):
        """
        Args:
            action (int):
                0. Make a pitstop and fit new ‘Ultrasoft’ tyres
                1. Make a pitstop and fit new ‘Soft’ tyres
                2. Make a pitstop and fit new ‘Intermediate’ tyres
                3. Make a pitstop and fit new ‘Fullwet’ tyres
                4. Continue the next lap without changing tyres
        """
        ## Pitstop time will be added on the first eight of the subsequent lap
        if self.laps_cleared - int(self.laps_cleared) == 0.75 and self.laps_cleared < 161.75:
            total_rewards = 0
            for _ in range(8):
                time_taken = 0
                if self.laps_cleared == int(self.laps_cleared):
                    if self.pitstop:
                        self.car.change_tyre(self.committed_tyre)
                        time_taken += self.car.pitstop_time
                        self.pitstop = False
                
                ## The environment is coded such that only an action taken at the start of the three-quarters mark of each lap matters
                if self.laps_cleared - int(self.laps_cleared) == 0.75:
                    if action < 4:
                        self.pitstop = True
                        self.committed_tyre = self.car.possible_tyres[action]
                    else:
                        self.pitstop = False
                
                self.cur_weather = np.random.choice(
                    self.possible_weather, p=list(self.p_transition[self.cur_weather].values())
                )
                # we assume that degration happens only after a car has travelled the one-eighth lap
                velocity = self.car.get_velocity()
                time_taken += (2*np.pi*self.radius/8) / velocity
                reward = 0 - time_taken
                self.car.degrade(
                    w=self.wetness[self.cur_weather], r=self.radius
                )
                self.laps_cleared += 0.125
                
                if self.laps_cleared == self.total_laps:
                    self.is_done = True
                
                total_rewards += reward
                next_state = self._get_state()
                
            return total_rewards, next_state, self.is_done, velocity
        
        else:
            time_taken = 0
            if self.laps_cleared == int(self.laps_cleared):
                if self.pitstop:
                    self.car.change_tyre(self.committed_tyre)
                    time_taken += self.car.pitstop_time
                    self.pitstop = False
            
            ## The environment is coded such that only an action taken at the start of the three-quarters mark of each lap matters
            if self.laps_cleared - int(self.laps_cleared) == 0.75:
                if action < 4:
                    self.pitstop = True
                    self.committed_tyre = self.car.possible_tyres[action]
                else:
                    self.pitstop = False
            
            self.cur_weather = np.random.choice(
                self.possible_weather, p=list(self.p_transition[self.cur_weather].values())
            )
            # we assume that degration happens only after a car has travelled the one-eighth lap
            velocity = self.car.get_velocity()
            time_taken += (2*np.pi*self.radius/8) / velocity
            reward = 0 - time_taken
            self.car.degrade(
                w=self.wetness[self.cur_weather], r=self.radius
            )
            self.laps_cleared += 0.125
            
            if self.laps_cleared == self.total_laps:
                self.is_done = True
            
            next_state = self._get_state()
            return reward, next_state, self.is_done, velocity

In [5]:
new_car = Car()
env = Track(new_car)

agent = Agent()

In [6]:
state = env.reset()    
done = False
G = 0
while not done:
    action = agent.act(state)
    reward, next_state, done, velocity = env.transition(action)
    # added velocity for sanity check
    state = deepcopy(next_state)
    G += reward

print("G: %d" % G)

G: -16234


### Race 1

In [7]:
# Store historical weather
holder = []

state = env.reset()
start_state = deepcopy(state)
done = False
G = 0
while not done:
    holder.append(env.cur_weather)
    action = agent.act(state)
    reward, next_state, done, velocity = env.transition(action)
    # added velocity for sanity check
    state = deepcopy(next_state)
    G += reward


In [8]:
group1 = Agent()
group2 = Agent()
group3 = Agent()
group4 = Agent()
group5 = Agent()
group6 = Agent()
group7 = Agent()
group8 = Agent()
group9 = Agent()

In [9]:
# Race 1
start_weather, radius = start_state[2], start_state[3]

for agent in [group1, group2, group3, group4, group5, group6, group7, group8, group9]:
    state = env.reset() 
    env.cur_weather = start_weather   # assert common start weather
    env.radius = radius               # assert common track radius
    done = False
    G = 0
    i = 0
    while not done:
        env.cur_weather = holder[i]   # assert weather transition
        action = agent.act(state)
        reward, next_state, done, velocity = env.transition(action)
        # added velocity for sanity check
        state = deepcopy(next_state)
        G += reward
        i += 1
    
    print("G: %.2f" % G)

G: -16219.58
G: -16220.18
G: -16221.07
G: -16221.65
G: -16220.11
G: -16219.40
G: -16221.14
G: -16220.56
G: -16219.43


## Library Testing

Other algos to try:     
https://stable-baselines3.readthedocs.io/en/master/guide/algos.html 

In [10]:
import gym
from gym import spaces
import numpy as np

class RaceTrackEnv(gym.Env):
    def __init__(self):
        self.track = Track()
        
        # Action space: 5 discrete actions
        self.action_space = spaces.Discrete(5)
        
        # Flattened observation space
        max_radius = 1200
        max_laps = self.track.total_laps
        self.observation_space = spaces.Box(
            low=np.array([0, 0, 0, 600, 0]), 
            high=np.array([3, 1, 5, max_radius, max_laps]), 
            dtype=np.float32
        )
        
    def step(self, action):
        reward, next_state, done, velocity = self.track.transition(action)
        
        # Convert the tyre type from a string to an integer index for the observation space
        tyre_idx = float(self.track.car.possible_tyres.index(next_state[0]))
        weather_idx = float(self.track.possible_weather.index(next_state[2]))
        
        # Convert the observation to a flattened format
        obs = np.array([tyre_idx, next_state[1], weather_idx, float(next_state[3]), next_state[4]])
        # print(reward)
        return obs, reward, done, {"velocity": velocity}
        
    def reset(self):
        state = self.track.reset()
        tyre_idx = float(self.track.car.possible_tyres.index(state[0]))
        weather_idx = float(self.track.possible_weather.index(state[2]))
        return np.array([tyre_idx, 1.00, weather_idx, float(state[3]), state[4]])

    def render(self, mode="human"):
        # Simple rendering using print statements.
        state = self.track._get_state()
        # if int(state[4])==state[4]:
        #     print(f"Tyre: {state[0]}, Condition: {state[1]:.2f}, Weather: {state[2]}, Radius: {state[3]}, Laps Cleared: {state[4]:.2f}")
        print(f"Tyre: {state[0]}, Condition: {state[1]:.2f}, Weather: {state[2]}, Radius: {state[3]}, Laps Cleared: {state[4]:.2f}")
        
    def close(self):
        pass


In [11]:
# You can then use the environment as:
env = RaceTrackEnv()
obs = env.reset()
for _ in range(8*162): #1000 eighths of a lap
    action = env.action_space.sample() # Take random action as an example
    obs, reward, done, info = env.step(action)
    env.render()
    if done:
        break


Tyre: Intermediate, Condition: 0.98, Weather: 40% Wet, Radius: 900, Laps Cleared: 0.12
Tyre: Intermediate, Condition: 0.97, Weather: 40% Wet, Radius: 900, Laps Cleared: 0.25
Tyre: Intermediate, Condition: 0.95, Weather: 40% Wet, Radius: 900, Laps Cleared: 0.38
Tyre: Intermediate, Condition: 0.93, Weather: 40% Wet, Radius: 900, Laps Cleared: 0.50
Tyre: Intermediate, Condition: 0.92, Weather: 40% Wet, Radius: 900, Laps Cleared: 0.62
Tyre: Intermediate, Condition: 0.90, Weather: 40% Wet, Radius: 900, Laps Cleared: 0.75
Tyre: Fullwet, Condition: 0.89, Weather: 40% Wet, Radius: 900, Laps Cleared: 1.75
Tyre: Fullwet, Condition: 0.89, Weather: 60% Wet, Radius: 900, Laps Cleared: 2.75
Tyre: Fullwet, Condition: 0.77, Weather: 60% Wet, Radius: 900, Laps Cleared: 3.75
Tyre: Intermediate, Condition: 0.90, Weather: 60% Wet, Radius: 900, Laps Cleared: 4.75
Tyre: Ultrasoft, Condition: 0.89, Weather: 40% Wet, Radius: 900, Laps Cleared: 5.75
Tyre: Ultrasoft, Condition: 0.76, Weather: 40% Wet, Radius: 9

  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


## Model testing

In [12]:
# !pip install sb3-contrib

In [13]:
import os
from stable_baselines3 import A2C, DQN, PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from sb3_contrib import TQC, QRDQN, MaskablePPO, TRPO, ARS, RecurrentPPO
import pandas as pd

# Define the range of hyperparameters for grid search
learning_rates = [0.0001, 0.001, 0.01]
gammas = [0.95, 0.99, 0.995]

def create_and_train_model(algo_class, env, train_races, save_path, learning_rate, gamma=None):
    vec_env = DummyVecEnv([lambda: env])
    if algo_class==RecurrentPPO:
        model = algo_class("MlpLstmPolicy", vec_env, verbose=1, learning_rate=learning_rate, gamma=gamma)
    elif algo_class==ARS:
        model = algo_class("LinearPolicy", vec_env, verbose=1, learning_rate=learning_rate)
    else:
        model = algo_class("MlpPolicy", vec_env, verbose=1, learning_rate=learning_rate, gamma=gamma)
    model.learn(total_timesteps=168*train_races)
    model.save(save_path)
    return model

def evaluate_model(algo_class, model_filename, env, evalRaces):
    model = algo_class.load(model_filename)
    total_reward = 0
    obs = env.reset()
    for races in range(evalRaces):
        for _ in range(168):
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, done, info = env.step(action)
            total_reward += reward
            env.render()
            if done:
                obs = env.reset()
                break
    return total_reward/evalRaces

# Initialize an empty list to collect the results
results = []

os.makedirs('gym_agents', exist_ok=True)
evalRaces = 1
env = RaceTrackEnv()
trainRaces_list = [1000]

# Process QRDQN
algo_class = QRDQN
algo_name = "QRDQN"
best_reward = float('-inf')  # Initialize with negative infinity

for trainRaces in trainRaces_list:
    for learning_rate in learning_rates:
        for gamma in gammas:
            model_filename = f'gym_agents/{algo_name}_race_track_filepath_{trainRaces}_{learning_rate}_{gamma}'
            try:
                model = algo_class.load(model_filename)
            except:
                model = create_and_train_model(algo_class, env, trainRaces, model_filename, learning_rate, gamma)
            r = evaluate_model(algo_class, model_filename, env, evalRaces)
            print(f'{algo_name} with {trainRaces} train races, lr={learning_rate}, gamma={gamma}: Average reward = {r}')
            if r > best_reward:
                best_reward = r
                best_model_filename = model_filename
            results.append({
                'algorithm': algo_name,
                'train_races': trainRaces,
                'learning_rate': learning_rate,
                'gamma': gamma,
                'average_reward': r
            })

# Save the best model
model.save(f'gym_agents/best_{algo_name}_race_track_filepath')


# Process TRPO
algo_class = TRPO
algo_name = "TRPO"
for trainRaces in trainRaces_list:
    for learning_rate in learning_rates:
        for gamma in gammas:
            model_filename = f'gym_agents/{algo_name}_race_track_filepath_{trainRaces}_{learning_rate}_{gamma}'
            try:
                model = algo_class.load(model_filename)
            except:
                model = create_and_train_model(algo_class, env, trainRaces, model_filename, learning_rate, gamma)
            r = evaluate_model(algo_class, model_filename, env, evalRaces)
            print(f'{algo_name} with {trainRaces} train races, lr={learning_rate}, gamma={gamma}: Average reward = {r}')
            if r > best_reward:
                best_reward = r
                best_model_filename = model_filename
            results.append({
                'algorithm': algo_name,
                'train_races': trainRaces,
                'learning_rate': learning_rate,
                'gamma': gamma,
                'average_reward': r
            })


# Save the best model
model.save(f'gym_agents/best_{algo_name}_race_track_filepath')

# Process ARS
algo_class = ARS
algo_name = "ARS"
for trainRaces in trainRaces_list:
    for learning_rate in learning_rates:
        model_filename = f'gym_agents/{algo_name}_race_track_filepath_{trainRaces}_{learning_rate}'
        try:
            model = algo_class.load(model_filename)
        except:
            model = create_and_train_model(algo_class, env, trainRaces, model_filename, learning_rate)
        r = evaluate_model(algo_class, model_filename, env, evalRaces)
        print(f'{algo_name} with {trainRaces} train races, lr={learning_rate}: Average reward = {r}')
        results.append({
            'algorithm': algo_name,
            'train_races': trainRaces,
            'learning_rate': learning_rate,
            'gamma': None,
            'average_reward': r
        })

# Save the best model
model.save(f'gym_agents/best_{algo_name}_race_track_filepath')

# Process RecurrentPPO
algo_class = RecurrentPPO
algo_name = "RecurrentPPO"
for trainRaces in trainRaces_list:
    for learning_rate in learning_rates:
        for gamma in gammas:
            model_filename = f'gym_agents/{algo_name}_race_track_filepath_{trainRaces}_{learning_rate}_{gamma}'
            try:
                model = algo_class.load(model_filename)
            except:
                model = create_and_train_model(algo_class, env, trainRaces, model_filename, learning_rate, gamma)
                
            r = evaluate_model(algo_class, model_filename, env, evalRaces)
            print(f'{algo_name} with {trainRaces} train races, lr={learning_rate}, gamma={gamma}: Average reward = {r}')
            if r > best_reward:
                best_reward = r
                best_model_filename = model_filename
            results.append({
                'algorithm': algo_name,
                'train_races': trainRaces,
                'learning_rate': learning_rate,
                'gamma': gamma,
                'average_reward': r
            })

# Save the best model
model.save(f'gym_agents/best_{algo_name}_race_track_filepath')

# Convert the results list to a DataFrame
df_results = pd.DataFrame(results)

# Save the DataFrame to a CSV file for easy viewing
df_results.to_csv('gym_agents/hyperparameter_tuning_results.csv', index=False)

# Optionally, display the DataFrame
print(df_results)

  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


Tyre: Intermediate, Condition: 0.98, Weather: 80% Wet, Radius: 900, Laps Cleared: 0.12
Tyre: Intermediate, Condition: 0.96, Weather: 80% Wet, Radius: 900, Laps Cleared: 0.25
Tyre: Intermediate, Condition: 0.95, Weather: 80% Wet, Radius: 900, Laps Cleared: 0.38
Tyre: Intermediate, Condition: 0.93, Weather: 80% Wet, Radius: 900, Laps Cleared: 0.50
Tyre: Intermediate, Condition: 0.91, Weather: 80% Wet, Radius: 900, Laps Cleared: 0.62
Tyre: Intermediate, Condition: 0.89, Weather: 80% Wet, Radius: 900, Laps Cleared: 0.75
Tyre: Fullwet, Condition: 0.90, Weather: 80% Wet, Radius: 900, Laps Cleared: 1.75
Tyre: Fullwet, Condition: 0.90, Weather: 80% Wet, Radius: 900, Laps Cleared: 2.75
Tyre: Fullwet, Condition: 0.90, Weather: 80% Wet, Radius: 900, Laps Cleared: 3.75
Tyre: Fullwet, Condition: 0.91, Weather: 100% Wet, Radius: 900, Laps Cleared: 4.75
Tyre: Fullwet, Condition: 0.91, Weather: 100% Wet, Radius: 900, Laps Cleared: 5.75
Tyre: Fullwet, Condition: 0.91, Weather: 100% Wet, Radius: 900, La

In [14]:
results

[{'algorithm': 'QRDQN',
  'train_races': 1000,
  'learning_rate': 0.0001,
  'gamma': 0.95,
  'average_reward': -16213.62055292643},
 {'algorithm': 'QRDQN',
  'train_races': 1000,
  'learning_rate': 0.0001,
  'gamma': 0.99,
  'average_reward': -15985.347937899141},
 {'algorithm': 'QRDQN',
  'train_races': 1000,
  'learning_rate': 0.0001,
  'gamma': 0.995,
  'average_reward': -16194.596589715946},
 {'algorithm': 'QRDQN',
  'train_races': 1000,
  'learning_rate': 0.001,
  'gamma': 0.95,
  'average_reward': -16044.409284304047},
 {'algorithm': 'QRDQN',
  'train_races': 1000,
  'learning_rate': 0.001,
  'gamma': 0.99,
  'average_reward': -16208.131533600714},
 {'algorithm': 'QRDQN',
  'train_races': 1000,
  'learning_rate': 0.001,
  'gamma': 0.995,
  'average_reward': -16146.013381473542},
 {'algorithm': 'QRDQN',
  'train_races': 1000,
  'learning_rate': 0.01,
  'gamma': 0.95,
  'average_reward': -15996.048673882044},
 {'algorithm': 'QRDQN',
  'train_races': 1000,
  'learning_rate': 0.01,
 

In [15]:
class Track:
    def __init__(self, car=Car()):
        # self.radius and self.cur_weather are defined in self.reset()
        self.total_laps = 162
        self.car = car
        self.possible_weather = ["Dry", "20% Wet", "40% Wet", "60% Wet", "80% Wet", "100% Wet"]
        self.wetness = {
            "Dry": 0.00, "20% Wet": 0.20, "40% Wet": 0.40, "60% Wet": 0.60, "80% Wet": 0.80, "100% Wet": 1.00
        }
        self.p_transition = {
            "Dry": {
                "Dry": 0.987, "20% Wet": 0.013, "40% Wet": 0.000, "60% Wet": 0.000, "80% Wet": 0.000, "100% Wet": 0.000
            },
            "20% Wet": {
                "Dry": 0.012, "20% Wet": 0.975, "40% Wet": 0.013, "60% Wet": 0.000, "80% Wet": 0.000, "100% Wet": 0.000
            },
            "40% Wet": {
                "Dry": 0.000, "20% Wet": 0.012, "40% Wet": 0.975, "60% Wet": 0.013, "80% Wet": 0.000, "100% Wet": 0.000
            },
            "60% Wet": {
                "Dry": 0.000, "20% Wet": 0.000, "40% Wet": 0.012, "60% Wet": 0.975, "80% Wet": 0.013, "100% Wet": 0.000
            },
            "80% Wet": {
                "Dry": 0.000, "20% Wet": 0.000, "40% Wet": 0.000, "60% Wet": 0.012, "80% Wet": 0.975, "100% Wet": 0.013
            },
            "100% Wet": {
                "Dry": 0.000, "20% Wet": 0.000, "40% Wet": 0.000, "60% Wet": 0.000, "80% Wet": 0.012, "100% Wet": 0.988
            }
        }
        self.reset()
    
    
    def reset(self):
        self.radius = np.random.randint(600,1201)
        # self.radius = 900
        self.cur_weather = np.random.choice(self.possible_weather)
        self.is_done = False
        self.pitstop = False
        self.laps_cleared = 0
        self.car.reset()
        return self._get_state()
    
    
    def _get_state(self):
        return [self.car.tyre, self.car.condition, self.cur_weather, self.radius, self.laps_cleared]
        
    
    def transition(self, action=0):
        """
        Args:
            action (int):
                0. Make a pitstop and fit new ‘Ultrasoft’ tyres
                1. Make a pitstop and fit new ‘Soft’ tyres
                2. Make a pitstop and fit new ‘Intermediate’ tyres
                3. Make a pitstop and fit new ‘Fullwet’ tyres
                4. Continue the next lap without changing tyres
        """
        ## Pitstop time will be added on the first eight of the subsequent lap
        if self.laps_cleared - int(self.laps_cleared) == 0.75 and self.laps_cleared < 161.75:
            total_rewards = 0
            for _ in range(8):
                time_taken = 0
                if self.laps_cleared == int(self.laps_cleared):
                    if self.pitstop:
                        self.car.change_tyre(self.committed_tyre)
                        time_taken += self.car.pitstop_time
                        self.pitstop = False
                
                ## The environment is coded such that only an action taken at the start of the three-quarters mark of each lap matters
                if self.laps_cleared - int(self.laps_cleared) == 0.75:
                    if action < 4:
                        self.pitstop = True
                        self.committed_tyre = self.car.possible_tyres[action]
                    else:
                        self.pitstop = False
                
                self.cur_weather = np.random.choice(
                    self.possible_weather, p=list(self.p_transition[self.cur_weather].values())
                )
                # we assume that degration happens only after a car has travelled the one-eighth lap
                velocity = self.car.get_velocity()
                time_taken += (2*np.pi*self.radius/8) / velocity
                reward = 0 - time_taken
                self.car.degrade(
                    w=self.wetness[self.cur_weather], r=self.radius
                )
                self.laps_cleared += 0.125
                
                if self.laps_cleared == self.total_laps:
                    self.is_done = True
                
                total_rewards += reward
                next_state = self._get_state()
                
            return total_rewards, next_state, self.is_done, velocity
        
        else:
            time_taken = 0
            if self.laps_cleared == int(self.laps_cleared):
                if self.pitstop:
                    self.car.change_tyre(self.committed_tyre)
                    time_taken += self.car.pitstop_time
                    self.pitstop = False
            
            ## The environment is coded such that only an action taken at the start of the three-quarters mark of each lap matters
            if self.laps_cleared - int(self.laps_cleared) == 0.75:
                if action < 4:
                    self.pitstop = True
                    self.committed_tyre = self.car.possible_tyres[action]
                else:
                    self.pitstop = False
            
            self.cur_weather = np.random.choice(
                self.possible_weather, p=list(self.p_transition[self.cur_weather].values())
            )
            # we assume that degration happens only after a car has travelled the one-eighth lap
            velocity = self.car.get_velocity()
            time_taken += (2*np.pi*self.radius/8) / velocity
            reward = 0 - time_taken
            self.car.degrade(
                w=self.wetness[self.cur_weather], r=self.radius
            )
            self.laps_cleared += 0.125
            
            if self.laps_cleared == self.total_laps:
                self.is_done = True
            
            next_state = self._get_state()
            return reward, next_state, self.is_done, velocity

In [17]:
import os
import pandas as pd
from stable_baselines3 import A2C, DQN, PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from sb3_contrib import TQC, QRDQN, MaskablePPO, TRPO, ARS, RecurrentPPO

# Load the best hyperparameters from the previous step
df_results = pd.read_csv('gym_agents/hyperparameter_tuning_results.csv')

# Update the function to accept total_timesteps as a parameter
def create_and_train_model(algo_class, env, total_timesteps, save_path, learning_rate, gamma=None):
    vec_env = DummyVecEnv([lambda: env])
    if algo_class==RecurrentPPO:
        model = algo_class("MlpLstmPolicy", vec_env, verbose=1, learning_rate=learning_rate, gamma=gamma)
    elif algo_class==ARS:
        model = algo_class("LinearPolicy", vec_env, verbose=1, learning_rate=learning_rate)
    else:
        model = algo_class("MlpPolicy", vec_env, verbose=1, learning_rate=learning_rate, gamma=gamma)
    model.learn(total_timesteps=total_timesteps)
    model.save(save_path)
    return model

# Create environment
env = RaceTrackEnv()

# Function to get the best hyperparameters for a given algorithm
def get_best_hyperparams(algo_name):
    best_row = df_results[df_results['algorithm'] == algo_name].sort_values('average_reward', ascending=False).iloc[0]
    return best_row['learning_rate'], best_row['gamma']

# Function to train and save the model with the best hyperparameters
def train_and_save_best_model(algo_class, algo_name):
    learning_rate, gamma = get_best_hyperparams(algo_name)
    model_filename = f'gym_agents/best_{algo_name}_10000_iterations'
    create_and_train_model(algo_class, env, 10000, model_filename, learning_rate, gamma)

# Train and save the best model for each algorithm
train_and_save_best_model(QRDQN, 'QRDQN')
train_and_save_best_model(TRPO, 'TRPO')
train_and_save_best_model(ARS, 'ARS')
train_and_save_best_model(RecurrentPPO, 'RecurrentPPO')



  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


Using cpu device
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.01     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 1874     |
|    time_elapsed     | 0        |
|    total_timesteps  | 676      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.01     |
| time/               |          |
|    episodes         | 8        |
|    fps              | 2044     |
|    time_elapsed     | 0        |
|    total_timesteps  | 1352     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.01     |
| time/               |          |
|    episodes         | 12       |
|    fps              | 2082     |
|    time_elapsed     | 0        |
|    total_timesteps  | 2028     |
----------------------------------
----------------------------------
| r