# Experimenting on ants

In [1]:
from pathlib import Path

import gymnasium as gym
import matplotlib.pyplot as plt
import numpy as np
from mujoco import viewer
from stable_baselines3 import SAC


## Observing the observations

In [3]:
ant_xml = Path("assets/mujoco/models/ant-3.xml").resolve()
# env = gym.make('Ant-v4', ctrl_cost_weight=0.1, use_contact_forces=True, render_mode="human")
env = gym.make("Ant-v4", xml_file=str(ant_xml), render_mode="human")

In [3]:
observation, info = env.reset()

In [4]:
observation

array([ 0.66047925,  0.9971541 ,  0.06756732, -0.03341387, -0.00137028,
       -0.07984521,  0.04375994, -0.0096046 ,  0.01226555, -0.08926691,
       -0.02879555, -0.07376045,  0.04433621,  0.0964623 , -0.06174972,
       -0.14143607,  0.08411178,  0.08991813, -0.07854903, -0.18739163,
       -0.00394809, -0.05577037, -0.0477747 ])

In [5]:
s1, r1, terminated, truncated, inform = env.step([0]*6)

In [6]:
# plt.imshow(env.render()) # type: ignore

## Saccing

In [14]:
# Loading the model
model = SAC.load("sac_ant_3legs")

In [8]:
model = SAC("MlpPolicy", env, verbose=1, tensorboard_log="./runs")

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [9]:
model.learn(total_timesteps=1_000_000)

Logging to ./runs\SAC_1


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 155      |
|    ep_rew_mean     | -0.833   |
| time/              |          |
|    episodes        | 4        |
|    fps             | 53       |
|    time_elapsed    | 11       |
|    total_timesteps | 621      |
| train/             |          |
|    actor_loss      | -13.5    |
|    critic_loss     | 0.935    |
|    ent_coef        | 0.856    |
|    ent_coef_loss   | -1.56    |
|    learning_rate   | 0.0003   |
|    n_updates       | 520      |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 254      |
|    ep_rew_mean     | -14.1    |
| time/              |          |
|    episodes        | 8        |
|    fps             | 52       |
|    time_elapsed    | 38       |
|    total_timesteps | 2029     |
| train/             |          |
|    actor_loss      | -26.1    |
|    critic_loss     | 1.6      |
|    ent_coef 

<stable_baselines3.sac.sac.SAC at 0x20add1135d0>

In [None]:
obs, info = env.reset()
for _ in range(1000):
    action, _states = model.predict(obs, deterministic=True) 
    obs, reward, terminated, truncated, info = env.step(action)
    if terminated or truncated:
        obs, info = env.reset()


In [12]:
# Saving the model
model.save("sac_ant_3legs")

# Loading the model
# model = SAC.load("sac_ant")


In [2]:
import os
import random
import time
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path

import gymnasium as gym
import numpy as np
import torch
import torch.nn.functional as F  # noqa: N812
from stable_baselines3.common.buffers import ReplayBuffer
from torch import nn, optim
from torch.utils.tensorboard import SummaryWriter  # type: ignore
from transformers import DistilBertConfig, DistilBertModel

In [None]:
from scripts.last_hope import Actor