In [1]:
import numpy as np
import pandas as pd
import gc
import warnings
import feather
import itertools
import multiprocessing
from copy import deepcopy
from tqdm import tqdm
from pathlib import Path
from matplotlib import pyplot as plt
from typing import *

import math
import random
from collections import namedtuple, deque

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# Raise pandas' display limits so wide/long frames are not truncated in cell output.
pd.options.display.max_rows = 200
pd.options.display.max_columns = 400

# NOTE(review): this silences every warning category for the whole session,
# including deprecation warnings — consider narrowing the filter.
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Device handed to the PPO agent and used for every tensor created during rollouts.
device = torch.device("cpu")

In [3]:
# One stored environment step: observed state, sampled action, reward,
# log-probability of the action, and an episode-boundary flag.
Transition = namedtuple('Transition',
                        ('state', 'action', 'reward', 'logprob', 'is_end'))


class ReplayMemory(object):
    """Append-only rollout buffer of ``Transition`` records."""

    def __init__(self):
        self.memory = deque()

    def push(self, state: torch.Tensor, action: torch.Tensor, reward: torch.Tensor, logprob: torch.Tensor, is_end: bool) -> None:
        """Save a transition at the end of the buffer."""
        self.memory.append(Transition(state, action, reward, logprob, is_end))

    def extend(self, memory: "ReplayMemory") -> None:
        """Append every transition of ``memory`` (in order) to this buffer."""
        self.memory.extend(memory.memory)

    def __len__(self) -> int:
        return len(self.memory)

    def clear(self) -> None:
        """Drop all stored transitions."""
        # Rebind instead of clearing in place so a deque handed out earlier is untouched.
        self.memory = deque()

    def get_batches(self) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, np.ndarray]:
        """Stack the stored transitions field by field.

        Returns:
            ``(state, action, reward, logprob, is_end)`` where the first four
            are the stored tensors concatenated along dim 0 and ``is_end`` is
            a NumPy array with one row per transition.

        Raises:
            ValueError: if the buffer is empty (there is nothing to stack).
        """
        if len(self.memory) == 0:
            raise ValueError("cannot build batches from an empty ReplayMemory")

        # Transpose a list of records into one record of tuples.
        batch = Transition(*zip(*self.memory))

        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        logprob_batch = torch.cat(batch.logprob)
        is_end_batch = np.vstack(batch.is_end)

        return state_batch, action_batch, reward_batch, logprob_batch, is_end_batch

In [4]:
class Actor(nn.Module):
    """Three-layer tanh MLP whose selected output columns are squashed and rescaled.

    Each entry of ``clips`` is a dict with keys ``"dims"`` (column indices),
    ``"name"`` (``"softmax"`` or ``"tanh"``) and ``"coef"`` (multiplier applied
    to the squashed columns).
    """

    def __init__(self, input_dim: int, output_dim: int, clips: List[Dict[str, Any]]):
        super().__init__()
        self.clips = clips

        hidden = 64
        layers = [
            nn.Linear(input_dim, hidden),
            nn.Tanh(),
            nn.Linear(hidden, hidden),
            nn.Tanh(),
            nn.Linear(hidden, output_dim),
        ]
        self.net = nn.Sequential(*layers)
        # Lookup of the squashing functions a clip may name.
        self.transforms = dict(softmax=nn.Softmax(dim=1), tanh=nn.Tanh())

    def forward(self, x):
        """Run the MLP, then overwrite each clipped column group with ``coef * squash(columns)``."""
        out = self.net(x)
        for spec in self.clips:
            cols = spec["dims"]
            squash = self.transforms[spec["name"]]
            out[:, cols] = spec["coef"] * squash(out[:, cols])
        return out


class ActorCritic(nn.Module):
    """Gaussian policy (actor) plus state-value network (critic).

    The actor emits ``mean_dim + cov_dim`` values per state: the first
    ``action_dim`` columns are the action mean, the remaining ``action_dim``
    columns parameterise a diagonal covariance.

    Note: the clip specification below addresses actor output columns 0 and 4,
    so the actor output must have at least five columns.
    """

    def __init__(self, state_dim: int, action_dim: int, device: torch.device, spread: float):
        super(ActorCritic, self).__init__()
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.mean_dim = action_dim
        # Diagonal covariance: one scale value per action dimension.
        self.cov_dim = action_dim

        self.device = device

        self.actor = Actor(
            input_dim=state_dim,
            output_dim=self.mean_dim + self.cov_dim,
            clips=[
                # Column 0 is bounded to (-spread, spread).
                {"dims": [0], "coef": spread, "name": "tanh"},
                # Column 4 is bounded to (-spread / 2, spread / 2).
                {"dims": [4], "coef": spread / 2, "name": "tanh"},
            ]
        )

        self.critic = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.Tanh(),
            nn.Linear(64, 64),
            nn.Tanh(),
            nn.Linear(64, 1)
        )

        self.softplus = nn.Softplus()

    def forward(self):
        raise NotImplementedError()

    def act(self, state: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Sample an action for ``state``; returns ``(action, log_prob)``, both detached."""
        dist = self._get_normal_dist(state=state)

        action = dist.sample()
        return action.detach(), dist.log_prob(action).detach()

    def deterministical_act(self, state: torch.Tensor) -> torch.Tensor:
        """Return the (detached) mean action for ``state`` without sampling."""
        embed = self.actor(state)
        _mean = embed[:, :self.action_dim]
        return _mean.detach()

    def evaluate(self, state: torch.Tensor, action: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Score ``action`` under the current policy.

        Returns:
            ``(action_logprobs, state_values, dist_entropy)`` for every row of
            ``state``; ``state_values`` is the raw critic output.
        """
        dist = self._get_normal_dist(state=state)

        action_logprobs = dist.log_prob(action)
        dist_entropy = dist.entropy()
        state_values = self.critic(state)

        return action_logprobs, state_values, dist_entropy

    def _get_normal_dist(self, state: torch.Tensor) -> torch.distributions.MultivariateNormal:
        """Build the per-state Gaussian action distribution.

        Every row of the batch gets its own diagonal covariance
        ``diag((1e-3 * scale) ** 2)``. Previously only the first row's scale
        vector was used for the whole batch, so batched ``evaluate`` scored
        actions with a covariance that differed from the one ``act`` sampled
        them with. For a batch of one the result is unchanged.
        """
        embed = self.actor(state)

        _mean = embed[:, :self.action_dim]
        # (batch, action_dim) scale values, shrunk by the fixed 1e-3 factor.
        _scale = embed[:, self.action_dim:] * 1e-3
        # Squaring keeps the diagonal non-negative; diag_embed batches the matrices.
        _cov = torch.diag_embed(_scale * _scale)
        return torch.distributions.MultivariateNormal(loc=_mean, covariance_matrix=_cov)
    

class PPO(object):
    def __init__(self, state_dim: int, action_dim: int, device: torch.device, params: Dict[str, Any]):
        self.device = device
        
        self.gamma = params["GAMMA"]
        self.eps_clip = params["EPS_CLIP"]
        self.K_epochs = params["K_EPOCHS"]
        
        self.memory = ReplayMemory()
        
        self.policy = ActorCritic(state_dim=state_dim, action_dim=action_dim, device=device, spread=params["SPREAD"]).to(device)
        self.optimizer = torch.optim.Adam([
            {"params": self.policy.actor.parameters(), "lr": params["LR_ACTOR"]},
            {"params": self.policy.critic.parameters(), "lr": params["LR_CRITIC"]},
        ])
        
        self.policy_old = ActorCritic(state_dim=state_dim, action_dim=action_dim, device=device, spread=params["SPREAD"]).to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())
        
        self.MseLoss = nn.MSELoss()
    
    def select_action(self, state: torch.Tensor) -> Tuple[torch.Tensor, float]:
        with torch.no_grad():
            action, action_logprob = self.policy_old.act(state=state)
            return action, action_logprob
    
    def select_deterministical_action(self, state: torch.Tensor) -> torch.Tensor:
        with torch.no_grad():
            action = self.policy_old.deterministical_act(state=state)
            return action
        
    def store_transition(
        self, 
        state: torch.Tensor, 
        action: torch.Tensor,
        reward: torch.Tensor, 
        logprob: torch.Tensor,
        is_end: bool,
    ) -> None:
        self.memory.push(state=state, action=action, reward=reward, logprob=logprob, is_end=is_end)
    
    def merge_memories(self, memories: List[ReplayMemory]) -> None:
        self.memory = ReplayMemory()
        for _mem in memories:
            self.memory.extend(_mem)
    
    def update(self):
        state_batch, action_batch, reward_batch, logprob_batch, is_end_batch = self.memory.get_batches()
        
        old_rewards = self._calc_cumsum_discount_rewards(reward_batch=reward_batch, is_end_batch=is_end_batch)
        
        old_states = state_batch.detach().to(self.device)
        old_actions = action_batch.detach().to(self.device)
        old_logprobs = logprob_batch.detach().to(self.device)
        
        for _ in range(self.K_epochs):
            logprobs, state_values, dist_entropy = self.policy.evaluate(state=old_states, action=old_actions)
            state_values = torch.squeeze(state_values)
            
            ratios = torch.exp(logprobs - old_logprobs.detach())
            
            advantages = old_rewards - state_values.detach()   
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * advantages
            
            loss = -torch.min(surr1, surr2) + 0.5 * self.MseLoss(state_values, old_rewards) - 0.01 * dist_entropy
            
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()
        
        self.policy_old.load_state_dict(self.policy.state_dict())
        
        self.memory.clear()
    
    def _calc_cumsum_discount_rewards(self, reward_batch: torch.Tensor, is_end_batch: np.ndarray) -> torch.Tensor:
        reward_batch = reward_batch.squeeze()
        is_end_batch = is_end_batch.squeeze()
        cumsum_discounted_rewards = deque()
        _discounted_reward = 0
        for reward, is_end in zip(reversed(reward_batch), reversed(is_end_batch)):
            if is_end:
                _discounted_reward = 0
            _discounted_reward = reward + self.gamma * _discounted_reward
            cumsum_discounted_rewards.appendleft(_discounted_reward)
        cumsum_discounted_rewards = torch.tensor(cumsum_discounted_rewards, dtype=torch.float32).to(self.device)
        
        def normalize(x: torch.Tensor) -> torch.Tensor:
            return (x - x.mean()) / (x.std() + 1e-7)
                
        return normalize(x=cumsum_discounted_rewards)
    
    def save_model(self, filepath: Path) -> None:
        torch.save(self.policy_old.state_dict(), filepath)

In [5]:
# Project root: two directory levels above the current working directory.
rootdir = Path().resolve().parent.parent
# Input feather shards.
datadir = rootdir / "data" / "bybit" / "2022-07-24"
# Base directory for plots, logs and checkpoints written by this notebook.
cachedir = rootdir / "data" / "cache" / "ppo"

In [6]:
# Read the nine feather shards (data_1 .. data_9) and stack them into one frame.
df = pd.concat(
    [feather.read_dataframe(datadir / f"data_{shard}.feather") for shard in range(1, 10)]
).reset_index(drop=True)
gc.collect()

# Keep the candle columns needed downstream, truncated to integers.
df = df[["open_time", "close", "high", "low"]].astype(float).astype(int)

df.columns = ["timestamp", "price", "max_price", "min_price"]
# Buy/sell reference prices mirror the candle's high/low.
df["buy_price"] = df["max_price"]
df["sell_price"] = df["min_price"]

df.head()

Unnamed: 0,timestamp,price,max_price,min_price,buy_price,sell_price
0,1609426800,28490,28490,28339,28490,28339
1,1609427100,28569,28569,28490,28569,28490
2,1609427400,28622,28623,28539,28623,28539
3,1609427700,28559,28626,28532,28626,28532
4,1609428000,28598,28620,28559,28620,28559


In [7]:
def add_features(df):
    """Derive spread and zero-crossing segment features from a price frame.

    Reads the ``price``, ``max_price`` and ``min_price`` columns. For each
    window ``m`` in ``[1, 2]`` it adds:

    * ``dsharp_m`` - rolling mean of the price difference over ``m * 6`` rows
      divided by (rolling std + 1.0);
    * ``_p_m`` / ``_pcs_m`` - zero-crossing flag of ``dsharp_m`` and its
      running count;
    * ``area_m``, ``change_m``, ``maxlen_m``, ``minlen_m`` - statistics
      accumulated since the most recent zero crossing.

    The input frame is left untouched; rows containing NaN are dropped and the
    index is reset in the returned frame.
    """
    # Work on a copy so the caller's frame does not silently gain helper columns.
    df = df.copy()

    df["_diff"] = df["price"].diff()
    df["spread_upper"] = df["max_price"] / df["price"] - 1
    df["spread_lower"] = df["min_price"] / df["price"] - 1

    for minutes in [1, 2]:
        nm_dsharp, nm_p, nm_pcs, nm_area, nm_maxval, nm_minval, nm_maxlen, nm_minlen, nm_change = [
            f"{nm}_{minutes}" for nm in ["dsharp", "_p", "_pcs", "area", "maxval", "minval", "maxlen", "minlen", "change"]
        ]

        # Differential Sharpe-style ratio of the price differences.
        rolling_diff = df["_diff"].rolling(minutes * 6)
        df[nm_dsharp] = rolling_diff.mean() / (rolling_diff.std() + 1.0)

        df[nm_p] = find_cross_zero(x=df[nm_dsharp].values)
        df[nm_pcs] = df[nm_p].cumsum()

        _values = np.empty((df.shape[0], 4))
        for i, (ds, p, pcs) in enumerate(df[[nm_dsharp, nm_p, nm_pcs]].values):
            sign = np.sign(ds)
            if pcs == 0:
                # No zero crossing seen yet: segment statistics are undefined.
                mt = {
                    nm_area: np.nan, nm_maxval: np.nan, nm_minval: np.nan,
                    nm_maxlen: np.nan, nm_minlen: np.nan,
                }
            else:
                if p:
                    # A crossing starts a new segment: reset the accumulators.
                    mt = {
                        nm_area: 0, nm_maxval: -np.inf, nm_minval: np.inf,
                        nm_maxlen: 0, nm_minlen: 0,
                    }
                # sign * ds == |ds|, so the area grows regardless of direction.
                mt[nm_area] += sign * ds
                if ds > mt[nm_maxval]:
                    mt[nm_maxlen] = 0
                    mt[nm_maxval] = ds
                else:
                    # Rows elapsed since the segment maximum was set.
                    mt[nm_maxlen] += 1
                if ds < mt[nm_minval]:
                    mt[nm_minlen] = 0
                    mt[nm_minval] = ds
                else:
                    # Rows elapsed since the segment minimum was set.
                    mt[nm_minlen] += 1
            _values[i] = np.array([
                mt[nm_area],
                # Largest excursion in the direction of the current sign.
                max(sign * mt[nm_maxval], sign * mt[nm_minval]),
                mt[nm_maxlen],
                mt[nm_minlen],
            ])
        df[[nm_area, nm_change, nm_maxlen, nm_minlen]] = _values

    df = df.dropna().reset_index(drop=True)
    return df


def find_cross_zero(x: np.ndarray) -> np.ndarray:
    x_len = x.shape[0] - 1
    y = np.zeros(x.shape[0]).astype(bool)
    for i in range(x.shape[0] - 1):
        if (x[x_len - i] > 0) and x[x_len - i - 1] <= 0:
            y[x_len - i] = True
        elif (x[x_len - i] < 0) and x[x_len - i - 1] >= 0:
            y[x_len - i] = True
        else:
            y[x_len - i] = False
    return y


def equal_divide_indice(length, num_divide):
    """Label ``length`` consecutive positions with ``num_divide`` near-equal chunk ids.

    Chunk ``k`` starts at ``int(k * length / num_divide)``; every position gets
    the id of the last chunk start at or before it. The labels are returned as
    a float array.
    """
    starts = np.linspace(0, length, num_divide + 1)[:-1].astype(int)
    positions = np.arange(length)
    # Number of chunk starts <= position, minus one, is the chunk id.
    labels = np.searchsorted(starts, positions, side="right") - 1
    return labels.astype(float)


def divide_with_pcs(df, num_divide, division):
    """Assign a ``fold`` id to every row, keeping equal ``division`` values together.

    Rows are first cut into ``num_divide`` near-equal positional chunks
    (``_eq_fold``). Each chunk's (min, max] range of the ``division`` column
    then defines fold ``i``, so rows sharing a ``division`` value land in the
    same fold. Rows matched by no range are filled from their neighbours.

    The frame is modified in place (columns ``_eq_fold`` and ``fold`` are
    added) and the same object is returned.
    """
    df["_eq_fold"] = equal_divide_indice(length=df.shape[0], num_divide=num_divide)
    df["fold"] = np.nan
    bounds = df.groupby("_eq_fold")[division].agg(["min", "max"]).values
    for i, (start, end) in enumerate(bounds):
        indice = (start < df[division]) & (df[division] <= end)
        df.loc[indice, "fold"] = i
    # .ffill()/.bfill() replace the deprecated fillna(method=...) spelling.
    df["fold"] = df["fold"].ffill().bfill()
    return df

In [8]:
def add_lag_features(df: pd.DataFrame, features: List[str], lags: List[int]) -> Tuple[pd.DataFrame, List[str]]:
    """Append lagged copies of ``features`` and drop rows made incomplete by the shift.

    For every ``lag`` a column ``"<feature>_lag<lag>"`` holding the feature
    shifted down by ``lag`` rows is added. The input frame is left untouched.

    Returns:
        ``(frame, names)`` - the new frame (NaN rows dropped, index reset) and
        the original feature names followed by all lag column names.
    """
    # Work on a copy so the caller's frame does not gain the lag columns.
    df = df.copy()
    features_with_lags = list(features)
    for lag in lags:
        lag_features = [f"{f}_lag{lag}" for f in features]
        for source, target in zip(features, lag_features):
            df[target] = df[source].shift(lag)
        features_with_lags += lag_features
    df = df.dropna().reset_index(drop=True)
    return df, features_with_lags

In [9]:
# Base feature columns produced by add_features (window suffixes _1 and _2) plus the two spreads.
features = ['dsharp_1', 'area_1','change_1', 'maxlen_1', 'minlen_1', 'dsharp_2', 'area_2', 'change_2', 'maxlen_2', 'minlen_2', 'spread_upper', 'spread_lower']

dfa = add_features(df=df)
# Append 1-, 2- and 3-row lagged copies; `features` is rebound to include the lag column names.
dfa, features = add_lag_features(df=dfa, features=features, lags=[1, 2, 3])

# Assign five folds aligned to the `_pcs_2` counter, then drop helper columns (names starting with "_").
train = divide_with_pcs(df=dfa, num_divide=5, division="_pcs_2")
train = train[train.columns[~train.columns.str.startswith("_")]]

In [10]:
class Market(object):    
    """Row-by-row limit-order simulator over a price frame.

    The simulator tracks at most one resting buy order and one resting sell
    order. An order placed at row ``i`` is checked for a fill against row
    ``i + 1``'s min/max price. Once both a buy and a sell have filled, the
    round trip is closed and its return is added to the running total.

    NOTE(review): ``lb``/``ls`` are not cleared when a round trip closes and
    ``is_transaction_end`` is never reset to False, so stepping past the first
    completed round trip reuses stale limit prices — confirm this is intended.
    """
    def __init__(self, df: pd.DataFrame, features: List[str], is_eval: bool = False):
        self.market_states_cols = features
        self.market_states = df[self.market_states_cols].values
        # When True, step() takes the argmax action instead of sampling.
        self.is_eval = is_eval
        # Column order: 0=price, 1=max_price, 2=min_price, 3=buy_price, 4=sell_price.
        self.prices = df[["price", "max_price", "min_price", "buy_price", "sell_price"]].values
        # Index of the current row.
        self.i = 0
        
        # ob/os: a buy/sell limit order is resting (placed, not yet filled).
        self.ob, self.os = False, False
        # fb/fs: the buy/sell order has filled.
        self.fb, self.fs = False, False
        # lb/ls: limit price of the buy/sell order (None when no order was set).
        self.lb, self.ls = None, None
        # Step counters since an order was placed / since it filled.
        self.step_from_ob, self.step_from_os = 0, 0
        self.step_from_fb, self.step_from_fs = 0, 0
        
        # Sum of the returns of all closed round trips.
        self.sum_rtn = 0
        # Per-step log records (see get_return).
        self.rtns = list()
        self.cur_rtn_sum = 0
        # Per-step returns collected while an order/position exists; input of calc_sharp_ratio.
        self.prices_when_fill = deque()
        # "Buy" or "Sell" after the most recent fill, None otherwise.
        self.position_side = None
        
        # Becomes True once a buy and a sell have both filled.
        self.is_transaction_end = False
    
    @property
    def num_steps(self) -> int:
        """Number of steps an episode may run: row count minus two."""
        return self.prices.shape[0] - 2
            
    def step(self, logits: np.ndarray) -> Tuple[float, bool]:        
        """Apply one action and advance to the next row.

        Args:
            logits: ``logits[0]`` is the relative spread used to offset the
                limit price; ``logits[1:]`` score Hold/Buy/Sell/Cancel.

        Returns:
            ``(sharp_ratio, is_transaction_end)`` — the value of
            ``calc_sharp_ratio()`` after this step and the round-trip flag.
        """
        def sample_categorical(logits: np.ndarray) -> str:
            # Softmax with max-subtraction for numerical stability, then sample.
            logits = logits - logits.max()
            probs = np.exp(logits) / np.exp(logits).sum()
            return np.random.choice(["Hold", "Buy", "Sell", "Cancel"], p=probs)
        
        spread = logits[0]
        if self.is_eval:
            action = ["Hold", "Buy", "Sell", "Cancel"][logits[1:].argmax()]
        else:
            action = sample_categorical(logits=logits[1:])
        
        # NOTE(review): market_state is not used in this method.
        market_state = self.market_states[self.i]
        price, _, _ = self.prices[self.i, [0, 3, 4]]
        # Next row's max_price / min_price decide whether a sell / buy order fills.
        sthprice, bthprice = self.prices[self.i + 1, [1, 2]]
        
        if action == "Buy":
            if self.fb:
                # Already bought: ignore.
                pass
            else:
                # A new buy order replaces a resting sell order.
                if self.os:
                    self.unset_order_sell()
                self.set_order_buy(price=price * (1 - spread))
        elif action == "Sell":
            if self.fs:
                # Already sold: ignore.
                pass
            else:
                # A new sell order replaces a resting buy order.
                if self.ob:
                    self.unset_order_buy()
                self.set_order_sell(price=price * (1 + spread))
        elif action == "Hold":
            pass
        elif action == "Cancel":
            if self.ob:
                self.unset_order_buy()
            elif self.os:
                self.unset_order_sell()
        
        if self.ob:
            self.step_from_ob += 1
            # Buy fills when the limit price is above the next row's min_price.
            if self.lb > bthprice:
                self.fb = True
                self.ob = False
                self.step_from_ob = 0
                self.position_side = "Buy"
        
        if self.os:
            self.step_from_os += 1
            # Sell fills when the limit price is below the next row's max_price.
            if self.ls < sthprice:
                self.fs = True
                self.os = False
                self.step_from_os = 0
                self.position_side = "Sell"
        
        if self.fb:
            self.step_from_fb += 1
        
        if self.fs:
            self.step_from_fs += 1
            
        # Current return: uses both limit prices when set, otherwise the current price for the missing leg.
        if (self.lb is not None) and (self.ls is not None):
            cur_rtn = self.ls / self.lb - 1
            self.append_sharp_ration_diff()
        elif self.lb is not None:
            cur_rtn = price / self.lb - 1
            self.append_sharp_ration_diff()
        elif self.ls is not None:
            cur_rtn = self.ls / price - 1
            self.append_sharp_ration_diff()
        else:
            cur_rtn = 0
        self.cur_rtn_sum += cur_rtn
        sharp_ratio = self.calc_sharp_ratio()
        
        if self.fb and self.fs:
            # Both legs filled: close the round trip and reset per-trade state.
            self.step_from_fb, self.step_from_fs = 0, 0
            rtn = self.ls / self.lb - 1
            self.fb, self.fs = False, False
            self.sum_rtn += rtn
            self.cur_rtn_sum = 0
            self.prices_when_fill = deque()
            self.position_side = None
            self.is_transaction_end = True
        else:
            rtn = 0
        
        self.rtns.append({
            "i": self.i, "rtn": self.sum_rtn, "cur_rtn": cur_rtn,
            "ob": self.ob, "os": self.os, "fb": self.fb, "fs": self.fs,
            "act": action, "spread": spread
        })
        self.i += 1
        return sharp_ratio, self.is_transaction_end
    
    def append_sharp_ration_diff(self) -> None:
        """Append this step's one-row price return to ``prices_when_fill``.

        The first entry is always 0. Afterwards the return is signed by
        ``position_side``; nothing is appended while ``position_side`` is None.
        """
        if len(self.prices_when_fill) == 0:
            self.prices_when_fill.append(0)
        else:
            if self.position_side == "Buy":
                self.prices_when_fill.append(self.prices[self.i, 0] / self.prices[self.i - 1, 0] - 1)
            elif self.position_side == "Sell":
                self.prices_when_fill.append(self.prices[self.i - 1, 0] / self.prices[self.i, 0] - 1)
    
    def calc_sharp_ratio(self) -> float:
        """Return ``mean / std - 1`` of ``prices_when_fill``.

        Returns 0 when there are two or fewer samples or the std is zero.
        """
        if len(self.prices_when_fill) > 2:
            rtn = np.mean(self.prices_when_fill)
            sigma = np.std(self.prices_when_fill)

            if sigma == 0:
                return 0
            else:
                return rtn / sigma - 1
        else:
            return 0
    
    def set_order_buy(self, price) -> None:
        """Place (or re-price) the resting buy order at ``price``."""
        self.ob = True
        self.lb = price
        self.step_from_ob = 0
    
    def set_order_sell(self, price) -> None:
        """Place (or re-price) the resting sell order at ``price``."""
        self.os = True
        self.ls = price
        self.step_from_os = 0
    
    def unset_order_buy(self) -> None:
        """Cancel the resting buy order and forget its limit price."""
        self.ob = False
        self.lb = None
        self.step_from_ob = 0
    
    def unset_order_sell(self) -> None:
        """Cancel the resting sell order and forget its limit price."""
        self.os = False
        self.ls = None
        self.step_from_os = 0
        
    def state(self) -> np.ndarray:
        """Build the observation for the current row.

        Returns the 9 trade-state entries (four order/fill flags, the current
        return, four ``log1p`` step counters) followed by the row's market
        feature values.
        """
        if (self.lb is not None) and (self.ls is not None):
            cur_rtn = self.ls / self.lb - 1
        elif self.lb is not None:
            cur_rtn = self.prices[self.i, 0] / self.lb - 1
        elif self.ls is not None:
            cur_rtn = self.ls / self.prices[self.i, 0] - 1
        else:
            cur_rtn = 0
        trade_state = np.array([
            int(self.ob), int(self.os), int(self.fb), int(self.fs), 
            cur_rtn,
            np.log1p(self.step_from_ob), np.log1p(self.step_from_os), np.log1p(self.step_from_fb), np.log1p(self.step_from_fs),
        ])
        market_state = self.market_states[self.i]
        return np.hstack([trade_state, market_state])
    
    def get_return(self) -> pd.DataFrame:
        """Return the per-step log collected by ``step`` as a DataFrame."""
        return pd.DataFrame(self.rtns)
    

def random_market(df: pd.DataFrame, features: List[str], num_steps: int, is_eval: bool = False):
    """Create a Market over a randomly positioned window of ``num_steps`` consecutive rows of ``df``."""
    start = np.random.randint(df.shape[0] - 2 - num_steps)
    window = df.iloc[start : start + num_steps].reset_index(drop=True)
    return Market(df=window, features=features, is_eval=is_eval)

In [11]:
def plot(x: List[float], filepath: Path, xlabel: str = "", ylabel: str = ""):
    """Draw ``x`` as a line chart, save it to ``filepath`` and close the figure."""
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.plot(x)
    ax.set(xlabel=xlabel, ylabel=ylabel)
    fig.savefig(filepath)
    # Close explicitly: this is called repeatedly from the training loop.
    plt.close(fig)

### Training

In [12]:
# Hyper-parameters read by PPO.__init__.
ppo_params = {
    # Discount factor used for the cumulative rewards.
    "GAMMA": 0.999,
    # Importance ratios are clamped to [1 - EPS_CLIP, 1 + EPS_CLIP].
    "EPS_CLIP": 0.2,
    # Optimisation passes over the buffer per PPO.update call.
    "K_EPOCHS": 80,
    # Adam learning rates for the actor and critic parameter groups.
    "LR_ACTOR": 0.0003,
    "LR_CRITIC": 0.001,
    # Scale of the tanh-bounded actor outputs (column 0: SPREAD, column 4: SPREAD / 2).
    "SPREAD": 0.002
}

In [13]:
# Discrete order choices, in the same order Market.step maps the action scores to them.
actions = np.array(["Hold", "Buy", "Sell", "Cancel"])
# Action vector = 1 spread value + one score per discrete choice.
n_actions = 1 + actions.shape[0]
# State vector = the 9 trade-state entries built by Market.state() + the market features.
state_dim = 9 + len(features)

In [14]:
# Fold 4 is held out for evaluation; every other fold is used for training.
df_train, df_eval = train.loc[train["fold"] != 4].reset_index(drop=True), train.loc[train["fold"] == 4].reset_index(drop=True)
df_train.shape, df_eval.shape

((131193, 55), (32779, 55))

In [15]:
# Number of outer training iterations (each runs one rollout per worker process).
NUM_EPISODES = 10000
# Rows per sampled market window. With the 300-second bars shown in the data
# preview this is 12 bars/hour * 24 hours * 1 = one day.
# NOTE(review): the original comment here read "1 month" — confirm the intended horizon.
NUM_STEPS = 12 * 24 * 1

# TRAIN_LOG_INTERVAL = 1
# Run and log an evaluation episode every EVAL_LOG_INTERVAL iterations.
EVAL_LOG_INTERVAL = 25

# Call agent.update() every UPDATE_INTERVAL iterations.
UPDATE_INTERVAL = 5

# Output directory for plots, CSV logs and the best checkpoint.
savedir = cachedir / "task"
savedir.mkdir(exist_ok=True, parents=True)

In [16]:
def _run_episode(agent: PPO, market: Market, device: torch.device) -> Tuple[ReplayMemory, float]:
    """Roll the agent through one market episode and record every transition.

    The episode stops when the market reports the end of a transaction or
    after ``market.num_steps`` steps, whichever comes first.

    Returns:
        ``(memory, mean_reward)`` - the recorded transitions and the mean of
        the per-step rewards.
    """
    memory = ReplayMemory()
    rewards = deque()
    last_step = market.num_steps - 1
    for t in range(market.num_steps):
        state = torch.tensor(market.state(), device=device, dtype=torch.float32).view(1, -1)

        action, action_logprob = agent.select_action(state=state)

        # .cpu() keeps the NumPy conversion valid when `device` is not the CPU.
        reward, is_end = market.step(logits=action.detach().cpu().numpy().squeeze())
        rewards.append(reward)
        reward = torch.tensor([reward], device=device, dtype=torch.float32)

        # Flag the final stored step as an episode boundary even when the
        # horizon (not the market) ended the episode; otherwise discounted
        # returns would run on into the next episode once memories are merged.
        boundary = bool(is_end or t == last_step)
        memory.push(state=state, action=action, reward=reward, logprob=action_logprob, is_end=boundary)

        if is_end:
            break
    return memory, np.mean(rewards)

def run_episode(args):
    """Single-argument adapter for ``pool.map``: unpack ``args`` as keyword arguments of ``_run_episode``."""
    return _run_episode(**args)

In [None]:
agent = PPO(state_dim=state_dim, action_dim=n_actions, device=device, params=ppo_params)

# Leave one core free, but never request an empty pool on a single-core machine.
num_async = max(1, multiprocessing.cpu_count() - 1)

log = list()
best_reward = -np.inf
# Mean reward of every rollout, in collection order.
episode_durations = []
# The context manager shuts the worker processes down even if training is interrupted.
with multiprocessing.Pool(num_async) as pool:
    for i_episode in tqdm(range(NUM_EPISODES)):
        # One independent, randomly placed market window per worker.
        params = list()
        for _ in range(num_async):
            params.append({
                "agent": agent,
                "market": random_market(df=df_train, features=features, num_steps=NUM_STEPS),
                "device": device,
            })
        result = pool.map(run_episode, params)
        memories, rewards = [r[0] for r in result], [r[1] for r in result]

        episode_durations += rewards
        train_reward = np.mean(rewards)
        # NOTE(review): merge_memories replaces the agent's buffer, so only the
        # rollouts of the iteration in which update() runs are trained on.
        agent.merge_memories(memories=memories)

        del memories, rewards
        gc.collect()

        if (i_episode + 1) % UPDATE_INTERVAL == 0:
            agent.update()

        plot(x=episode_durations, filepath=savedir / "episode_durations.png", xlabel="episode", ylabel="avg reward")

        if (i_episode + 1) % EVAL_LOG_INTERVAL == 0:
            # Deterministic evaluation on the held-out frame.
            market = random_market(df=df_eval, features=features, num_steps=NUM_STEPS, is_eval=True)

            rewards = list()
            for t in range(market.num_steps):
                state = torch.tensor(market.state(), device=device, dtype=torch.float32).view(1, -1)

                action = agent.select_deterministical_action(state=state)

                # .cpu() keeps the NumPy conversion valid when `device` is not the CPU.
                reward, is_end = market.step(logits=action.detach().cpu().numpy().squeeze())
                rewards.append(reward)

            eval_returns = market.get_return()["rtn"].values
            plot(x=eval_returns, filepath=savedir / f"eval_returns_{i_episode + 1}.png", xlabel="steps", ylabel="return")
            plot(x=rewards, filepath=savedir / f"eval_reward_{i_episode + 1}.png", xlabel="steps", ylabel="reward")
            market.get_return().to_csv(savedir / f"eval_log_{i_episode + 1}.csv", index=False)

            eval_reward = np.mean(rewards)
            if eval_reward > best_reward:
                # New best evaluation reward: keep its plots and checkpoint.
                plot(x=rewards, filepath=savedir / "eval_reward_best.png", xlabel="steps", ylabel="reward")
                plot(x=eval_returns, filepath=savedir / "eval_returns_best.png", xlabel="steps", ylabel="return")
                agent.save_model(filepath=savedir / "model_best.pt")
                best_reward = eval_reward

            del market
            gc.collect()
        else:
            eval_returns = [np.nan]
            eval_reward = np.nan

        log.append({
            "episode": i_episode + 1, "train_reward": train_reward, "train_return": None,
            "eval_reward": eval_reward, "eval_return": eval_returns[-1]
        })
        gc.collect()

del agent, pool
gc.collect()

  8%|███████                                                                                   | 778/10000 [25:54<11:38:12,  4.54s/it]