In [1]:
import logging
# logging
# `logger` and `log` are two names for the same module-level logger; both are used below.
logger = log = logging.getLogger(__name__)
log.setLevel(logging.INFO)
# Install the default root handler so that INFO records are actually emitted in the notebook.
logging.basicConfig()
log.info('%s logger started.', __name__)

INFO:__main__:__main__ logger started.


In [2]:
import os
import time
import gym
from gym import spaces
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# plotting
%matplotlib notebook

In [3]:
# Wall-clock start time; the last cells subtract it from `end` to report the total run time.
start = time.time()

In [4]:
import torch
import torch.nn.functional as F
from torch import nn
from torch import Tensor
from PIL import Image
from torchvision.transforms import Compose, Resize, ToTensor
from einops import rearrange, reduce, repeat
from einops.layers.torch import Rearrange, Reduce

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# params
# window_length: default history window of PortfolioEnv; PatchEmbedding also derives its
#                expected input height from it (window_length + 1).
window_length = 59
# steps: number of environment steps per episode passed to PortfolioEnv.
steps = 128

In [6]:
# save dir
# Output directory; created up front (no error if it already exists).
save_dir = "./outputs/pytorch-DDPG/"
os.makedirs(save_dir, exist_ok=True)

# Environment Definition

In [7]:
# from rl_portfolio_management.environments.portfolio import PortfolioEnv
from rl_portfolio_management.callbacks.notebook_plot import LivePlotNotebook
from rl_portfolio_management.config import eps

In [8]:
from rl_portfolio_management.util import sharpe

class DataSrc(object):
    """Acts as data provider for each new episode.

    The full history is held as an array of shape (assets, time, features).
    ``reset`` cuts out the slice for one episode and ``_step`` walks a sliding
    window of ``window_length`` rows through that slice.
    """

    def __init__(self,
                 df,
                 steps=252,
                 scale=True, scale_extra_cols=True, augment=0.00, window_length=50, random_reset=True):
        """
        DataSrc.

        df - data frame indexed by timestamps with two-level columns
             levels=[['LTCBTC', ...], ['open', 'low', 'high', 'close', ...]]
        steps - total steps in episode
        scale - scale the price columns of every window by the last close price
        scale_extra_cols - scale extra columns by global mean and std
        augment - std of the gaussian noise added to each episode's data
        window_length - number of rows in each returned history window
        random_reset - reset to a random time (otherwise continue through time)
        """
        self.steps = steps + 1
        self.augment = augment
        self.random_reset = random_reset
        self.scale = scale
        self.scale_extra_cols = scale_extra_cols
        self.window_length = window_length
        self.idx = self.window_length

        # Get rid of NaN's. `replace` returns a new frame, so the caller's df is untouched.
        # `ffill()` is the non-deprecated spelling of `fillna(method="pad")`.
        df = df.replace(np.nan, 0).ffill()

        # dataframe to matrix
        # NOTE(review): the reshape assumes the columns are ordered asset-major in the same
        # (sorted) order as `columns.levels` - confirm for the input files.
        self.asset_names = df.columns.levels[0].tolist()
        self.features = df.columns.levels[1].tolist()
        data = df.values.reshape(
            (len(df), len(self.asset_names), len(self.features)))
        self._data = np.transpose(data, (1, 0, 2))  # -> (assets, time, features)
        self._times = df.index

        # The first len(price_columns) features are treated as prices and feature 0 as the
        # close price (see _step).
        self.price_columns = ['close', 'high', 'low']
        self.non_price_columns = set(
            df.columns.levels[1]) - set(self.price_columns)

        # Stats to let us normalize non price columns (one mean/std per feature)
        if scale_extra_cols:
            x = self._data.reshape((-1, len(self.features)))
            self.stats = dict(mean=x.mean(0), std=x.std(0))

        self.reset()

    def _step(self):
        """Return ``(history, y1, done)`` for the current position and advance by one.

        history - window of shape (assets, window_length, features)
        y1 - price relatives close[t] / close[t-1], with 1.0 prepended for cash
        done - True once ``self.steps`` windows have been served
        """
        # get history matrix from dataframe
        data_window = self.data[:, self.step:self.step +
                                self.window_length].copy()
        # (eq.1) prices
        y1 = data_window[:, -1, 0] / data_window[:, -2, 0]
        y1 = np.concatenate([[1.0], y1])  # add cash price

        # (eq 18) X: prices are divided by close price
        nb_pc = len(self.price_columns)
        if self.scale:
            last_close_price = data_window[:, -1, 0]
            data_window[:, :, :nb_pc] /= last_close_price[:,
                                                          np.newaxis, np.newaxis]

        if self.scale_extra_cols:
            # normalize non price columns; `extra` is a view, so data_window is updated in place
            extra = data_window[:, :, nb_pc:]
            extra -= self.stats["mean"][None, None, nb_pc:]
            extra /= self.stats["std"][None, None, nb_pc:]
            # zero out inf/nan (e.g. from a zero std) before clipping
            data_window[~np.isfinite(data_window)] = 0
            # The values are z-scores now, so outliers are clipped at +/-10 standard deviations.
            # (Clipping with the raw-unit bounds mean +/- 10*std squashed every value onto
            # the lower bound whenever mean > 10*std.)
            np.clip(extra, -10, 10, out=extra)

        self.step += 1
        history = data_window
        done = bool(self.step >= self.steps)

        return history, y1, done

    def reset(self):
        """Select the data slice for the next episode and rewind the step counter."""
        self.step = 0
        n_rows = self._data.shape[1]

        # get data for this episode
        if self.random_reset:
            self.idx = np.random.randint(
                low=self.window_length + 1, high=n_rows - self.steps - 2)
        else:
            # continue sequentially, before resetting to start
            past_limit = self.idx > (n_rows - self.steps - self.window_length - 1)
            # the next slice ends at (idx + steps) + steps + 1; wrap if that runs off the data
            would_overflow = self.idx + 2 * self.steps + 1 > n_rows
            if past_limit or would_overflow:
                self.idx = self.window_length + 1
            else:
                self.idx += self.steps
        data = self._data[:, self.idx -
                          self.window_length:self.idx + self.steps + 1].copy()
        self.times = self._times[self.idx -
                                 self.window_length:self.idx + self.steps + 1]

        # augment data to prevent overfitting
        data += np.random.normal(loc=0, scale=self.augment, size=data.shape)

        self.data = data


class PortfolioSim(object):
    """
    Portfolio management sim.

    Params:
    - asset_names: names of the tradable assets (cash is added implicitly as index 0)
    - steps: episode length, used to scale the reward
    - trading_cost: e.g. 0.0025 is max in Poloniex
    - time_cost: fractional cost charged on the portfolio value every step

    Based on [Jiang 2017](https://arxiv.org/abs/1706.10059)
    """

    def __init__(self, asset_names=None, steps=128, trading_cost=0.0025, time_cost=0.0):
        self.cost = trading_cost
        self.time_cost = time_cost
        self.steps = steps
        # `None` replaces the former mutable `[]` default so instances never share one list.
        self.asset_names = [] if asset_names is None else asset_names
        self.reset()

    def _step(self, w1, y1):
        """
        Step.

        w1 - new action of portfolio weights - e.g. [0.1,0.9, 0.0]
        y1 - price relative vector also called return
            e.g. [1.0, 0.9, 1.1]
        Numbered equations are from https://arxiv.org/abs/1706.10059

        Returns (reward, info, done); `info` is also appended to `self.infos`.
        """
        w0 = self.w0
        p0 = self.p0

        dw1 = (y1 * w0) / (np.dot(y1, w0) + eps)  # (eq7) weights evolve into

        # (eq16) cost to change portfolio
        # (excluding change in cash to avoid double counting for transaction cost)
        c1 = self.cost * (
            np.abs(dw1[1:] - w1[1:])).sum()

        # Portfolio value after costs.
        # NOTE(review): this uses the weighted geometric mean exp(dot(log(y1), w0)), whereas the
        # eq7 line above uses the arithmetic dot(y1, w0); log(y1) is also -inf for a zero price
        # relative. Confirm the geometric form is intended.
        p1 = p0 * (1 - c1) * np.exp(np.dot(np.log(y1), w0))

        p1 = p1 * (1 - self.time_cost)  # we can add a cost to holding

        rho1 = p1 / p0 - 1  # rate of returns
        r1 = np.log((p1 + eps) / (p0 + eps))  # (eq10) log rate of return

        # keep every log return of the episode; the reward is computed from all of them
        self.ret.append(r1)

        # reward: `sharpe` of the episode's log returns so far, scaled by episode length
        reward = sharpe(np.array(self.ret)) / self.steps

        # remember for next step
        self.w0 = w1
        self.p0 = p1

        # if we run out of money, we're done
        done = bool(p1 == 0)

        # should only return single values, not list
        info = {
            "reward": reward,
            "log_return": r1,
            "portfolio_value": p1,
            "market_return": y1.mean(),
            "rate_of_return": rho1,
            "weights_mean": w1.mean(),
            "weights_std": w1.std(),
            "cost": c1,
        }
        # record weights and prices
        for i, name in enumerate(['CASH'] + self.asset_names):
            info['weight_' + name] = w1[i]
            info['price_' + name] = y1[i]

        self.infos.append(info)
        return reward, info, done

    def reset(self):
        """Start a new episode: all funds in cash, portfolio value 1.0, empty histories."""
        self.infos = []
        self.w0 = np.array([1.0] + [0.0] * len(self.asset_names))
        self.p0 = 1.0
        self.ret = []  # log returns collected during the episode, used for the sharpe reward


class PortfolioEnv(gym.Env):
    """
    An environment for financial portfolio management.

    Financial portfolio management is the process of constant redistribution of a fund into different
    financial products.

    Based on [Huang 2020](https://arxiv.org/abs/2012.13773)
    """
    # Because of google colab, we cannot implement the GUI ('human' render mode)
    metadata = {'render.modes': ['notebook', 'ansi']}

    def __init__(self,
                 df,
                 steps=256,
                 trading_cost=0.0025,
                 time_cost=0.00,
                 window_length=window_length,
                 augment=0.00,
                 output_mode='EIIE',
                 log_dir=None,
                 scale=True,
                 scale_extra_cols=True,
                 random_reset=True
                 ):
        """
        An environment for financial portfolio management.

        Params:
            df - data frame indexed by timestamps
                 with multi-index columns levels=[['LTCBTC'],...],['open','low','high','close']]
            steps - steps in episode
            window_length - how many past observations["history"] to return
            trading_cost - cost of trade as a fraction,  e.g. 0.0025 corresponding to max rate of 0.25% at Poloniex (2017)
            time_cost - cost of holding as a fraction
            augment - fraction to randomly shift data by
            output_mode: decides observation["history"] shape
            - 'EIIE' for (assets, window, features)
            - 'atari' for (window, window, features) (assets is padded)
            - 'mlp' for (assets*window*features)
            log_dir: directory to save plots to
            scale - scales the price data of each window by its last close price (except return)
            scale_extra_cols - scales non price data using mean and std for whole dataset
            random_reset - start each episode at a random time instead of continuing sequentially

        Raises:
            ValueError - if output_mode is not one of 'EIIE', 'atari', 'mlp'
        """
        super(PortfolioEnv, self).__init__()
        self.src = DataSrc(df=df, steps=steps, scale=scale, scale_extra_cols=scale_extra_cols,
                           augment=augment, window_length=window_length,
                           random_reset=random_reset)
        self._plot = self._plot2 = self._plot3 = None
        self.output_mode = output_mode
        self.sim = PortfolioSim(
            asset_names=self.src.asset_names,
            trading_cost=trading_cost,
            time_cost=time_cost,
            steps=steps)
        self.log_dir = log_dir
        # per-step info dicts of the current episode; reset() clears it again
        self.infos = []

        # openai gym attributes
        # action will be the portfolio weights [cash_bias,w1,w2...], each within [0, 1]
        nb_assets = len(self.src.asset_names)
        self.action_space = gym.spaces.Box(0, 1.0, shape=(nb_assets + 1,), dtype=np.float32)

        # history shape for each output mode (must agree with the reshaping done in step())
        nb_features = len(self.src.features)
        if output_mode == 'EIIE':
            obs_shape = (nb_assets, window_length, nb_features)
        elif output_mode == 'atari':
            # step() pads the asset axis up to the window length
            obs_shape = (window_length, window_length, nb_features)
        elif output_mode == 'mlp':
            obs_shape = (nb_assets * window_length * nb_features,)
        else:
            raise ValueError(
                "output_mode should be 'EIIE', 'atari' or 'mlp' but is %r" % (output_mode,))

        self.observation_space = gym.spaces.Dict({
            'history': gym.spaces.Box(
                -10,
                20 if scale else 1,  # if scale=True observed price changes return could be large fractions
                obs_shape
            ),
            'weights': self.action_space
        })

    def reset(self):
        """Reset simulator and data source, then take one step with the all-cash weights."""
        self.sim.reset()
        self.src.reset()
        self.infos = []
        action = self.sim.w0
        observation, reward, done, info = self.step(action)
        return observation

    def step(self, action):
        """
        Step the env.

        Actions should be portfolio [w0...]
        - Where wn is a portfolio weight between 0 and 1. The first (w0) is cash_bias
        - cn is the portfolio conversion weights see PortfolioSim._step for description

        Returns (observation, reward, done, info) where observation is a dict with
        keys 'history' and 'weights'.
        """
        logger.debug('action: %s', action)
        weights = action

        # Sanity checks
        assert self.action_space.contains(
            action), 'action should be within %r but is %r' % (self.action_space, action)
        np.testing.assert_almost_equal(
            np.sum(weights), 1.0, 3, err_msg='weights should sum to 1. action="%s"' % weights)

        history, y1, done1 = self.src._step()

        reward, info, done2 = self.sim._step(weights, y1)

        # calculate return for buy and hold a bit of each asset:
        # running product of the market returns seen so far in this episode
        previous_value = self.infos[-1]['market_value'] if self.infos else 1.0
        info['market_value'] = previous_value * info['market_return']
        # add dates
        info['date'] = self.src.times[self.src.step].timestamp()
        info['steps'] = self.src.step

        self.infos.append(info)

        # reshape history according to output mode
        if self.output_mode == 'EIIE':
            pass
        elif self.output_mode == 'atari':
            # pad the asset axis so that history becomes (window, window, features)
            padding = history.shape[1] - history.shape[0]
            history = np.pad(history, [[0, padding], [
                0, 0], [0, 0]], mode='constant')
        elif self.output_mode == 'mlp':
            history = history.flatten()

        return {'history': history, 'weights': weights}, reward, done1 or done2, info

    def _seed(self, seed):
        """Seed numpy's global random generator (used by DataSrc) and return [seed]."""
        np.random.seed(seed)
        return [seed]

    def render(self, mode='notebook', close=False):
        """Pretty-print the latest info dict ('ansi') or update the live plots ('notebook')."""
        if mode == 'ansi':
            # imported here because the notebook's import cells do not provide pprint
            from pprint import pprint
            pprint(self.infos[-1])
        elif mode == 'notebook':
            self.plot_notebook(close)

    def plot_notebook(self, close=False):
        """Live plot using the jupyter notebook rendering of matplotlib."""

        if close:
            self._plot = self._plot2 = self._plot3 = None
            return

        df_info = pd.DataFrame(self.infos)
        df_info.index = pd.to_datetime(df_info["date"], unit='s')

        # plot prices and performance
        all_assets = ['CASH'] + self.sim.asset_names
        if not self._plot:
            colors = [None] * len(all_assets) + ['black']
            self._plot_dir = os.path.join(
                self.log_dir, 'notebook_plot_prices_' + str(time.time())) if self.log_dir else None
            self._plot = LivePlotNotebook(
                log_dir=self._plot_dir, title='prices & performance', labels=all_assets + ["Portfolio"], ylabel='value', colors=colors)
        x = df_info.index
        y_portfolio = df_info["portfolio_value"]
        y_assets = [df_info['price_' + name].cumprod()
                    for name in all_assets]
        self._plot.update(x, y_assets + [y_portfolio])

        # plot portfolio weights
        if not self._plot2:
            self._plot_dir2 = os.path.join(
                self.log_dir, 'notebook_plot_weights_' + str(time.time())) if self.log_dir else None
            self._plot2 = LivePlotNotebook(
                log_dir=self._plot_dir2, labels=all_assets, title='weights', ylabel='weight')
        ys = [df_info['weight_' + name] for name in all_assets]
        self._plot2.update(x, ys)

        # plot portfolio costs
        if not self._plot3:
            self._plot_dir3 = os.path.join(
                self.log_dir, 'notebook_plot_cost_' + str(time.time())) if self.log_dir else None
            self._plot3 = LivePlotNotebook(
                log_dir=self._plot_dir3, labels=['cost'], title='costs', ylabel='cost')
        ys = [df_info['cost']]
        self._plot3.update(x, ys)


# APPLY ENV

In [9]:
from rl_portfolio_management.util import MDD, sharpe, softmax, MDD1, sortino, calmar, other_metrics
from rl_portfolio_management.wrappers import SoftmaxActions, TransposeHistory, ConcatStates

# Train and test frames are stored in the same HDF file under the keys 'train' and 'test'.
df_train = pd.read_hdf('./data/chinaStock_1d_vol.hf',key='train')
df_test = pd.read_hdf('./data/chinaStock_1d_vol.hf',key='test')

In [10]:
import gym
class DeepRLWrapper(gym.Wrapper):
    """Outermost wrapper: exposes a few convenience attributes and optional render-on-reset."""

    def __init__(self, env):
        super().__init__(env)
        # when switched on, reset() renders the finished episode first
        self.render_on_reset = False

        # shapes taken from the wrapped env's spaces
        self.state_dim = self.observation_space.shape
        self.action_dim = self.action_space.shape[0]

        self.name = 'PortfolioEnv'
        self.success_threshold = 2

    def normalize_state(self, state):
        """Identity: the state is passed through unchanged."""
        return state

    def step(self, action):
        """Forward the action to the wrapped env and return its 4-tuple unchanged."""
        observation, step_reward, finished, details = self.env.step(action)
        return observation, step_reward, finished, details

    def reset(self):
        """Optionally plot the episode that just ended, then reset the wrapped env."""
        # a roundabout way to get a plot on every reset
        if self.render_on_reset:
            self.env.render('notebook')

        return self.env.reset()

In [11]:
# Bare (unwrapped) training environment; `env` is rebound to the wrapped version further down.
env = PortfolioEnv(df=df_train, steps=steps, output_mode='EIIE')

In [12]:
def task_fn():
    """Build the training environment on `df_train` with the full wrapper stack applied."""
    wrapped = PortfolioEnv(df=df_train, steps=steps, output_mode='EIIE')
    # wrappers are applied innermost first
    for wrapper in (TransposeHistory, ConcatStates, SoftmaxActions, DeepRLWrapper):
        wrapped = wrapper(wrapped)
    return wrapped

def task_fn_test(df=df_test, steps=steps):
    """Build the evaluation environment with the same wrapper stack as `task_fn`.

    df - data frame to evaluate on (defaults to the test split)
    steps - steps per episode
    """
    # use the `df` argument; previously the global df_test was used regardless of it
    env = PortfolioEnv(df=df, steps=steps, output_mode='EIIE')
    env = TransposeHistory(env)
    env = ConcatStates(env)
    env = SoftmaxActions(env)
    env = DeepRLWrapper(env)
    return env
    
# sanity check: the observation from reset() and from one random step should have the same shape
task = task_fn()
task.reset().shape, task.step(task.action_space.sample())[0].shape

((6, 60, 12), (6, 60, 12))

In [13]:
task.observation_space

Box([[[-10. -10. -10. ... -10. -10. -10.]
  [-10. -10. -10. ... -10. -10. -10.]
  [-10. -10. -10. ... -10. -10. -10.]
  ...
  [-10. -10. -10. ... -10. -10. -10.]
  [-10. -10. -10. ... -10. -10. -10.]
  [-10. -10. -10. ... -10. -10. -10.]]

 [[-10. -10. -10. ... -10. -10. -10.]
  [-10. -10. -10. ... -10. -10. -10.]
  [-10. -10. -10. ... -10. -10. -10.]
  ...
  [-10. -10. -10. ... -10. -10. -10.]
  [-10. -10. -10. ... -10. -10. -10.]
  [-10. -10. -10. ... -10. -10. -10.]]

 [[-10. -10. -10. ... -10. -10. -10.]
  [-10. -10. -10. ... -10. -10. -10.]
  [-10. -10. -10. ... -10. -10. -10.]
  ...
  [-10. -10. -10. ... -10. -10. -10.]
  [-10. -10. -10. ... -10. -10. -10.]
  [-10. -10. -10. ... -10. -10. -10.]]

 [[-10. -10. -10. ... -10. -10. -10.]
  [-10. -10. -10. ... -10. -10. -10.]
  [-10. -10. -10. ... -10. -10. -10.]
  ...
  [-10. -10. -10. ... -10. -10. -10.]
  [-10. -10. -10. ... -10. -10. -10.]
  [-10. -10. -10. ... -10. -10. -10.]]

 [[-10. -10. -10. ... -10. -10. -10.]
  [-10. -10. -

In [14]:
# Instantiate the env
# Wrapped training environment (replaces the bare PortfolioEnv bound to `env` earlier).
env = task_fn()

In [15]:
# Wrapped evaluation environment built with task_fn_test's defaults.
env_test = task_fn_test()

In [16]:
env_test.observation_space

Box([[[-10. -10. -10. ... -10. -10. -10.]
  [-10. -10. -10. ... -10. -10. -10.]
  [-10. -10. -10. ... -10. -10. -10.]
  ...
  [-10. -10. -10. ... -10. -10. -10.]
  [-10. -10. -10. ... -10. -10. -10.]
  [-10. -10. -10. ... -10. -10. -10.]]

 [[-10. -10. -10. ... -10. -10. -10.]
  [-10. -10. -10. ... -10. -10. -10.]
  [-10. -10. -10. ... -10. -10. -10.]
  ...
  [-10. -10. -10. ... -10. -10. -10.]
  [-10. -10. -10. ... -10. -10. -10.]
  [-10. -10. -10. ... -10. -10. -10.]]

 [[-10. -10. -10. ... -10. -10. -10.]
  [-10. -10. -10. ... -10. -10. -10.]
  [-10. -10. -10. ... -10. -10. -10.]
  ...
  [-10. -10. -10. ... -10. -10. -10.]
  [-10. -10. -10. ... -10. -10. -10.]
  [-10. -10. -10. ... -10. -10. -10.]]

 [[-10. -10. -10. ... -10. -10. -10.]
  [-10. -10. -10. ... -10. -10. -10.]
  [-10. -10. -10. ... -10. -10. -10.]
  ...
  [-10. -10. -10. ... -10. -10. -10.]
  [-10. -10. -10. ... -10. -10. -10.]
  [-10. -10. -10. ... -10. -10. -10.]]

 [[-10. -10. -10. ... -10. -10. -10.]
  [-10. -10. -

In [17]:
from stable_baselines3.common.env_checker import check_env
# Run stable-baselines3's environment checker on the wrapped training env.
check_env(env)

  f"It seems that your observation {key} is an image but the `dtype` "
  f"It seems that your observation space {key} is an image but the "
  "The minimal resolution for an image is 36x36 for the default `CnnPolicy`. "
  "We recommend you to use a symmetric and normalized Box action space (range=[-1, 1]) "


In [18]:
# check_env(env_test)

In [19]:
# First observation axis: used below as the default `in_channels` of the ViT.
print(task.reset().shape[0])
# num_act = task.reset().shape[2]
# Action dimension minus one (the cash entry): used below as the expected input width.
env.action_space.shape[-1] - 1

6


12

# VIT网络结构部分：

In [20]:
emb_size = 256
class PatchEmbedding(nn.Module):
    """Split the input into non-overlapping patches and embed them as a token sequence.

    A strided convolution projects every patch_size x patch_size patch to `emb_size`
    channels; a learnable class token is prepended and learnable position embeddings
    are added.

    in_channels - channels of the input (first observation axis)
    patch_size - side length of the square patches
    emb_size - embedding dimension of each token
    """

    def __init__(self, in_channels: int = task.reset().shape[0], patch_size: int = 3, emb_size: int = emb_size):
        super().__init__()
        self.patch_size = patch_size
        self.projection = nn.Sequential(
            # using a conv layer instead of a linear one -> performance gains
            nn.Conv2d(in_channels, emb_size, kernel_size=self.patch_size, stride=self.patch_size),
            Rearrange('b e (h) (w) -> b (h w) e'),
        )
        # expected input height/width, taken from the notebook-level configuration
        self.h = window_length + 1
        self.w = env.action_space.shape[-1] - 1
        self.cls_token = nn.Parameter(torch.randn(1, 1, emb_size))
        # One position per patch plus one for the class token. Each axis is floored
        # separately: `h // p * w // p` parses as `((h // p) * w) // p`, which gives the
        # wrong count whenever w is not a multiple of the patch size.
        n_patches = (self.h // self.patch_size) * (self.w // self.patch_size)
        self.positions = nn.Parameter(torch.randn(n_patches + 1, emb_size))

    def forward(self, x: Tensor) -> Tensor:
        """Map (batch, channels, height, width) to (batch, n_patches + 1, emb_size)."""
        b, _, _, _ = x.shape
        x = self.projection(x)
        cls_tokens = repeat(self.cls_token, '() n e -> b n e', b=b)
        # prepend the cls token to the input
        x = torch.cat([cls_tokens, x], dim=1)
        # add position embedding
        x = x + self.positions
        return x

In [21]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    emb_size - token embedding dimension (must be divisible by num_heads)
    num_heads - number of attention heads
    dropout - dropout probability applied to the attention weights
    """

    def __init__(self, emb_size: int = emb_size, num_heads: int = 32, dropout: float = 0):
        super().__init__()
        if emb_size % num_heads != 0:
            raise ValueError(
                "emb_size (%d) must be divisible by num_heads (%d)" % (emb_size, num_heads))
        self.emb_size = emb_size
        self.num_heads = num_heads
        self.keys = nn.Linear(emb_size, emb_size)
        self.queries = nn.Linear(emb_size, emb_size)
        self.values = nn.Linear(emb_size, emb_size)
        self.att_drop = nn.Dropout(dropout)
        self.projection = nn.Linear(emb_size, emb_size)

    def forward(self, x : Tensor, mask: Tensor = None) -> Tensor:
        """Attend over the token axis of `x` (batch, tokens, emb_size).

        mask - optional boolean tensor broadcastable to (batch, heads, queries, keys);
               positions where it is False are excluded from the attention.
        Returns a tensor with the same shape as `x`.
        """
        # split keys, queries and values in num_heads
        queries = rearrange(self.queries(x), "b n (h d) -> b h n d", h=self.num_heads)
        keys = rearrange(self.keys(x), "b n (h d) -> b h n d", h=self.num_heads)
        values = rearrange(self.values(x), "b n (h d) -> b h n d", h=self.num_heads)
        # dot product of every query with every key
        energy = torch.einsum('bhqd, bhkd -> bhqk', queries, keys)  # batch, num_heads, query_len, key_len

        # Scale the logits by sqrt(head_dim) *before* the softmax. Dividing the softmax
        # output instead leaves the distribution unsoftened and the weights no longer sum to 1.
        head_dim = self.emb_size // self.num_heads
        energy = energy / (head_dim ** 0.5)

        if mask is not None:
            fill_value = torch.finfo(energy.dtype).min
            # masked_fill is out-of-place, so the result has to be kept
            energy = energy.masked_fill(~mask, fill_value)

        att = F.softmax(energy, dim=-1)
        att = self.att_drop(att)
        # weighted sum of the values
        out = torch.einsum('bhal, bhlv -> bhav', att, values)
        out = rearrange(out, "b h n d -> b n (h d)")
        out = self.projection(out)
        return out


In [22]:
class ResidualAdd(nn.Module):
    """Wrap `fn` with a skip connection: forward(x) = fn(x) + x."""

    def __init__(self, fn):
        super().__init__()
        self.fn = fn

    def forward(self, x, **kwargs):
        """Apply `fn` (forwarding any keyword arguments) and add the input back."""
        # Out-of-place add: an in-place `+=` on fn's output would also overwrite the
        # caller's tensor whenever fn returns its input (e.g. nn.Identity).
        return self.fn(x, **kwargs) + x

In [23]:
class FeedForwardBlock(nn.Sequential):
    """Position-wise MLP: Linear (expand) -> GELU -> Linear (project back).

    emb_size - input and output width
    expansion - the hidden width is expansion * emb_size
    drop_p - accepted for interface compatibility; no dropout layer is built
    """

    def __init__(self, emb_size: int=emb_size, expansion: int = 4, drop_p: float = 0):
        hidden_size = expansion * emb_size
        widen = nn.Linear(emb_size, hidden_size)
        activation = nn.GELU()
        narrow = nn.Linear(hidden_size, emb_size)
        super().__init__(widen, activation, narrow)

In [24]:
class TransformerEncoderBlock(nn.Sequential):
    """Pre-norm transformer encoder block: attention and feed-forward, each with a residual.

    emb_size - token embedding dimension
    drop_p - dropout after the attention and after the feed-forward sub-block
    forward_expansion - hidden-width multiplier of the feed-forward block
    forward_drop_p - passed to FeedForwardBlock as drop_p
    **kwargs - forwarded to MultiHeadAttention
    """

    def __init__(self,
                 emb_size: int = emb_size,
                 drop_p: float = 0,
                 forward_expansion: int = 4,
                 forward_drop_p: float = 0,
                 ** kwargs):
        # attention sub-block: norm -> self-attention -> dropout, wrapped in a skip connection
        attention_branch = ResidualAdd(nn.Sequential(
            nn.LayerNorm(emb_size),
            MultiHeadAttention(emb_size, **kwargs),
            nn.Dropout(drop_p),
        ))
        # feed-forward sub-block: norm -> MLP -> dropout, wrapped in a skip connection
        feed_forward_branch = ResidualAdd(nn.Sequential(
            nn.LayerNorm(emb_size),
            FeedForwardBlock(
                emb_size, expansion=forward_expansion, drop_p=forward_drop_p),
            nn.Dropout(drop_p),
        ))
        super().__init__(attention_branch, feed_forward_branch)


In [25]:
class ClassificationHead(nn.Sequential):
    """Pool the token sequence into one feature vector: mean over tokens, then LayerNorm.

    emb_size - token embedding dimension (also the output width)
    n_classes - accepted for interface compatibility; unused because no final
                Linear(emb_size, n_classes) layer is built
    """

    def __init__(self, emb_size: int = emb_size, n_classes: int = emb_size):
        # No lookup of the notebook-global `env` here: the value it produced was never
        # used and only made construction depend on hidden notebook state.
        super().__init__(
            Reduce('b n e -> b e', reduction='mean'),
            nn.LayerNorm(emb_size),
        )

In [26]:
class ViT(nn.Module):
    """Vision-transformer feature extractor: patch embedding, `depth` encoder blocks, pooling head.

    in_channels - channels of the input (first observation axis)
    patch_size - side length of the square patches
    emb_size - token embedding dimension
    depth - number of TransformerEncoderBlock layers
    n_classes - forwarded to ClassificationHead
    **kwargs - forwarded to every TransformerEncoderBlock
    """

    def __init__(self,
                in_channels: int = task.reset().shape[0],
                patch_size: int = 3,
                emb_size: int = emb_size,
                depth: int = 6,
                n_classes: int = emb_size,
                **kwargs):
        super().__init__()
        # assemble the pipeline stage by stage, in execution order
        stages = [PatchEmbedding(in_channels, patch_size, emb_size)]
        for _ in range(depth):
            stages.append(TransformerEncoderBlock(emb_size=emb_size, **kwargs))
        stages.append(ClassificationHead(emb_size, n_classes))
        self.layers = nn.Sequential(*stages)

    def forward(self, x: Tensor) -> Tensor:
        """Run the input through all stages."""
        return self.layers(x)

# sb3中自定义网络：

In [27]:
import torch as th
# import torch.nn as nn
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor

# from rl_portfolio_management.customExtractor import CustomCNN
class CustomCNN(BaseFeaturesExtractor):
    """
    Features extractor that runs observations through a ViT followed by Linear + ReLU.

    :param observation_space: (gym.Space)
    :param features_dim: (int) Number of features extracted.
        This corresponds to the number of unit for the last layer.
    """

    def __init__(self,
                 observation_space: gym.spaces.Box,
                 features_dim: int = emb_size):
        super(CustomCNN, self).__init__(observation_space, features_dim)
        # We assume CxHxW inputs (channels first)
        # Re-ordering will be done by pre-preprocessing or wrapper
        n_input_channels = observation_space.shape[0]

        # Build the ViT for the channel count of *this* observation space instead of
        # relying on the default that was captured from a notebook-global env.
        self.vit = ViT(in_channels=n_input_channels)

        # Compute the ViT output width by doing one forward pass on a sampled observation
        with th.no_grad():
            n_flatten = self.vit(
                th.as_tensor(observation_space.sample()[None]).float()
            ).shape[1]

        self.linear = nn.Sequential(nn.Linear(n_flatten, features_dim, bias=True), nn.ReLU())

    def forward(self, observations: th.Tensor) -> th.Tensor:
        """Return the (batch, features_dim) features for a batch of observations."""
        x = self.vit(observations)
        return self.linear(x)

In [28]:
from typing import Callable, Dict, List, Optional, Tuple, Type, Union
from stable_baselines3.common.policies import ActorCriticPolicy
from stable_baselines3.common.policies import BaseModel, BasePolicy, register_policy

# from stable_baselines3.common.policies import BaseModel, BasePolicy, create_sde_features_extractor, register_policy
# from stable_baselines3.td3.policies import Actor, TD3Policy
# from stable_baselines3.common.preprocessing import get_action_dim

In [29]:
# class CustomNetwork(nn.Module):
#     """
#     Custom network for policy and value function.
#     It receives as input the features extracted by the feature extractor.

#     :param feature_dim: dimension of the features extracted with the features_extractor (e.g. features from a CNN)
#     :param last_layer_dim_pi: (int) number of units for the last layer of the policy network
#     :param last_layer_dim_vf: (int) number of units for the last layer of the value network
#     """

#     def __init__(
#         self,
#         feature_dim: int,
#         last_layer_dim_pi: int = 64,
#         last_layer_dim_vf: int = 64,
#     ):
#         super(CustomNetwork, self).__init__()
        
#         # IMPORTANT:
#         # Save output dimensions, used to create the distributions
#         self.latent_dim_pi = last_layer_dim_pi
#         self.latent_dim_vf = last_layer_dim_vf
#         dropout_half = nn.Dropout(p=0.5)

#         # Policy network
#         self.policy_net = nn.Sequential(
#             nn.Linear(feature_dim, 256), 
#             nn.ReLU(),
#             dropout_half,
#             nn.Linear(256, 128), 
#             nn.ReLU(),
#             dropout_half,
#             nn.Linear(128, last_layer_dim_pi), 
#         )
#         # Value network
#         self.value_net = nn.Sequential(
#             nn.Linear(feature_dim, 128), 
#             nn.ReLU(),
#             dropout_half,
#             nn.Linear(128, 64), 
#             nn.ReLU(),
#             dropout_half,
#             nn.Linear(64, last_layer_dim_vf), 
#         )

#     def forward(self, features: th.Tensor) -> Tuple[th.Tensor, th.Tensor]:
#         """
#         :return: (th.Tensor, th.Tensor) latent_policy, latent_value of the specified network.
#             If all layers are shared, then ``latent_policy == latent_value``
#         """
#         return self.policy_net(features), self.value_net(features)


# class CustomActorCriticPolicy(ActorCriticPolicy):
#     def __init__(
#         self,
#         observation_space: gym.spaces.Space,
#         action_space: gym.spaces.Space,
#         lr_schedule: Callable[[float], float],
#         net_arch: Optional[List[Union[int, Dict[str, List[int]]]]] = None,
#         activation_fn: Type[nn.Module] = nn.Tanh,
#         *args,
#         **kwargs,
#     ):

#         super(CustomActorCriticPolicy, self).__init__(
#             observation_space,
#             action_space,
#             lr_schedule,
#             net_arch,
#             activation_fn,
#             # Pass remaining arguments to base class
#             *args,
#             **kwargs,
#         )
#         # Disable orthogonal initialization
#         self.ortho_init = False

#     def _build_mlp_extractor(self) -> None:
#         self.mlp_extractor = CustomNetwork(self.features_dim,
#                                            last_layer_dim_pi = env.action_space.shape[-1],
#                                            last_layer_dim_vf = 1,
#                                           )
        

In [30]:
from stable_baselines3 import SAC
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise

# Zero-mean Gaussian exploration noise with one independent component per action dimension
n_actions = env.action_space.shape[-1]
action_noise = NormalActionNoise(
    mean=np.zeros(n_actions),
    sigma=np.full(n_actions, 0.27),
)

# Plug the ViT-based extractor into the policy; its output width is emb_size
policy_kwargs = {
    "features_extractor_class": CustomCNN,
    "features_extractor_kwargs": {"features_dim": emb_size},
}

In [31]:
# model = PPO('MlpPolicy', env, policy_kwargs=policy_kwargs,
#             tensorboard_log="./runs/ppo-vit1-softmax",
# #             learning_rate=2e-4,
#             batch_size=256,
#             verbose=1)

In [32]:
# SAC agent on the wrapped training env. policy_kwargs swaps in the ViT-based features
# extractor, action_noise adds Gaussian exploration noise, and TensorBoard logs go to ./runs.
model = SAC("MlpPolicy", env, policy_kwargs=policy_kwargs,
            tensorboard_log="./runs/SAC-vit-softmax-maxSharpe",
#             learning_rate=3e-4,
            batch_size=256,
            action_noise=action_noise, 
            verbose=1
           )

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [33]:
# model = TD3(CustomTD3Policy, env, policy_kwargs=policy_kwargs, verbose=1)

In [34]:
# Train for up to 4.5M environment steps (the recorded run was stopped with a KeyboardInterrupt).
model.learn(4500000) 

Logging to ./runs/SAC-vit-softmax-maxSharpe/SAC_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 128      |
|    ep_rew_mean     | -0.947   |
| time/              |          |
|    episodes        | 4        |
|    fps             | 3        |
|    time_elapsed    | 164      |
|    total_timesteps | 512      |
| train/             |          |
|    actor_loss      | -25.2    |
|    critic_loss     | 2.46     |
|    ent_coef        | 0.884    |
|    ent_coef_loss   | -2.68    |
|    learning_rate   | 0.0003   |
|    n_updates       | 411      |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 128      |
|    ep_rew_mean     | -0.0556  |
| time/              |          |
|    episodes        | 8        |
|    fps             | 2        |
|    time_elapsed    | 367      |
|    total_timesteps | 1024     |
| train/             |          |
|    actor_loss      | -35.7    

KeyboardInterrupt: 

In [None]:
# Directory prefix used for saving and re-loading the trained model.
modellog_dir = "./modelsave/"

In [None]:
# Persist the trained agent under modellog_dir.
model.save(modellog_dir + "sac-vit1-softmax-maxSharpe-patch3x3")

In [None]:
# Load the agent saved above and attach the evaluation environment to it
model_loaded = SAC.load(modellog_dir + "sac-vit1-softmax-maxSharpe-patch3x3", env=env_test)

In [None]:
env_test.observation_space

In [None]:
# Start a fresh evaluation episode and keep its first observation.
obs_test = env_test.reset()

In [None]:
obs_test.shape

In [None]:
# Roll the loaded policy through one test episode.
# The observation returned by each step is fed back into the next predict() call;
# previously the policy was queried with the initial observation on every iteration.
# NOTE(review): predict() samples actions by default - consider deterministic=True for a
# reproducible backtest.
for i in range(steps):
    action, _states = model_loaded.predict(obs_test)
    obs_test, rewards, dones, info = env_test.step(action)
    if dones:
        # the episode is over; stepping further would run past the episode's data
        break
#     env_test.render('notebook')

In [None]:
# One row per environment step of the test episode (the info dicts collected by the env).
df=pd.DataFrame(env_test.infos)
# 'date' holds POSIX timestamps in seconds; multiplying by 1e9 gives the nanoseconds
# that to_datetime expects by default.
df.index = pd.to_datetime(df['date']*1e9)

In [None]:
df

In [None]:
# Performance metrics of the test episode, computed with the project's util helpers.
# Variants are computed on log returns and on simple returns; only the log-return
# versions are printed below.
s=sharpe(df.log_return)
mdd=MDD(df.portfolio_value)
mdd1=MDD1(df.portfolio_value)
s1=sharpe(df.rate_of_return)
sor_log = sortino(df.log_return)
sor_simpe = sortino(df.rate_of_return)
calmar_log = calmar(df.log_return.values)

# print('回测期日平均简单收益率(average simple return):                 \t{: 2.6f}'.format( df.rate_of_return.mean()))
print('回测期日平均对数收益率(average log return):                 \t{: 2.6f}'.format(df.log_return.mean()))

# print('简单收益率的SR (Sharpe ratio):                 \t{: 2.6f}'.format( s1))
print('对数收益率的SR (Sharpe ratio):                 \t{: 2.6f}'.format( s))

# print('简单收益率的sortino比率 (sortino ratio):                 \t{: 2.6f}'.format(sor_simpe))
print('对数收益率的sortino比率 (sortino ratio):                 \t{: 2.6f}'.format(sor_log))

print('最大回撤MDD (max drawdown):                \t{: 2.6%}'.format( mdd))
print('最大回撤MDD1 (max drawdown):                \t{: 2.6%}'.format( mdd1))

print('calmar ratio (log):                \t', calmar_log)

other_metrics(df.log_return.values)

In [None]:
# to_csv does not create missing parent directories, so make sure the folder exists first.
# NOTE(review): the file name says "PPO" although the model in this notebook is SAC -
# kept unchanged so existing consumers of the file keep working; confirm and rename if wanted.
os.makedirs('./resultsave/', exist_ok=True)
df.to_csv('./resultsave/df_PPO_vit1_softmax_maxSharpe-patch3x3.csv')

In [None]:
df['portfolio_value'].plot()

In [None]:
end = time.time()

In [None]:
runTime = (end - start) // 60   # elapsed wall-clock time in whole minutes (floor division)

In [None]:
print("程序运行时间为：", runTime, "分钟")