# TD method on asset allocation

$Q^*_t(W_t,x_t)=-\frac{K^{T-t-1}}{c}[p\cdot e^{-c(a-r)(1+r)^{T-t-1}x_t}+(1-p)\cdot e^{-c(b-r)(1+r)^{T-t-1}x_t}]\cdot e^{-c(1+r)^{T-t}W_t}$

$K = p\cdot e^{-\frac{a-r}{a-b}\ln \frac{(a-r)p}{(r-b)(1-p)}}+(1-p)\cdot e^{-\frac{b-r}{a-b}\ln \frac{(a-r)p}{(r-b)(1-p)}}$

$x_t^*=\frac{1}{c(a-b)(1+r)^{T-t-1}}\cdot \ln \frac{(a-r)p}{(r-b)(1-p)}$

**Critic:**

Activation function: $-e^{-S}$

Feature functions:

$\phi_1((t,W_t,x_t)) = 1$

$\phi_2((t,W_t,x_t)) = T-t$

$\phi_3((t,W_t,x_t)) = (1+r)^{T-t}W_t$

$\phi_4((t,W_t,x_t)) = (1+r)^{T-t}x_t$


**Actor:**

Feature functions:

$\phi_1((t,W_t,x_t)) = 1$

$T = 10,\ a = 0.18,\ b = 0.02,\ r = 0.10,\ p = \frac{2}{3},\ c = 1,\ W_0 = 1$

$Q^*_t(W_t,x_t)=-K^{T-t-1}[\frac{2}{3}\cdot e^{-0.08\cdot 1.1^{T-t-1}x_t}+\frac{1}{3}\cdot e^{0.08\cdot 1.1^{T-t-1}x_t}]\cdot e^{-1.1^{T-t}W_t}$

$K = \frac{2}{3}\cdot 2^{-\frac{1}{2}}+\frac{1}{3}\cdot 2^{\frac{1}{2}} \approx 0.942809$

$x_t^*=\frac{\ln 2}{0.16\cdot 1.1^{T-t-1}} \approx \frac{4.33217}{1.1^{T-t-1}}$

In [1]:
import torch
import numpy as np
import torch.nn as nn

seed = 1
torch.manual_seed(seed)
np.random.seed(seed)
torch.set_default_dtype(torch.float)

In [2]:
class Actor(nn.Module):
    def __init__(self, action_state_dim, action_dim):
        # 1 action state dim, 1 action dim
        super(Actor, self).__init__()
        
        # layer
        self.layer_1 = nn.Linear(action_state_dim, action_dim, bias=False)
        nn.init.constant_(self.layer_1.weight, 3)

    def forward(self, s):
        a = self.layer_1(s)
        return a

In [3]:
class Critic(nn.Module):

    def __init__(self, state_dim, action_dim):
        # 2 state dim, 1 action dim
        super(Critic, self).__init__()
        n_layer = 2
        # layer
        dim = state_dim + action_dim
        self.layer_1 = nn.Linear(dim, n_layer, bias=True)
        nn.init.normal_(self.layer_1.weight, 0., 0.1)
        
        self.output = nn.Linear(n_layer, 1, bias=False)
        nn.init.normal_(self.layer_1.weight, 0.5, 0.1)

    def forward(self, s, a):
        
        s_a = torch.cat([s,a], dim=1)
        s_a = self.layer_1(s_a)
        s_a = -torch.exp(-s_a)
        q_val = self.output(s_a)
        return q_val

In [13]:
aaa = torch.FloatTensor([1,2,3,4,5])
aaa[-3:-1]

tensor([3., 4.])

In [14]:
class DDPG(object):
    def __init__(self, state_dim, action_state_dim, action_dim, replacement,memory_capacity=1000,gamma=1.0,lr_a=0.001, lr_c=0.002,batch_size=32) :
        super(DDPG, self).__init__()
        self.state_dim = state_dim
        self.action_state_dim = action_state_dim
        self.action_dim = action_dim
        self.memory_capacity = memory_capacity
        self.replacement = replacement
        self.t_replace_counter = 0
        self.gamma = gamma
        self.lr_a = lr_a
        self.lr_c = lr_c
        self.batch_size = batch_size

        # 记忆库
        self.memory = np.zeros((memory_capacity, (state_dim + action_state_dim) * 2 + action_dim + 1)) # (s,a,r,s')
        self.pointer = 0
        # 定义 Actor 网络
        self.actor = Actor(action_state_dim, action_dim)
        self.actor_target = Actor(action_state_dim, action_dim)
        # 定义 Critic 网络
        self.critic = Critic(state_dim,action_dim)
        self.critic_target = Critic(state_dim,action_dim)
        # 定义优化器
        self.aopt = torch.optim.Adam(self.actor.parameters(), lr=lr_a)
        self.copt = torch.optim.Adam(self.critic.parameters(), lr=lr_c)
        # 选取损失函数
        self.mse_loss = nn.MSELoss()

    def sample(self):
        indices = np.random.choice(self.memory_capacity, size=self.batch_size)
        return self.memory[indices, :] 

    def choose_action(self, s):
        s = torch.FloatTensor(s)
        action = self.actor(s)
        return action.detach().numpy()

    def learn(self):

        # soft replacement and hard replacement
        # 用于更新target网络的参数
        if self.replacement['name'] == 'soft':
            # soft的意思是每次learn的时候更新部分参数
            tau = self.replacement['tau']
            a_layers = self.actor_target.named_children()
            c_layers = self.critic_target.named_children()
            for al in a_layers:
                al[1].weight.data.mul_((1-tau))
                al[1].weight.data.add_(tau * self.actor.state_dict()[al[0]+'.weight'])
                if al[1].bias is not None:
                    al[1].bias.data.mul_((1-tau))
                    al[1].bias.data.add_(tau * self.actor.state_dict()[al[0]+'.bias'])
            for cl in c_layers:
                cl[1].weight.data.mul_((1-tau))
                cl[1].weight.data.add_(tau * self.critic.state_dict()[cl[0]+'.weight'])
                if cl[1].bias is not None:
                    cl[1].bias.data.mul_((1-tau))
                    cl[1].bias.data.add_(tau * self.critic.state_dict()[cl[0]+'.bias'])
            
        else:
            # hard的意思是每隔一定的步数才更新全部参数
            if self.t_replace_counter % self.replacement['rep_iter'] == 0:
                self.t_replace_counter = 0
                a_layers = self.actor_target.named_children()
                c_layers = self.critic_target.named_children()
                for al in a_layers:
                    al[1].weight.data = self.actor.state_dict()[al[0]+'.weight']
                    if al[1].bias is not None:
                        al[1].bias.data = self.actor.state_dict()[al[0]+'.bias']
                for cl in c_layers:
                    cl[1].weight.data = self.critic.state_dict()[cl[0]+'.weight']
                    if cl[1].bias is not None:
                        cl[1].bias.data = self.critic.state_dict()[cl[0]+'.bias']
            
            self.t_replace_counter += 1

        # 从记忆库中采样bacth data
        bm = self.sample()
        bs = torch.FloatTensor(bm[:, : self.state_dim])
        bas = torch.FloatTensor(bm[:, self.state_dim: self.state_dim + self.action_state_dim])
        ba = torch.FloatTensor(bm[:, self.state_dim + self.action_state_dim: self.state_dim + self.action_state_dim + self.action_dim])
        br = torch.FloatTensor(bm[:, -self.state_dim - self.action_state_dim - 1: -self.state_dim - self.action_state_dim])
        bs_ = torch.FloatTensor(bm[:,-self.state_dim - self.action_state_dim: -self.action_state_dim])
        bas_ = torch.FloatTensor(bm[:,-self.action_state_dim:])
        
        # 训练Actor
        a = self.actor(bas)
        q = self.critic(bs, a)
        a_loss = -torch.mean(q)
        self.aopt.zero_grad()
        a_loss.backward(retain_graph=True)
        self.aopt.step()
        
        # 训练critic
        a_ = self.actor_target(bas_)
        q_ = self.critic_target(bs_, a_)
        q_target = br + self.gamma * q_
        q_eval = self.critic(bs, ba)
        td_error = self.mse_loss(q_target,q_eval)
        self.copt.zero_grad()
        td_error.backward()
        self.copt.step()

    def store_transition(self, s, sa, a, r, s_, sa_):
        transition = np.hstack((s, sa, a, [r], s_, sa_))
        index = self.pointer % self.memory_capacity
        self.memory[index, :] = transition
        self.pointer += 1

In [15]:
class Env:
    def __init__(self, T, a, b, r, p, c, W_0):
        self.t = 0
        self.W = W_0
        self.W_0 = W_0
        self.T = T
        self.a = a
        self.b = b
        self.r = r
        self.p = p
        self.c = c
        self.done = False
        
    def utility(self, W):
        return -np.exp(-self.c * W)/self.c
    
    def reset(self):
        self.t = 0
        self.W = self.W_0
        self.done = False
        return [self.t, self.W]
    
    def step(self, x):
        W_new = x[0] * (np.random.choice((self.a, self.b), p=(self.p, 1-self.p)) - self.r) + self.W * (1 + self.r)
        if self.t == 0:
            reward = self.utility(self.W)
        else:
            reward = self.utility(W_new) - self.utility(self.W)
        self.t += 1
        self.W = W_new
        if self.t > self.T - 1:
            self.done = True
        return [self.t, self.W], reward, self.done

In [16]:
def feature_select(s, is_actor = False, T = 10, r = 0.1):
    if is_actor:
        return [1]
    else:
        return [T - s[0], np.power(1+r, T-s[0]) * s[1]]

In [17]:
import time
if __name__ == '__main__':

    # hyper parameters
    VAR = 5  # control exploration
    MAX_EPISODES = 5000
    MAX_EP_STEPS = 20
    MEMORY_CAPACITY = 200
    REPLACEMENT = [
        dict(name='soft', tau=0.05),
        dict(name='hard', rep_iter=600)
    ][0]  # you can try different target replacement strategies
    
    T = 10
    a = 0.18
    b = 0.02
    r = 0.1
    p = 2./3.
    c = 1.
    W_0 = 1.
    
    # train
    env = Env(T, a, b, r, p, c, W_0)

    s_dim = 2
    as_dim = 1
    a_dim = 1
    ddpg = DDPG(state_dim=s_dim,
                action_state_dim = as_dim,
                action_dim=a_dim,
                replacement=REPLACEMENT,
                memory_capacity=MEMORY_CAPACITY)

    t1 = time.time()
    for i in range(MAX_EPISODES):
        s = env.reset()
        ep_reward = 0
        for j in range(MAX_EP_STEPS):
            # Add exploration noise
            s_s = feature_select(s, is_actor = False, T = env.T, r = env.r)
            a_s = feature_select(s, is_actor = True, T = env.T, r = env.r)
            a = ddpg.choose_action(a_s)
            a = np.random.normal(a, VAR)  # 在动作选择上添加随机噪声
            x = a * np.power(1 + env.r, env.t - env.T)

            s_, reward, done = env.step(x)
            
            s_s_ = feature_select(s_, is_actor = False, T = env.T, r = env.r)
            a_s_ = feature_select(s_, is_actor = True, T = env.T, r = env.r)
            ddpg.store_transition(s_s, a_s, a, reward, s_s_, a_s_)

            if ddpg.pointer > MEMORY_CAPACITY:
                VAR *= .99999  # decay the action randomness
                ddpg.learn()

            s = s_
            ep_reward += reward
            if done or j == MAX_EP_STEPS - 1:
                if i % 20 == 0:
                    print('Episode:', i, 'T:', j, ' Total Reward:', ep_reward, 'Last investment:', x,'Explore:' , VAR, )
                break

    print('Running time: ', time.time() - t1)

Episode: 0 T: 9  Total Reward: -0.15352174227194915 Last investment: [1.1497867] Explore: 5
Episode: 20 T: 9  Total Reward: -0.4295474040198248 Last investment: [8.98328255] Explore: 4.950224401048741
Episode: 40 T: 9  Total Reward: -0.10772976650732793 Last investment: [4.18822822] Explore: 4.052495411575132
Episode: 60 T: 9  Total Reward: -0.03926736319733007 Last investment: [2.00760447] Explore: 3.317570625153519
Episode: 80 T: 9  Total Reward: -0.14927372455422233 Last investment: [3.91309843] Explore: 2.7159253089946316
Episode: 100 T: 9  Total Reward: -0.15974435533910147 Last investment: [4.3279741] Explore: 2.2233890751598597
Episode: 120 T: 9  Total Reward: -0.11404649690882165 Last investment: [2.8227005] Explore: 1.8201748638552033
Episode: 140 T: 9  Total Reward: -0.0752107561987436 Last investment: [4.32199896] Explore: 1.4900840217414968
Episode: 160 T: 9  Total Reward: -0.2469661200204224 Last investment: [3.00593281] Explore: 1.219855540223493
Episode: 180 T: 9  Total 

Episode: 1460 T: 9  Total Reward: -15.40187240485617 Last investment: [11.11916475] Explore: 2.739399301839526e-06
Episode: 1480 T: 9  Total Reward: -0.14292070865609072 Last investment: [11.29397503] Explore: 2.24260603192552e-06
Episode: 1500 T: 9  Total Reward: -0.14366566741223893 Last investment: [11.39746126] Explore: 1.8359068030175548e-06
Episode: 1520 T: 9  Total Reward: -0.15451990476406582 Last investment: [11.5270976] Explore: 1.5029629553221862e-06
Episode: 1540 T: 9  Total Reward: -0.14543122238531303 Last investment: [11.64726098] Explore: 1.2303988641242583e-06
Episode: 1560 T: 9  Total Reward: -0.1465386371601038 Last investment: [11.81024027] Explore: 1.0072645899071656e-06
Episode: 1580 T: 9  Total Reward: -0.14619752860542798 Last investment: [11.95436717] Explore: 8.245959775027788e-07
Episode: 1600 T: 9  Total Reward: -0.14818921560658643 Last investment: [12.03838802] Explore: 6.750545317754409e-07
Episode: 1620 T: 9  Total Reward: -0.14711311276991504 Last inves

Episode: 2880 T: 9  Total Reward: 0.13659239532072928 Last investment: [12.33822389] Explore: 1.8517747147804963e-12
Episode: 2900 T: 9  Total Reward: -0.14905545015018748 Last investment: [12.3371237] Explore: 1.5159532027131892e-12
Episode: 2920 T: 9  Total Reward: -0.14887947503890728 Last investment: [12.33664946] Explore: 1.2410333149455402e-12
Episode: 2940 T: 9  Total Reward: -27.729513719747786 Last investment: [12.3358215] Explore: 1.0159704706241572e-12
Episode: 2960 T: 9  Total Reward: -0.15916660212390113 Last investment: [12.32449705] Explore: 8.317230365613247e-13
Episode: 2980 T: 9  Total Reward: -0.1591442140122943 Last investment: [12.32085922] Explore: 6.80889090331345e-13
Episode: 3000 T: 9  Total Reward: -0.398703947243795 Last investment: [12.31775024] Explore: 5.574090568045289e-13
Episode: 3020 T: 9  Total Reward: -0.14890761031093788 Last investment: [12.31695088] Explore: 4.563222718938177e-13
Episode: 3040 T: 9  Total Reward: -3.7101831518366364 Last investmen

Episode: 4300 T: 9  Total Reward: 0.07156907398694026 Last investment: [9.29765181] Explore: 1.2517596803057375e-18
Episode: 4320 T: 9  Total Reward: -0.14188797434919634 Last investment: [9.28654237] Explore: 1.0247515970708437e-18
Episode: 4340 T: 9  Total Reward: -0.4542659627198409 Last investment: [9.27049637] Explore: 8.389116954484093e-19
Episode: 4360 T: 9  Total Reward: -0.12555894577423293 Last investment: [9.24999237] Explore: 6.867740775147779e-19
Episode: 4380 T: 9  Total Reward: 0.08365330186349523 Last investment: [9.23835494] Explore: 5.622267946737428e-19
Episode: 4400 T: 9  Total Reward: -0.14154023455169606 Last investment: [9.22311609] Explore: 4.602663073611849e-19
Episode: 4420 T: 9  Total Reward: 0.08623006932264822 Last investment: [9.2119451] Explore: 3.7679647376968794e-19
Episode: 4440 T: 9  Total Reward: -1.3213738719123338 Last investment: [9.17360479] Explore: 3.0846399220323227e-19
Episode: 4460 T: 9  Total Reward: -0.23704312328438704 Last investment: [9