In [1]:
pip install bayesian-optimization

Collecting bayesian-optimization
  Downloading bayesian_optimization-1.4.3-py3-none-any.whl (18 kB)
Collecting colorama>=0.4.6
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: colorama, bayesian-optimization
  Attempting uninstall: colorama
    Found existing installation: colorama 0.4.4
    Uninstalling colorama-0.4.4:
      Successfully uninstalled colorama-0.4.4
Successfully installed bayesian-optimization-1.4.3 colorama-0.4.6
Note: you may need to restart the kernel to use updated packages.


In [6]:
import pickle
import matplotlib.pyplot as plt
import numpy as np
import chainer
import chainer.functions as F
import chainer.links as L
import copy
import time
import pandas as pd
import datetime

class Q_Network(chainer.Chain):
    def __init__(self, input_size, hidden_size, output_size):
        super(Q_Network, self).__init__(
            fc1 = L.Linear(input_size, hidden_size),
            fc2 = L.Linear(hidden_size, hidden_size),
            fc3 = L.Linear(hidden_size, output_size)
        )
    def __call__(self, x):
        h = F.relu(self.fc1(x))
        h = F.relu(self.fc2(h))
        y = self.fc3(h)
        return y
    def reset(self):
        self.zerograds()

class Environment1:
    def __init__(self, data, history_t=90):
        self.data = data
        self.history_t = history_t
        self.reset()
        
    def reset(self):
        self.t = 0
        self.done = False
        self.profits = 10000
        self.count = []
        self.positions = []
        self.position_value = 0
        self.brok_rate = 0.0009
        self.max_trade_percent = 0.8
        self.tbrokerage = 0
        self.history = [0 for _ in range(self.history_t)]
        self.Act0 = 0
        self.Act1 = 0
        self.Act2 = 0
        self.RW_p = 0
        self.RW_n = 0
        self.RW_p_v = 0
        self.RW_n_v = 0
        return [self.position_value] + self.history
    
    def step(self, act, amount):
        if self.t >= len(self.data) - 1:
            self.done = True
            return [self.position_value] + self.history, 0, self.done

        reward = 0
        if act == 1:
            if self.profits != 0:
                max_trade_amount = self.profits * self.max_trade_percent
                stock_price = self.data.iloc[self.t, :]['Close']
                count = max_trade_amount / stock_price
                buyin = stock_price * count
                self.profits -= buyin
                self.positions.append(stock_price)
                self.count.append(count)
                self.Act1 += 1
        elif act == 2:
            if len(self.positions) > 0:
                sell_ratio = self.determine_sell_ratio(amount)
                num_positions_to_sell = int(len(self.positions) * sell_ratio)
                for i in range(num_positions_to_sell):
                    sell_price = self.data.iloc[self.t, :]['Close']
                    buy_price = self.positions[i]
                    count = self.count[i]
                    abs_num = (sell_price - buy_price) * count
                    if abs_num > 0:
                        self.RW_p += 1
                        self.RW_p_v += abs_num
                    else:
                        self.RW_n += 1
                        self.RW_n_v += abs(abs_num)
                    reward += sell_price * count
                    self.profits += sell_price * count
                self.positions = self.positions[num_positions_to_sell:]
                self.count = self.count[num_positions_to_sell:]
                self.Act2 += 1    
        else:
            self.Act0 += 1
        
        self.t += 1
        if self.t >= len(self.data):
            self.done = True
        self.position_value = 0
        for p in self.positions:
            self.position_value += (self.data.iloc[self.t, :]['Close'] - p)
        self.history.pop(0)
        self.history.append(self.data.iloc[self.t, :]['Close'] - self.data.iloc[(self.t-1), :]['Close'])
        
        if reward > 0:
            reward = 1
        elif reward < 0:
            reward = -1
        
        return [self.position_value] + self.history, reward, self.done
    
    def determine_sell_ratio(self, signal_strength):
        thresholds = [0.2, 0.5, 0.8]
        ratios = [0.25, 0.5, 0.75]
        for i, threshold in enumerate(thresholds):
            if signal_strength < threshold:
                return ratios[i]
        return ratios[0]

def train_dqn(env, Q, epoch_num=1):
    Q_ast = copy.deepcopy(Q)
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(Q)

    step_max = len(env.data)-1
    memory_size = 200
    batch_size = 20
    epsilon = 1.0
    epsilon_decrease = 1e-3
    epsilon_min = 0.1
    start_reduce_epsilon = 200
    train_freq = 10
    update_q_freq = 20
    gamma = 0.97
    show_log_freq = 1
    confidence_threshold_buy = 0.1
    confidence_threshold_sell = 0.8
    memory = []
    total_step = 0
    total_rewards = []
    total_losses = []
    start = time.time()
    
    for epoch in range(epoch_num):
        pobs = env.reset()
        step = 0
        done = False
        total_reward = 0
        total_loss = 0
        RandAct = 0
        NRandAct = 0
        while not done and step < step_max:
            pact = np.random.randint(3)
            amount = 0.25
            if np.random.rand() > epsilon:
                q_values = Q(np.array(pobs, dtype=np.float32).reshape(1, -1))
                softmax_q_values = F.softmax(q_values).data
                amount = softmax_q_values.ravel()[pact]
                pact = np.random.choice(3, p=softmax_q_values.ravel())
                if pact == 2:
                    if amount < confidence_threshold_sell:
                        pact = 0
                if pact == 1:
                    if amount < confidence_threshold_buy:
                        pact = 0
                NRandAct+=1
            else:
                RandAct+=1
                if pact==0: amount = 0.5
            obs, reward, done = env.step(pact,amount)
            memory.append((pobs, pact, reward, obs, done))
            if len(memory) > memory_size:
                memory.pop(0)
            if len(memory) == memory_size:
                if total_step % train_freq == 0:
                    shuffled_memory = np.random.permutation(memory)
                    memory_idx = range(len(shuffled_memory))
                    for i in memory_idx[::batch_size]:
                        batch = np.array(shuffled_memory[i:i+batch_size])
                        b_pobs = np.array(batch[:, 0].tolist(), dtype=np.float32).reshape(batch_size, -1)
                        b_pact = np.array(batch[:, 1].tolist(), dtype=np.int32)
                        b_reward = np.array(batch[:, 2].tolist(), dtype=np.int32)
                        b_obs = np.array(batch[:, 3].tolist(), dtype=np.float32).reshape(batch_size, -1)
                        b_done = np.array(batch[:, 4].tolist(), dtype=np.bool)
                        q = Q(b_pobs)
                        maxq = np.max(Q_ast(b_obs).data, axis=1)
                        target = copy.deepcopy(q.data)
                        for j in range(batch_size):
                            target[j, b_pact[j]] = b_reward[j]+gamma*maxq[j]*(not b_done[j])
                        Q.reset()
                        loss = F.mean_squared_error(q, target)
                        total_loss += loss.data
                        loss.backward()
                        optimizer.update()
                if total_step % update_q_freq == 0:
                    Q_ast = copy.deepcopy(Q)
            if epsilon > epsilon_min and total_step > start_reduce_epsilon:
                epsilon -= epsilon_decrease
            total_reward += reward
            pobs = obs
            step += 1
            total_step += 1
        total_rewards.append(total_reward)
        total_losses.append(total_loss)
        if (epoch+1) % show_log_freq == 0:
            log_reward = sum(total_rewards[((epoch+1)-show_log_freq):])/show_log_freq
            log_loss = sum(total_losses[((epoch+1)-show_log_freq):])/show_log_freq
            log_profits = env.profits
            if isinstance(env.count, list) and len(env.count) != 0:
                for i in range(len(env.positions)):
                    log_profits += env.count[i] * env.data.iloc[env.t, :]['Close']
            else:
                log_profits = env.profits
            elapsed_time = time.time()-start
            print(f"Training Epoch {epoch+1} - Profit: {log_profits}")
            start = time.time()       
    return Q, total_losses, total_rewards

def test_dqn(test_env, Q, confidence_threshold_buy=0.1, confidence_threshold_sell=0.8):
    pobs = test_env.reset()
    test_env.profits = 10000
    test_acts = []
    test_rewards = []
    estimated_total_assets = [test_env.profits]

    while not test_env.done:
        q_values = Q(np.array(pobs, dtype=np.float32).reshape(1, -1))
        softmax_q_values = F.softmax(q_values).data
        pact = np.random.choice(3, p=softmax_q_values.ravel())
        amount = softmax_q_values.ravel()[pact]
        if pact == 2 and amount < confidence_threshold_sell:
            pact = 0
        elif pact == 1 and amount < confidence_threshold_buy:
            pact = 0
        obs, reward, done = test_env.step(pact, amount)
        test_acts.append(pact)
        test_rewards.append(reward)
        pobs = obs
        stock_value = sum(c * test_env.data.iloc[test_env.t]['Close'] for c in test_env.count)
        total_assets = test_env.profits + stock_value
        estimated_total_assets.append(total_assets)

    final_profits = total_assets
    return test_acts, test_rewards, final_profits

In [7]:
from bayes_opt import BayesianOptimization
import numpy as np
import pandas as pd
import datetime
import chainer
import chainer.functions as F
import chainer.links as L
import copy
import time
import matplotlib.pyplot as plt

# 你的 Environment1, Q_Network, train_dqn 和 test_dqn 定義
# 省略前面的程式碼定義以便於展示貝葉斯優化的重點部分

def optimize_hyperparameters(data, start_date_2010, end_date_2019, train_days_2010_2019, test_days_2010_2019):
    # 資料分割
    test_start_date_2010_2019 = start_date_2010 + datetime.timedelta(days=train_days_2010_2019)
    train_end_date_2010_2019 = test_start_date_2010_2019 - datetime.timedelta(days=1)

    train = data[start_date_2010.date():train_end_date_2010_2019.date()]
    test = data[test_start_date_2010_2019.date():end_date_2019.date()]
    train_env = Environment1(train)
    test_env = Environment1(test)
    
    # 預訓練 Q 網絡
    Q = Q_Network(input_size=train_env.history_t + 1, hidden_size=100, output_size=3)
    Q, total_losses, total_rewards = train_dqn(train_env, Q, epoch_num=29)
    chainer.serializers.save_npz(f'Q_network_epoch_{29}.npz', Q)
    
    def objective(confidence_threshold_buy, confidence_threshold_sell):
        avg_profits = []
        for _ in range(10):
            _, _, test_profits = test_dqn(test_env, Q, confidence_threshold_buy=confidence_threshold_buy, confidence_threshold_sell=confidence_threshold_sell)
            avg_profits.append(test_profits)
        return np.mean(avg_profits)
    
    pbounds = {
        'confidence_threshold_buy': (0.1, 0.8),
        'confidence_threshold_sell': (0.1, 0.8)
    }
    
    optimizer = BayesianOptimization(
        f=objective,
        pbounds=pbounds,
        random_state=1,
    )
    
    optimizer.maximize(
        init_points=10,
        n_iter=30,
    )
    
    print(optimizer.max)
    
data = pd.read_csv('C:/Users/User/Documents/RL_/Data/Stocks/SPY_Both.txt')
data['Date'] = pd.to_datetime(data['Date'])
data = data.set_index('Date')

start_date_2010 = datetime.datetime(2010, 1, 1)
end_date_2019 = datetime.datetime(2019, 12, 31)
total_days_2010_2019 = (end_date_2019 - start_date_2010).days
train_days_2010_2019 = int(total_days_2010_2019 * 0.9)
test_days_2010_2019 = total_days_2010_2019 - train_days_2010_2019

optimize_hyperparameters(data, start_date_2010, end_date_2019, train_days_2010_2019, test_days_2010_2019)


  shuffled_memory = np.random.permutation(memory)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  b_done = np.array(batch[:, 4].tolist(), dtype=np.bool)


Training Epoch 1 - Profit: 20003.57970533339
Training Epoch 2 - Profit: 18577.855961387984
Training Epoch 3 - Profit: 19609.998555795682
Training Epoch 4 - Profit: 16317.31628373102
Training Epoch 5 - Profit: 20102.051661886133
Training Epoch 6 - Profit: 18226.62702733439
Training Epoch 7 - Profit: 20154.616679237843
Training Epoch 8 - Profit: 17949.16792134716
Training Epoch 9 - Profit: 17271.266355331943
Training Epoch 10 - Profit: 19848.32463457443
Training Epoch 11 - Profit: 24513.94427490572
Training Epoch 12 - Profit: 21588.96760580288
Training Epoch 13 - Profit: 18960.494195512147
Training Epoch 14 - Profit: 20044.86143444289
Training Epoch 15 - Profit: 18093.89619653825
Training Epoch 16 - Profit: 20282.034821410994
Training Epoch 17 - Profit: 22130.658445957986
Training Epoch 18 - Profit: 19754.342378382422
Training Epoch 19 - Profit: 21955.46350700025
Training Epoch 20 - Profit: 25667.916431143363
Training Epoch 21 - Profit: 22743.57464098319
Training Epoch 22 - Profit: 20923

In [17]:
import pickle
import matplotlib.pyplot as plt
import numpy as np
import chainer
import chainer.functions as F
import chainer.links as L
import copy
import time
import pandas as pd
import datetime
from bayes_opt import BayesianOptimization

class Q_Network(chainer.Chain):
    def __init__(self, input_size, hidden_size, output_size):
        super(Q_Network, self).__init__(
            fc1 = L.Linear(input_size, hidden_size),
            fc2 = L.Linear(hidden_size, hidden_size),
            fc3 = L.Linear(hidden_size, output_size)
        )
    def __call__(self, x):
        h = F.relu(self.fc1(x))
        h = F.relu(self.fc2(h))
        y = self.fc3(h)
        return y
    def reset(self):
        self.zerograds()

class Environment1:
    def __init__(self, data, history_t=90):
        self.data = data
        self.history_t = history_t
        self.reset()
        
    def reset(self):
        self.t = 0
        self.done = False
        self.profits = 10000
        self.count = []
        self.positions = []
        self.position_value = 0
        self.brok_rate = 0.0009
        self.max_trade_percent = 0.8
        self.tbrokerage = 0
        self.history = [0 for _ in range(self.history_t)]
        self.Act0 = 0
        self.Act1 = 0
        self.Act2 = 0
        self.RW_p = 0
        self.RW_n = 0
        self.RW_p_v = 0
        self.RW_n_v = 0
        return [self.position_value] + self.history
    
    def step(self, act, amount):
        if self.t >= len(self.data) - 1:
            self.done = True
            return [self.position_value] + self.history, 0, self.done

        reward = 0
        if act == 1:
            if self.profits != 0:
                max_trade_amount = self.profits * self.max_trade_percent
                stock_price = self.data.iloc[self.t, :]['Close']
                count = max_trade_amount / stock_price
                buyin = stock_price * count
                self.profits -= buyin
                self.positions.append(stock_price)
                self.count.append(count)
                self.Act1 += 1
        elif act == 2:
            if len(self.positions) > 0:
                sell_ratio = self.determine_sell_ratio(amount)
                num_positions_to_sell = int(len(self.positions) * sell_ratio)
                for i in range(num_positions_to_sell):
                    sell_price = self.data.iloc[self.t, :]['Close']
                    buy_price = self.positions[i]
                    count = self.count[i]
                    abs_num = (sell_price - buy_price) * count
                    if abs_num > 0:
                        self.RW_p += 1
                        self.RW_p_v += abs_num
                    else:
                        self.RW_n += 1
                        self.RW_n_v += abs(abs_num)
                    reward += sell_price * count
                    self.profits += sell_price * count
                self.positions = self.positions[num_positions_to_sell:]
                self.count = self.count[num_positions_to_sell:]
                self.Act2 += 1    
        else:
            self.Act0 += 1
        
        self.t += 1
        if self.t >= len(self.data):
            self.done = True
        self.position_value = 0
        for p in self.positions:
            self.position_value += (self.data.iloc[self.t, :]['Close'] - p)
        self.history.pop(0)
        self.history.append(self.data.iloc[self.t, :]['Close'] - self.data.iloc[(self.t-1), :]['Close'])
        
        if reward > 0:
            reward = 1
        elif reward < 0:
            reward = -1
        
        return [self.position_value] + self.history, reward, self.done
    
    def determine_sell_ratio(self, signal_strength):
        thresholds = [0.2, 0.5, 0.8]
        ratios = [0.25, 0.5, 0.75]
        for i, threshold in enumerate(thresholds):
            if signal_strength < threshold:
                return ratios[i]
        return ratios[0]

def train_dqn(env, Q, epoch_num=1):
    Q_ast = copy.deepcopy(Q)
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(Q)

    step_max = len(env.data)-1
    memory_size = 200
    batch_size = 20
    epsilon = 1.0
    epsilon_decrease = 1e-3
    epsilon_min = 0.1
    start_reduce_epsilon = 200
    train_freq = 10
    update_q_freq = 20
    gamma = 0.97
    show_log_freq = 1
    confidence_threshold_buy = 0.11803540036662069
    confidence_threshold_sell = 0.5689542195973996
    memory = []
    total_step = 0
    total_rewards = []
    total_losses = []
    start = time.time()
    
    for epoch in range(epoch_num):
        pobs = env.reset()
        step = 0
        done = False
        total_reward = 0
        total_loss = 0
        RandAct = 0
        NRandAct = 0
        while not done and step < step_max:
            pact = np.random.randint(3)
            amount = 0.25
            if np.random.rand() > epsilon:
                q_values = Q(np.array(pobs, dtype=np.float32).reshape(1, -1))
                softmax_q_values = F.softmax(q_values).data
                amount = softmax_q_values.ravel()[pact]
                pact = np.random.choice(3, p=softmax_q_values.ravel())
                if pact == 2:
                    if amount < confidence_threshold_sell:
                        pact = 0
                if pact == 1:
                    if amount < confidence_threshold_buy:
                        pact = 0
                NRandAct+=1
            else:
                RandAct+=1
                if pact==0: amount = 0.5
            obs, reward, done = env.step(pact,amount)
            memory.append((pobs, pact, reward, obs, done))
            if len(memory) > memory_size:
                memory.pop(0)
            if len(memory) == memory_size:
                if total_step % train_freq == 0:
                    shuffled_memory = np.random.permutation(memory)
                    memory_idx = range(len(shuffled_memory))
                    for i in memory_idx[::batch_size]:
                        batch = np.array(shuffled_memory[i:i+batch_size])
                        b_pobs = np.array(batch[:, 0].tolist(), dtype=np.float32).reshape(batch_size, -1)
                        b_pact = np.array(batch[:, 1].tolist(), dtype=np.int32)
                        b_reward = np.array(batch[:, 2].tolist(), dtype=np.int32)
                        b_obs = np.array(batch[:, 3].tolist(), dtype=np.float32).reshape(batch_size, -1)
                        b_done = np.array(batch[:, 4].tolist(), dtype=np.bool)
                        q = Q(b_pobs)
                        maxq = np.max(Q_ast(b_obs).data, axis=1)
                        target = copy.deepcopy(q.data)
                        for j in range(batch_size):
                            target[j, b_pact[j]] = b_reward[j]+gamma*maxq[j]*(not b_done[j])
                        Q.reset()
                        loss = F.mean_squared_error(q, target)
                        total_loss += loss.data
                        loss.backward()
                        optimizer.update()
                if total_step % update_q_freq == 0:
                    Q_ast = copy.deepcopy(Q)
            if epsilon > epsilon_min and total_step > start_reduce_epsilon:
                epsilon -= epsilon_decrease
            total_reward += reward
            pobs = obs
            step += 1
            total_step += 1
        total_rewards.append(total_reward)
        total_losses.append(total_loss)
        if (epoch+1) % show_log_freq == 0:
            log_reward = sum(total_rewards[((epoch+1)-show_log_freq):])/show_log_freq
            log_loss = sum(total_losses[((epoch+1)-show_log_freq):])/show_log_freq
            log_profits = env.profits
            if isinstance(env.count, list) and len(env.count) != 0:
                for i in range(len(env.positions)):
                    log_profits += env.count[i] * env.data.iloc[env.t, :]['Close']
            else:
                log_profits = env.profits
            elapsed_time = time.time()-start
            #print(f"Training Epoch {epoch+1} - Profit: {log_profits}")
            start = time.time()       
    return Q, total_losses, total_rewards

def test_dqn(test_env, Q, confidence_threshold_buy=0.1, confidence_threshold_sell=0.8):
    pobs = test_env.reset()
    test_env.profits = 10000
    test_acts = []
    test_rewards = []
    estimated_total_assets = [test_env.profits]

    while not test_env.done:
        q_values = Q(np.array(pobs, dtype=np.float32).reshape(1, -1))
        softmax_q_values = F.softmax(q_values).data
        pact = np.random.choice(3, p=softmax_q_values.ravel())
        amount = softmax_q_values.ravel()[pact]
        if pact == 2 and amount < confidence_threshold_sell:
            pact = 0
        elif pact == 1 and amount < confidence_threshold_buy:
            pact = 0
        obs, reward, done = test_env.step(pact, amount)
        test_acts.append(pact)
        test_rewards.append(reward)
        pobs = obs
        stock_value = sum(c * test_env.data.iloc[test_env.t]['Close'] for c in test_env.count)
        total_assets = test_env.profits + stock_value
        estimated_total_assets.append(total_assets)

    final_profits = total_assets
    return test_acts, test_rewards, final_profits

def optimize_hyperparameters(data, start_date_2010, end_date_2019, train_days_2010_2019, test_days_2010_2019):
    # 資料分割
    test_start_date_2010_2019 = start_date_2010 + datetime.timedelta(days=train_days_2010_2019)
    train_end_date_2010_2019 = test_start_date_2010_2019 - datetime.timedelta(days=1)

    train = data[start_date_2010.date():train_end_date_2010_2019.date()]
    test = data[test_start_date_2010_2019.date():end_date_2019.date()]
    train_env = Environment1(train)
    test_env = Environment1(test)
    
    def objective(epoch_num):
        Q = Q_Network(input_size=train_env.history_t + 1, hidden_size=100, output_size=3)
        chainer.serializers.load_npz(f'Q_network_epoch_{int(epoch_num)}.npz', Q)
        test_profits_list = []
        for _ in range(100):
            _, _, test_profits = test_dqn(test_env, Q)
            test_profits_list.append(test_profits)
        avg_test_profits = np.mean(test_profits_list)
        return avg_test_profits
    
    pbounds = {
        'epoch_num': (29, 299)
    }
    
    optimizer = BayesianOptimization(
        f=objective,
        pbounds=pbounds,
        random_state=1,
    )
    
    optimizer.maximize(
        init_points=10,
        n_iter=40,
    )
    
    print(optimizer.max)

data = pd.read_csv('C:/Users/User/Documents/RL_/Data/Stocks/SPY_Both.txt')
data['Date'] = pd.to_datetime(data['Date'])
data = data.set_index('Date')

start_date_2010 = datetime.datetime(2010, 1, 1)
end_date_2019 = datetime.datetime(2019, 12, 31)
total_days_2010_2019 = (end_date_2019 - start_date_2010).days
train_days_2010_2019 = int(total_days_2010_2019 * 0.9)
test_days_2010_2019 = total_days_2010_2019 - train_days_2010_2019

optimize_hyperparameters(data, start_date_2010, end_date_2019, train_days_2010_2019, test_days_2010_2019)


|   iter    |  target   | epoch_num |
-------------------------------------
| [0m1        [0m | [0m1.273e+04[0m | [0m141.6    [0m |
| [0m2        [0m | [0m1.27e+04 [0m | [0m223.5    [0m |
| [95m3        [0m | [95m1.276e+04[0m | [95m29.03    [0m |
| [95m4        [0m | [95m1.28e+04 [0m | [95m110.6    [0m |
| [0m5        [0m | [0m1.277e+04[0m | [0m68.62    [0m |
| [0m6        [0m | [0m1.267e+04[0m | [0m53.93    [0m |
| [0m7        [0m | [0m1.272e+04[0m | [0m79.29    [0m |
| [0m8        [0m | [0m1.272e+04[0m | [0m122.3    [0m |
| [0m9        [0m | [0m1.271e+04[0m | [0m136.1    [0m |
| [95m10       [0m | [95m1.292e+04[0m | [95m174.5    [0m |
| [0m11       [0m | [0m1.275e+04[0m | [0m181.1    [0m |
| [0m12       [0m | [0m1.272e+04[0m | [0m172.5    [0m |
| [0m13       [0m | [0m1.255e+04[0m | [0m294.4    [0m |
| [0m14       [0m | [0m1.272e+04[0m | [0m29.59    [0m |
| [0m15       [0m | [0m1.271e+04[0m | [0m1

In [3]:
import pickle
import matplotlib.pyplot as plt
import numpy as np
import chainer
import chainer.functions as F
import chainer.links as L
import copy
import time
import pandas as pd
import datetime
from bayes_opt import BayesianOptimization

class Q_Network(chainer.Chain):
    def __init__(self, input_size, hidden_size, output_size):
        super(Q_Network, self).__init__(
            fc1 = L.Linear(input_size, hidden_size),
            fc2 = L.Linear(hidden_size, hidden_size),
            fc3 = L.Linear(hidden_size, output_size)
        )
    def __call__(self, x):
        h = F.relu(self.fc1(x))
        h = F.relu(self.fc2(h))
        y = self.fc3(h)
        return y
    def reset(self):
        self.zerograds()

class Environment1:
    def __init__(self, data, history_t=90):
        self.data = data
        self.history_t = history_t
        self.reset()
        
    def reset(self):
        self.t = 0
        self.done = False
        self.profits = 10000
        self.count = []
        self.positions = []
        self.position_value = 0
        self.brok_rate = 0.0009
        self.max_trade_percent = 0.8
        self.tbrokerage = 0
        self.history = [0 for _ in range(self.history_t)]
        self.Act0 = 0
        self.Act1 = 0
        self.Act2 = 0
        self.RW_p = 0
        self.RW_n = 0
        self.RW_p_v = 0
        self.RW_n_v = 0
        return [self.position_value] + self.history
    
    def step(self, act, amount):
        if self.t >= len(self.data) - 1:
            self.done = True
            return [self.position_value] + self.history, 0, self.done

        reward = 0
        if act == 1:
            if self.profits != 0:
                max_trade_amount = self.profits * self.max_trade_percent
                stock_price = self.data.iloc[self.t, :]['Close']
                count = max_trade_amount / stock_price
                buyin = stock_price * count
                self.profits -= buyin
                self.positions.append(stock_price)
                self.count.append(count)
                self.Act1 += 1
        elif act == 2:
            if len(self.positions) > 0:
                sell_ratio = self.determine_sell_ratio(amount)
                num_positions_to_sell = int(len(self.positions) * sell_ratio)
                for i in range(num_positions_to_sell):
                    sell_price = self.data.iloc[self.t, :]['Close']
                    buy_price = self.positions[i]
                    count = self.count[i]
                    abs_num = (sell_price - buy_price) * count
                    if abs_num > 0:
                        self.RW_p += 1
                        self.RW_p_v += abs_num
                    else:
                        self.RW_n += 1
                        self.RW_n_v += abs(abs_num)
                    reward += sell_price * count
                    self.profits += sell_price * count
                self.positions = self.positions[num_positions_to_sell:]
                self.count = self.count[num_positions_to_sell:]
                self.Act2 += 1    
        else:
            self.Act0 += 1
        
        self.t += 1
        if self.t >= len(self.data):
            self.done = True
        self.position_value = 0
        for p in self.positions:
            self.position_value += (self.data.iloc[self.t, :]['Close'] - p)
        self.history.pop(0)
        self.history.append(self.data.iloc[self.t, :]['Close'] - self.data.iloc[(self.t-1), :]['Close'])
        
        if reward > 0:
            reward = 1
        elif reward < 0:
            reward = -1
        
        return [self.position_value] + self.history, reward, self.done
    
    def determine_sell_ratio(self, signal_strength):
        thresholds = [0.2, 0.5, 0.8]
        ratios = [0.25, 0.5, 0.75]
        for i, threshold in enumerate(thresholds):
            if signal_strength < threshold:
                return ratios[i]
        return ratios[0]

def train_dqn(env, Q, epoch_num=1):
    Q_ast = copy.deepcopy(Q)
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(Q)

    step_max = len(env.data)-1
    memory_size = 200
    batch_size = 20
    epsilon = 1.0
    epsilon_decrease = 1e-3
    epsilon_min = 0.1
    start_reduce_epsilon = 200
    train_freq = 10
    update_q_freq = 20
    gamma = 0.97
    show_log_freq = 1
    confidence_threshold_buy = 0.11803540036662069
    confidence_threshold_sell = 0.5689542195973996
    memory = []
    total_step = 0
    total_rewards = []
    total_losses = []
    start = time.time()
    
    for epoch in range(epoch_num):
        pobs = env.reset()
        step = 0
        done = False
        total_reward = 0
        total_loss = 0
        RandAct = 0
        NRandAct = 0
        while not done and step < step_max:
            pact = np.random.randint(3)
            amount = 0.25
            if np.random.rand() > epsilon:
                q_values = Q(np.array(pobs, dtype=np.float32).reshape(1, -1))
                softmax_q_values = F.softmax(q_values).data
                amount = softmax_q_values.ravel()[pact]
                pact = np.random.choice(3, p=softmax_q_values.ravel())
                if pact == 2:
                    if amount < confidence_threshold_sell:
                        pact = 0
                if pact == 1:
                    if amount < confidence_threshold_buy:
                        pact = 0
                NRandAct+=1
            else:
                RandAct+=1
                if pact==0: amount = 0.5
            obs, reward, done = env.step(pact,amount)
            memory.append((pobs, pact, reward, obs, done))
            if len(memory) > memory_size:
                memory.pop(0)
            if len(memory) == memory_size:
                if total_step % train_freq == 0:
                    shuffled_memory = np.random.permutation(memory)
                    memory_idx = range(len(shuffled_memory))
                    for i in memory_idx[::batch_size]:
                        batch = np.array(shuffled_memory[i:i+batch_size])
                        b_pobs = np.array(batch[:, 0].tolist(), dtype=np.float32).reshape(batch_size, -1)
                        b_pact = np.array(batch[:, 1].tolist(), dtype=np.int32)
                        b_reward = np.array(batch[:, 2].tolist(), dtype=np.int32)
                        b_obs = np.array(batch[:, 3].tolist(), dtype=np.float32).reshape(batch_size, -1)
                        b_done = np.array(batch[:, 4].tolist(), dtype=np.bool)
                        q = Q(b_pobs)
                        maxq = np.max(Q_ast(b_obs).data, axis=1)
                        target = copy.deepcopy(q.data)
                        for j in range(batch_size):
                            target[j, b_pact[j]] = b_reward[j]+gamma*maxq[j]*(not b_done[j])
                        Q.reset()
                        loss = F.mean_squared_error(q, target)
                        total_loss += loss.data
                        loss.backward()
                        optimizer.update()
                if total_step % update_q_freq == 0:
                    Q_ast = copy.deepcopy(Q)
            if epsilon > epsilon_min and total_step > start_reduce_epsilon:
                epsilon -= epsilon_decrease
            total_reward += reward
            pobs = obs
            step += 1
            total_step += 1
        total_rewards.append(total_reward)
        total_losses.append(total_loss)
        if (epoch+1) % show_log_freq == 0:
            log_reward = sum(total_rewards[((epoch+1)-show_log_freq):])/show_log_freq
            log_loss = sum(total_losses[((epoch+1)-show_log_freq):])/show_log_freq
            log_profits = env.profits
            if isinstance(env.count, list) and len(env.count) != 0:
                for i in range(len(env.positions)):
                    log_profits += env.count[i] * env.data.iloc[env.t, :]['Close']
            else:
                log_profits = env.profits
            elapsed_time = time.time()-start
            #print(f"Training Epoch {epoch+1} - Profit: {log_profits}")
            start = time.time()       
    return Q, total_losses, total_rewards

def test_dqn(test_env, Q, confidence_threshold_buy=0.1, confidence_threshold_sell=0.8):
    pobs = test_env.reset()
    test_env.profits = 10000
    test_acts = []
    test_rewards = []
    estimated_total_assets = [test_env.profits]

    while not test_env.done:
        q_values = Q(np.array(pobs, dtype=np.float32).reshape(1, -1))
        softmax_q_values = F.softmax(q_values).data
        pact = np.random.choice(3, p=softmax_q_values.ravel())
        amount = softmax_q_values.ravel()[pact]
        if pact == 2 and amount < confidence_threshold_sell:
            pact = 0
        elif pact == 1 and amount < confidence_threshold_buy:
            pact = 0
        obs, reward, done = test_env.step(pact, amount)
        test_acts.append(pact)
        test_rewards.append(reward)
        pobs = obs
        stock_value = sum(c * test_env.data.iloc[test_env.t]['Close'] for c in test_env.count)
        total_assets = test_env.profits + stock_value
        estimated_total_assets.append(total_assets)

    final_profits = total_assets
    return test_acts, test_rewards, final_profits

def optimize_hyperparameters(data, start_date_2010, end_date_2019, train_days_2010_2019, test_days_2010_2019):
    # 資料分割
    test_start_date_2010_2019 = start_date_2010 + datetime.timedelta(days=train_days_2010_2019)
    train_end_date_2010_2019 = test_start_date_2010_2019 - datetime.timedelta(days=1)

    train = data[start_date_2010.date():train_end_date_2010_2019.date()]
    test = data[test_start_date_2010_2019.date():end_date_2019.date()]
    train_env = Environment1(train)
    test_env = Environment1(test)
    
    def objective(epoch_num):
        Q = Q_Network(input_size=train_env.history_t + 1, hidden_size=100, output_size=3)
        chainer.serializers.load_npz(f'Q_network_epoch_{int(epoch_num)}.npz', Q)
        test_profits_list = []
        for _ in range(100):
            _, _, test_profits = test_dqn(test_env, Q)
            test_profits_list.append(test_profits)
        avg_test_profits = np.mean(test_profits_list)
        return avg_test_profits
    
    pbounds = {
        'epoch_num': (29, 299)
    }
    
    optimizer = BayesianOptimization(
        f=objective,
        pbounds=pbounds,
        random_state=1,
    )
    
    optimizer.maximize(
        init_points=10,
        n_iter=40,
    )
    
    print(optimizer.max)

data = pd.read_csv('C:/Users/User/Documents/RL_/Data/Stocks/SPY_Both.txt')
data['Date'] = pd.to_datetime(data['Date'])
data = data.set_index('Date')

start_date_2010 = datetime.datetime(2010, 1, 1)
end_date_2019 = datetime.datetime(2019, 12, 31)
total_days_2010_2019 = (end_date_2019 - start_date_2010).days
train_days_2010_2019 = int(total_days_2010_2019 * 0.9)
test_days_2010_2019 = total_days_2010_2019 - train_days_2010_2019

optimize_hyperparameters(data, start_date_2010, end_date_2019, train_days_2010_2019, test_days_2010_2019)


|   iter    |  target   | epoch_num |
-------------------------------------
| [0m1        [0m | [0m1.269e+04[0m | [0m141.6    [0m |
| [95m2        [0m | [95m1.275e+04[0m | [95m223.5    [0m |
| [95m3        [0m | [95m1.278e+04[0m | [95m29.03    [0m |
| [0m4        [0m | [0m1.277e+04[0m | [0m110.6    [0m |
| [0m5        [0m | [0m1.277e+04[0m | [0m68.62    [0m |
| [0m6        [0m | [0m1.267e+04[0m | [0m53.93    [0m |
| [0m7        [0m | [0m1.275e+04[0m | [0m79.29    [0m |
| [0m8        [0m | [0m1.269e+04[0m | [0m122.3    [0m |
| [0m9        [0m | [0m1.272e+04[0m | [0m136.1    [0m |
| [95m10       [0m | [95m1.293e+04[0m | [95m174.5    [0m |
| [0m11       [0m | [0m1.274e+04[0m | [0m180.2    [0m |
| [0m12       [0m | [0m1.278e+04[0m | [0m173.8    [0m |
| [0m13       [0m | [0m1.274e+04[0m | [0m173.8    [0m |
| [0m14       [0m | [0m1.292e+04[0m | [0m174.5    [0m |
| [0m15       [0m | [0m1.272e+04[0m | [0m2

In [2]:
import pickle
import matplotlib.pyplot as plt
import numpy as np
import chainer
import chainer.functions as F
import chainer.links as L
import copy
import time
import pandas as pd
import datetime
from bayes_opt import BayesianOptimization

class Q_Network(chainer.Chain):
    def __init__(self, input_size, hidden_size, output_size):
        super(Q_Network, self).__init__(
            fc1 = L.Linear(input_size, hidden_size),
            fc2 = L.Linear(hidden_size, hidden_size),
            fc3 = L.Linear(hidden_size, output_size)
        )
    def __call__(self, x):
        h = F.relu(self.fc1(x))
        h = F.relu(self.fc2(h))
        y = self.fc3(h)
        return y
    def reset(self):
        self.zerograds()

class Environment1:
    def __init__(self, data, history_t=90):
        self.data = data
        self.history_t = history_t
        self.reset()
        
    def reset(self):
        self.t = 0
        self.done = False
        self.profits = 10000
        self.count = []
        self.positions = []
        self.position_value = 0
        self.brok_rate = 0.0009
        self.max_trade_percent = 0.8
        self.tbrokerage = 0
        self.history = [0 for _ in range(self.history_t)]
        self.Act0 = 0
        self.Act1 = 0
        self.Act2 = 0
        self.RW_p = 0
        self.RW_n = 0
        self.RW_p_v = 0
        self.RW_n_v = 0
        return [self.position_value] + self.history
    
    def step(self, act, amount):
        if self.t >= len(self.data) - 1:
            self.done = True
            return [self.position_value] + self.history, 0, self.done

        reward = 0
        if act == 1:
            if self.profits != 0:
                max_trade_amount = self.profits * self.max_trade_percent
                stock_price = self.data.iloc[self.t, :]['Close']
                count = max_trade_amount / stock_price
                buyin = stock_price * count
                self.profits -= buyin
                self.positions.append(stock_price)
                self.count.append(count)
                self.Act1 += 1
        elif act == 2:
            if len(self.positions) > 0:
                sell_ratio = self.determine_sell_ratio(amount)
                num_positions_to_sell = int(len(self.positions) * sell_ratio)
                for i in range(num_positions_to_sell):
                    sell_price = self.data.iloc[self.t, :]['Close']
                    buy_price = self.positions[i]
                    count = self.count[i]
                    abs_num = (sell_price - buy_price) * count
                    if abs_num > 0:
                        self.RW_p += 1
                        self.RW_p_v += abs_num
                    else:
                        self.RW_n += 1
                        self.RW_n_v += abs(abs_num)
                    reward += sell_price * count
                    self.profits += sell_price * count
                self.positions = self.positions[num_positions_to_sell:]
                self.count = self.count[num_positions_to_sell:]
                self.Act2 += 1    
        else:
            self.Act0 += 1
        
        self.t += 1
        if self.t >= len(self.data):
            self.done = True
        self.position_value = 0
        for p in self.positions:
            self.position_value += (self.data.iloc[self.t, :]['Close'] - p)
        self.history.pop(0)
        self.history.append(self.data.iloc[self.t, :]['Close'] - self.data.iloc[(self.t-1), :]['Close'])
        
        if reward > 0:
            reward = 1
        elif reward < 0:
            reward = -1
        
        return [self.position_value] + self.history, reward, self.done
    
    def determine_sell_ratio(self, signal_strength):
        thresholds = [0.2, 0.5, 0.8]
        ratios = [0.25, 0.5, 0.75]
        for i, threshold in enumerate(thresholds):
            if signal_strength < threshold:
                return ratios[i]
        return ratios[0]

def train_dqn(env, Q, epoch_num=1):
    Q_ast = copy.deepcopy(Q)
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(Q)

    step_max = len(env.data)-1
    memory_size = 200
    batch_size = 20
    epsilon = 1.0
    epsilon_decrease = 1e-3
    epsilon_min = 0.1
    start_reduce_epsilon = 200
    train_freq = 10
    update_q_freq = 20
    gamma = 0.97
    show_log_freq = 1
    confidence_threshold_buy = 0.11803540036662069
    confidence_threshold_sell = 0.5689542195973996
    memory = []
    total_step = 0
    total_rewards = []
    total_losses = []
    start = time.time()
    
    for epoch in range(epoch_num):
        pobs = env.reset()
        step = 0
        done = False
        total_reward = 0
        total_loss = 0
        RandAct = 0
        NRandAct = 0
        while not done and step < step_max:
            pact = np.random.randint(3)
            amount = 0.25
            if np.random.rand() > epsilon:
                q_values = Q(np.array(pobs, dtype=np.float32).reshape(1, -1))
                softmax_q_values = F.softmax(q_values).data
                amount = softmax_q_values.ravel()[pact]
                pact = np.random.choice(3, p=softmax_q_values.ravel())
                if pact == 2:
                    if amount < confidence_threshold_sell:
                        pact = 0
                if pact == 1:
                    if amount < confidence_threshold_buy:
                        pact = 0
                NRandAct+=1
            else:
                RandAct+=1
                if pact==0: amount = 0.5
            obs, reward, done = env.step(pact,amount)
            memory.append((pobs, pact, reward, obs, done))
            if len(memory) > memory_size:
                memory.pop(0)
            if len(memory) == memory_size:
                if total_step % train_freq == 0:
                    shuffled_memory = np.random.permutation(memory)
                    memory_idx = range(len(shuffled_memory))
                    for i in memory_idx[::batch_size]:
                        batch = np.array(shuffled_memory[i:i+batch_size])
                        b_pobs = np.array(batch[:, 0].tolist(), dtype=np.float32).reshape(batch_size, -1)
                        b_pact = np.array(batch[:, 1].tolist(), dtype=np.int32)
                        b_reward = np.array(batch[:, 2].tolist(), dtype=np.int32)
                        b_obs = np.array(batch[:, 3].tolist(), dtype=np.float32).reshape(batch_size, -1)
                        b_done = np.array(batch[:, 4].tolist(), dtype=np.bool)
                        q = Q(b_pobs)
                        maxq = np.max(Q_ast(b_obs).data, axis=1)
                        target = copy.deepcopy(q.data)
                        for j in range(batch_size):
                            target[j, b_pact[j]] = b_reward[j]+gamma*maxq[j]*(not b_done[j])
                        Q.reset()
                        loss = F.mean_squared_error(q, target)
                        total_loss += loss.data
                        loss.backward()
                        optimizer.update()
                if total_step % update_q_freq == 0:
                    Q_ast = copy.deepcopy(Q)
            if epsilon > epsilon_min and total_step > start_reduce_epsilon:
                epsilon -= epsilon_decrease
            total_reward += reward
            pobs = obs
            step += 1
            total_step += 1
        total_rewards.append(total_reward)
        total_losses.append(total_loss)
        if (epoch+1) % show_log_freq == 0:
            log_reward = sum(total_rewards[((epoch+1)-show_log_freq):])/show_log_freq
            log_loss = sum(total_losses[((epoch+1)-show_log_freq):])/show_log_freq
            log_profits = env.profits
            if isinstance(env.count, list) and len(env.count) != 0:
                for i in range(len(env.positions)):
                    log_profits += env.count[i] * env.data.iloc[env.t, :]['Close']
            else:
                log_profits = env.profits
            elapsed_time = time.time()-start
            #print(f"Training Epoch {epoch+1} - Profit: {log_profits}")
            start = time.time()       
    return Q, total_losses, total_rewards

def test_dqn(test_env, Q, confidence_threshold_buy=0.1, confidence_threshold_sell=0.8):
    pobs = test_env.reset()
    test_env.profits = 10000
    test_acts = []
    test_rewards = []
    estimated_total_assets = [test_env.profits]

    while not test_env.done:
        q_values = Q(np.array(pobs, dtype=np.float32).reshape(1, -1))
        softmax_q_values = F.softmax(q_values).data
        pact = np.random.choice(3, p=softmax_q_values.ravel())
        amount = softmax_q_values.ravel()[pact]
        if pact == 2 and amount < confidence_threshold_sell:
            pact = 0
        elif pact == 1 and amount < confidence_threshold_buy:
            pact = 0
        obs, reward, done = test_env.step(pact, amount)
        test_acts.append(pact)
        test_rewards.append(reward)
        pobs = obs
        stock_value = sum(c * test_env.data.iloc[test_env.t]['Close'] for c in test_env.count)
        total_assets = test_env.profits + stock_value
        estimated_total_assets.append(total_assets)

    final_profits = total_assets
    return test_acts, test_rewards, final_profits

def optimize_hyperparameters(data, start_date_2010, end_date_2019, train_days_2010_2019, test_days_2010_2019):
    # 資料分割
    test_start_date_2010_2019 = start_date_2010 + datetime.timedelta(days=train_days_2010_2019)
    train_end_date_2010_2019 = test_start_date_2010_2019 - datetime.timedelta(days=1)

    train = data[start_date_2010.date():train_end_date_2010_2019.date()]
    test = data[test_start_date_2010_2019.date():end_date_2019.date()]
    train_env = Environment1(train)
    test_env = Environment1(test)
    
    def objective(epoch_num):
        Q = Q_Network(input_size=train_env.history_t + 1, hidden_size=100, output_size=3)
        chainer.serializers.load_npz(f'Q_network_epoch_{int(epoch_num)}.npz', Q)
        test_profits_list = []
        for _ in range(100):
            _, _, test_profits = test_dqn(test_env, Q)
            test_profits_list.append(test_profits)
        avg_test_profits = np.mean(test_profits_list)
        return avg_test_profits
    
    pbounds = {
        'epoch_num': (29, 299)
    }
    
    optimizer = BayesianOptimization(
        f=objective,
        pbounds=pbounds,
        random_state=1,
    )
    
    optimizer.maximize(
        init_points=10,
        n_iter=40,
    )
    
    print(optimizer.max)

data = pd.read_csv('C:/Users/User/Documents/RL_/Data/Stocks/SPY_Both.txt')
data['Date'] = pd.to_datetime(data['Date'])
data = data.set_index('Date')

start_date_2010 = datetime.datetime(2010, 1, 1)
end_date_2019 = datetime.datetime(2019, 12, 31)
total_days_2010_2019 = (end_date_2019 - start_date_2010).days
train_days_2010_2019 = int(total_days_2010_2019 * 0.9)
test_days_2010_2019 = total_days_2010_2019 - train_days_2010_2019

optimize_hyperparameters(data, start_date_2010, end_date_2019, train_days_2010_2019, test_days_2010_2019)


|   iter    |  target   | epoch_num |
-------------------------------------
| [0m1        [0m | [0m1.266e+04[0m | [0m141.6    [0m |
| [95m2        [0m | [95m1.276e+04[0m | [95m223.5    [0m |
| [95m3        [0m | [95m1.276e+04[0m | [95m29.03    [0m |
| [0m4        [0m | [0m1.276e+04[0m | [0m110.6    [0m |
| [95m5        [0m | [95m1.277e+04[0m | [95m68.62    [0m |
| [0m6        [0m | [0m1.268e+04[0m | [0m53.93    [0m |
| [0m7        [0m | [0m1.275e+04[0m | [0m79.29    [0m |
| [0m8        [0m | [0m1.266e+04[0m | [0m122.3    [0m |
| [0m9        [0m | [0m1.276e+04[0m | [0m136.1    [0m |
| [95m10       [0m | [95m1.296e+04[0m | [95m174.5    [0m |
| [0m11       [0m | [0m1.296e+04[0m | [0m174.4    [0m |
| [0m12       [0m | [0m1.28e+04 [0m | [0m181.2    [0m |
| [0m13       [0m | [0m1.275e+04[0m | [0m167.4    [0m |
| [0m14       [0m | [0m1.272e+04[0m | [0m177.0    [0m |
| [0m15       [0m | [0m1.272e+04[0m | [