In [1]:
import time

import torch.nn as nn
from tqdm.notebook import tqdm
from matplotlib import pyplot as plt
from IPython import display

from config import config
from rl_functions import plot_loss
from rl_trading_framework import *

In [2]:
# Load the price data for the configured tickers and training period.
# The settings are read from `config` and forwarded positionally, in exactly
# the order listed below.
price_data_all = get_data_torch(*(
    config[key] for key in (
        'tickers', 'train_start', 'train_end',
        'start_time', 'end_time', 'multiplier',
        'data_freq', 'device',
    )
))

# print(n_of_gpu, device)

Getting data:   0%|          | 0/101 [00:00<?, ?it/s]

In [3]:
# Run the training routine on the loaded price data with a learning rate of 0.01.
# NOTE(review): the meaning of the two leading positional zeros is not visible in
# this notebook (presumably a process rank / GPU index) — confirm against the
# definition of train(), which is presumably pulled in by the star import above.
train(0, 0, price_data_all, learning_rate=0.01)

Total Episodes: 40


Rolling training:   0%|          | 0/4 [00:00<?, ?it/s]

drop at episode: 8 retained pairs after drop: 4866 temperature: 1.8 retain_threshold: 0.28
drop at episode: 12 retained pairs after drop: 4566 temperature: 1.6 retain_threshold: 0.36000000000000004
drop at episode: 16 retained pairs after drop: 3930 temperature: 1.4000000000000001 retain_threshold: 0.44000000000000006
drop at episode: 20 retained pairs after drop: 2330 temperature: 1.2000000000000002 retain_threshold: 0.52
drop at episode: 24 retained pairs after drop: 1020 temperature: 1.0000000000000002 retain_threshold: 0.6
drop at episode: 28 retained pairs after drop: 288 temperature: 0.8000000000000003 retain_threshold: 0.6799999999999999
drop at episode: 32 retained pairs after drop: 94 temperature: 0.6000000000000003 retain_threshold: 0.7599999999999999
training done


In [None]:
# Rows of the price data are time steps, columns are individual assets.
t, n = price_data_all.shape

# Every unordered pair of asset column indices.
all_pairs = [*combinations(range(n), 2)]

# Choose the pairs to train on: the full-sample variant when
# `full_sample_train` is truthy, otherwise the variant limited by
# k=batch_size_train.
retained_pairs = (
    generate_pairs(all_pairs, n_of_gpu, full_sample=full_sample_train)
    if full_sample_train
    else generate_pairs(all_pairs, n_of_gpu, k=batch_size_train)
)

# From here on the batch size is the number of pairs actually selected.
batch_size_train = len(retained_pairs)

print('Total time steps:', t)
print('Total number of individual assets:', n)
print('Total number of possible pairs:', len(all_pairs))
print('Current training uses', batch_size_train, 'pairs')

In [None]:
# Build the state encoder and the trading policy, then move both to `device`.
state_encoding_model = StateEncodingModel(
    device,
    input_dim=8,
    lstm_hidden_dim=lstm_hidden_dim,
    num_layers=lstm_num_layers,
).to(device)
trading_policy_model = TradingPolicyModel(
    lstm_hidden_dim,
    mlp_hidden_dim,
    action_dim,
).to(device)

# The agent and the environment are handed the bare modules; any
# DataParallel wrapping below happens only after they have been created.
trading_agent = TradingAgent(
    state_encoding_model,
    trading_policy_model,
    optimizer_class,
    learning_rate,
)
env = TradingEnvironment(
    state_encoding_model,
    reg_rolling_window,
    portfolio_settings,
    action_dim,
    all_pairs,
    n_of_gpu,
)

# With more than one GPU, rebind both names to nn.DataParallel wrappers.
if n_of_gpu > 1:
    state_encoding_model, trading_policy_model = (
        nn.DataParallel(state_encoding_model),
        nn.DataParallel(trading_policy_model),
    )


In [None]:
# Starting values for the pair filter; the training loop changes both
# every time it drops pairs.
retain_threshold, temperature = initial_retain_threshold, initial_T
# episodes_per_rolling = initial_episodes_per_rolling

In [None]:
# Rolling-window training loop.
#
# For every rolling window of the price history the agent plays
# `episodes_per_rolling` episodes. After each episode the agent learns from the
# collected rewards, the loss / mean-reward plots are refreshed, and — inside the
# configured drop window — the set of retained pairs is filtered.
#
# NOTE(review): `torch` is used below but is not imported explicitly in this
# notebook; it is presumably in scope through `from rl_trading_framework import *`
# — confirm.

# The figure is created once, outside the main loop, together with a display
# handle; both are passed to plot_loss / plot_mean_reward after every episode.
fig, (ax1, ax2) = plt.subplots(nrows=2, figsize=(12, 8))
dh = display.display(fig, display_id=True)

# Adjustments applied to the pair filter each time pairs are dropped.
TEMPERATURE_STEP = 0.2
RETAIN_THRESHOLD_STEP = 0.08

# Start offsets of the rolling windows. Built once and shared by the episode
# count and the loop so the two can never disagree.
window_starts = range(0, price_data_all.shape[0] - rolling_window_size, rolling_step_size)
total_episodes = len(window_starts) * episodes_per_rolling
print('Total Episodes:', total_episodes)

# Pairs may only be dropped between these two episode numbers (inclusive).
drop_start = int(total_episodes * drop_start_ratio)
drop_end = int(total_episodes * drop_end_ratio)
next_drop = drop_start

# Episodes up to this count act with the 'random' method, later ones 'greedy'.
exploration_cutoff = total_episodes * exploration_fraction

episode_count = 0
retained_pairs_len = []   # number of pairs used in each episode, for later plotting
# perf_counter is monotonic, so the elapsed time cannot be skewed by clock changes.
start_t = time.perf_counter()

for start_idx in tqdm(window_starts, desc="Rolling training"):

    # Point the environment at the current window of the price history.
    price_data = price_data_all[start_idx:start_idx + rolling_window_size, :]
    env.update_data(price_data)

    for episode in range(episodes_per_rolling):

        episode_count += 1
        # todo: check epsilon-greedy
        method = 'random' if episode_count <= exploration_cutoff else 'greedy'

        batch_size_train = len(retained_pairs)
        retained_pairs_len.append(batch_size_train)

        # 1. reset model and env, initiate observation
        if isinstance(state_encoding_model, nn.DataParallel):
            # the wrapped module is reset with the per-GPU share of the batch
            state_encoding_model.module.reset_state(batch_size_train // n_of_gpu, state_encoding_model)
        else:
            state_encoding_model.reset_state(batch_size_train, state_encoding_model)

        trading_agent.reset()
        env.reset(pair_indices_given=retained_pairs)
        ob_t = env.update_observation()
        done = False

        while not done:
            # 2. lstm decides a state
            encoded_state = state_encoding_model(ob_t)
            # unsqueeze dim from (batch_size_train) to (batch_size_train, 1) for torch.cat
            env.norm_port_value = (env.port_value / env.bt_settings['initial_cash']).unsqueeze(1)

            # 3. agent makes a decision
            # action is the desired position at time t
            action = trading_agent.choose_action(encoded_state, env.norm_port_value, method)

            # 4. calculate reward, update observations based on action
            reward, ob_t, done = env.step(action)
            trading_agent.rewards.append(reward)

        # 5. learn after getting the whole trajectory
        trading_agent.rewards = torch.stack(trading_agent.rewards)
        trading_agent.learn(gamma=0.99)
        plot_loss(fig, ax1, dh, trading_agent.loss_history_all)
        plot_mean_reward(fig, ax2, dh, trading_agent.mean_reward_all)

        # 6. filter pairs: only inside the drop window, only once the next
        #    scheduled drop is due, and only while more than
        #    `stop_drop_threshold` pairs remain
        if drop_start <= episode_count <= drop_end and episode_count >= next_drop and len(retained_pairs) > stop_drop_threshold:

            retained_pairs = trading_agent.retain_pairs(retained_pairs, alpha, beta, temperature, retain_threshold, adjust_for_n_gpu=True, n_of_gpu=n_of_gpu)
            next_drop = episode_count + int(total_episodes * drop_pairs_percentage_interval)
            # NOTE(review): nothing here stops `temperature` from reaching zero or
            # going negative after enough drops — confirm retain_pairs tolerates that.
            temperature -= TEMPERATURE_STEP
            retain_threshold += RETAIN_THRESHOLD_STEP
            print('drop at episode:', episode_count, 'retained pairs after drop:', len(retained_pairs), 'temperature:', temperature, 'retain_threshold:', retain_threshold)

end_t = time.perf_counter()
print('Total training time:', round(end_t - start_t, 2), 'seconds')

# Close this cell's figure explicitly instead of whichever figure is current.
plt.close(fig)
    

In [None]:
# Summary figure: three vertically stacked panels.
fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(12, 24))

# Panel 1: loss history recorded by the agent.
ax[0].plot(trading_agent.loss_history_all)
ax[0].set(title="Training Loss over Time", xlabel="Episode Step", ylabel="Loss")

# Panel 2: mean-reward history recorded by the agent.
ax[1].plot(trading_agent.mean_reward_all)
ax[1].set(title="Mean Rewards over Time", xlabel="Episode Step", ylabel="Mean Rewards")

# Panel 3: number of pairs used in each episode.
ax[2].plot(retained_pairs_len)
ax[2].set(title="Number of Pairs Retained over Time", xlabel="Episode Step", ylabel="Number of Pairs")

plt.tight_layout()
plt.show()

In [None]:
# save models
# Save both networks into `model_folder_path`, encoder first, then policy.
# NOTE(review): when n_of_gpu > 1 these names refer to the nn.DataParallel
# wrappers — confirm save_model handles wrapped modules as intended.
for _model, _file_name in (
    (state_encoding_model, encoding_model_file_name),
    (trading_policy_model, policy_model_file_name),
):
    save_model(_model, model_folder_path, _file_name)

In [None]:
# How many pairs are left in `retained_pairs` after the training cell's filtering.
len(retained_pairs)

In [None]:
# Turn every retained (i, j) index pair into a "<ticker i>-<ticker j>" label.
# NOTE(review): `tickers` is not defined anywhere in this notebook — presumably
# it matches the column order of price_data_all; verify before relying on the names.
selected_pair_names = [
    f"{tickers[first]}-{tickers[second]}"
    for first, second in retained_pairs
]

In [None]:
# Display the labels of the selected pairs as the cell output.
selected_pair_names

In [None]:
# Persist the list with IPython's %store magic so that another notebook or kernel
# can restore it with `%store -r selected_pair_names`.
%store selected_pair_names
