# Sheet

In [1]:
pip install kneed

Collecting kneed
  Downloading kneed-0.8.5-py3-none-any.whl.metadata (5.5 kB)
Downloading kneed-0.8.5-py3-none-any.whl (10 kB)
Installing collected packages: kneed
Successfully installed kneed-0.8.5
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Downloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5
Note: you may need to restart the kernel to use updated packages.


In [3]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from collections import deque
import random
import pandas as pd
import warnings

import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import coint
from statsmodels.regression.linear_model import OLS
from statsmodels.tsa.vector_ar.vecm import coint_johansen
import statsmodels.api as sm
import itertools
from tqdm import tqdm
import seaborn as sns
from kneed import KneeLocator
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
warnings.filterwarnings('ignore')

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import namedtuple, deque
import random
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import euclidean

## Part 1: Data Preparation 
### Adjust the bond prices to mitigate the impact of the benchmark bond roll.

In [5]:
tr_dm = pd.read_excel('GENERIC BOND PRICE.xlsx', sheet_name='DM_PRICE')
tr_dm.Dates = pd.to_datetime(tr_dm.Dates)
tr_dm = tr_dm.set_index('Dates')

In [6]:
def calculate_dirty_price(clean_price, coupon, days_since_last_coupon):
    daily_coupon = coupon / 365  
    accrued_interest = daily_coupon * days_since_last_coupon
    return clean_price + accrued_interest

def calculate_days_since_last_coupon(date, last_coupon_date):
    return (date - last_coupon_date).days

def find_first_coupon_date(coupon_series):
    coupon_changes = coupon_series.diff().dropna()
    if len(coupon_changes) > 0:
        first_change_date = coupon_changes.index[0]
        return first_change_date - pd.Timedelta(days=180)  # All coupon paid 6 months before the change
    else:
        return coupon_series.index[0] 
    

In [7]:
dirty_price_series = {}
for column in tr_dm.columns:
    if column.endswith('Govt'):
        clean_price_series = tr_dm[column]
        coupon_series = tr_dm[f"{column} CPN"]

        first_coupon_date = find_first_coupon_date(coupon_series)
        last_coupon_date = first_coupon_date
        current_coupon = coupon_series.iloc[0]
        
        dirty_prices = []
        
        for date, clean_price in clean_price_series.items():
            if coupon_series[date] != current_coupon:
                last_coupon_date = date - pd.Timedelta(days=180)
                current_coupon = coupon_series[date]
            
            days_since_last_coupon = calculate_days_since_last_coupon(date, last_coupon_date)
            
            dirty_price = calculate_dirty_price(clean_price, current_coupon, days_since_last_coupon)
            dirty_prices.append(dirty_price)

            if days_since_last_coupon >= 180:
                last_coupon_date = date

        tr_dm[f"{column}_Dirty"] = pd.Series(dirty_prices, index=clean_price_series.index)

In [8]:

for bond_col in tr_dm.columns:
    if bond_col.endswith('Dirty'): 
        coupon_col = bond_col.replace('Govt_Dirty', 'Govt CPN')
        bond_col = bond_col.replace('_Dirty','')
        
        tr_dm[f"{bond_col} Adjusted"] = tr_dm[bond_col]
        
        coupon_changes = tr_dm[coupon_col].diff().fillna(0) != 0

        for change_date in tr_dm.index[coupon_changes]:

            price_on_change = tr_dm.loc[change_date, bond_col]
            
            previous_price = tr_dm.loc[tr_dm.index[tr_dm.index.get_loc(change_date) - 1], bond_col]
            
            price_diff = price_on_change - previous_price
            
            tr_dm.loc[change_date:, f"{bond_col} Adjusted"] -= price_diff

        tr_dm[f"{bond_col} Adjusted Return"] = tr_dm[f"{bond_col} Adjusted"].pct_change()


# col_adj = [i for i in tr_dm.columns if (i.endswith('Adjusted') or i.endswith('Adjusted Return'))]

col_adj_price_cpn = [i for i in tr_dm.columns if (i.endswith('Adjusted') or i.endswith('CPN'))]
col_adj_price = [i for i in tr_dm.columns if i.endswith('Adjusted')]

col_adj_return = [i for i in tr_dm.columns if i.endswith('Adjusted Return')]

tr_dm_net = tr_dm[col_adj_return].fillna(0)

tr_dm_net_price = tr_dm[col_adj_price].fillna(0)
tr_dm_for_shock = tr_dm[col_adj_price_cpn]

In [9]:
tr_dm_for_shock

Unnamed: 0_level_0,GTFRF10Y Govt CPN,GTITL10Y Govt CPN,GTSEK10Y Govt CPN,GTCHF10Y Govt CPN,GT10 Govt CPN,GTCAD10Y Govt CPN,GTAUD10Y Govt CPN,GTNZD10Y Govt CPN,GTGBP10Y Govt CPN,GTDEM10Y Govt CPN,...,GTITL10Y Govt Adjusted,GTSEK10Y Govt Adjusted,GTCHF10Y Govt Adjusted,GT10 Govt Adjusted,GTCAD10Y Govt Adjusted,GTAUD10Y Govt Adjusted,GTNZD10Y Govt Adjusted,GTGBP10Y Govt Adjusted,GTDEM10Y Govt Adjusted,GTJPY10Y Govt Adjusted
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-06-20,1.75,3.75,1.50,1.25,2.500,2.5,2.75,5.50,2.25,1.5,...,107.220,97.285,105.100,99.070312,101.820,92.385,107.542,95.875,101.440,100.122
2014-06-23,1.75,3.75,1.50,1.25,2.500,2.5,2.75,5.50,2.25,1.5,...,107.615,97.294,105.190,98.890625,101.464,92.262,107.414,96.050,101.630,100.150
2014-06-24,1.75,3.75,2.50,1.25,2.500,2.5,2.75,5.50,2.25,1.5,...,107.805,97.294,105.170,99.304688,101.908,92.694,107.249,96.085,101.660,100.169
2014-06-25,1.75,3.75,2.50,1.25,2.500,2.5,2.75,5.50,2.25,1.5,...,108.070,97.873,105.580,99.468750,102.022,93.084,107.324,96.785,102.166,100.245
2014-06-26,1.75,3.75,2.50,1.25,2.500,2.5,2.75,5.50,2.25,1.5,...,108.120,98.165,105.620,99.734375,102.284,93.090,107.102,96.865,102.370,100.264
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-09-06,3.00,3.85,2.25,0.00,3.875,3.0,3.75,4.25,4.25,2.6,...,118.718,100.244,96.620,93.644531,102.557,96.858,114.218,78.787,96.922,101.281
2024-09-09,3.00,3.85,2.25,0.00,3.875,3.0,3.75,4.25,4.25,2.6,...,118.755,100.450,96.558,93.722656,102.773,96.277,113.792,79.038,96.956,100.848
2024-09-10,3.00,3.85,2.25,0.00,3.875,3.0,3.75,4.25,4.25,2.6,...,119.071,100.842,96.737,94.191406,103.113,96.639,113.942,79.343,97.296,100.857
2024-09-11,3.00,3.85,2.25,0.00,3.875,3.0,3.75,4.25,4.25,2.6,...,119.428,101.373,97.016,94.113281,102.962,97.112,114.432,79.826,97.471,101.260


## Part 3: Identify Correlated Assets (Developed Market)
### Step1 : PCA (cluster by the first principal component)

In [11]:
from statsmodels.tsa.stattools import coint

def test_cointegration_in_clusters(data, cluster_dict):

    cointegrated_pairs = []
    for cluster_num in cluster_dict:
        asset_names = cluster_dict[cluster_num]
        asset_names = [i.replace(' Return', '') for i in asset_names]
        
        # Loop through each pair of assets in the cluster
        for i in range(len(asset_names)):
            for j in range(i + 1, len(asset_names)):
                asset1 = asset_names[i]
                asset2 = asset_names[j]
             
                series1 = data[asset1]
                series2 = data[asset2]
                
                # Perform the Engle-Granger cointegration test
                coint_t, p_value, _ = coint(series1, series2)
                
                # set a higher significant level (0.2) to avoid missing potential relationship
                if p_value < 0.2:  
                    cointegrated_pairs.append([asset1,asset2])
                else:
                    pass
                    # print(f"  {asset1} and {asset2} are NOT cointegrated (p-value: {p_value:.4f})")

    return cointegrated_pairs


In [12]:

def noncoherent_pair_cluster(data, data_price): # input: return data, price data
    # Step 1: Cluster by Principal component 1
    scaler = StandardScaler()
    asset_returns = pd.DataFrame(scaler.fit_transform(data), columns= data.columns)


    # Calculate the loadings of bond returns on PCs
    K = 1
    pca = PCA(n_components=K)
    pca.fit(asset_returns)
    loadings = pca.components_.T

    wcss = []
    for k in range(1, 10):
        kmeans = KMeans(n_clusters=k, random_state=42)
        kmeans.fit(loadings)
        # Inertia: Sum of squared distances to closest cluster center
        wcss.append(kmeans.inertia_)  
        
    # Use the KneeLocator to detect the elbow point
    kneedle = KneeLocator(range(1, 10), wcss, S=1.0, curve='convex', direction='decreasing')

    # Get the optimal number of clusters
    optimal_clusters = kneedle.elbow


    # Clustering in the principal component space and using K-means to cluster different govt bonds
    kmeans = KMeans(n_clusters=optimal_clusters, random_state=42)
    clusters = kmeans.fit_predict(loadings)
    asset_names = asset_returns.columns

    cluster_dic = {}
    for cluster in range(optimal_clusters):
        cluster_assets = asset_names[clusters == cluster]
        cluster_dic[cluster + 1] =cluster_assets


    # Step 2: Find cointegration pairs
    cointegrated_pairs = test_cointegration_in_clusters(data_price, cluster_dic)

    # Step 3: Exclude pairs have similar PC2 and PC3
    K = 3
    pca = PCA(n_components=K)
    pca.fit(asset_returns)
    loadings = pca.components_.T
    explained_var = pca.explained_variance_ratio_

    loadings_df = pd.DataFrame(loadings, columns = ['PC1','PC2','PC3'], index = data.columns)

    pc2_diff_list = [abs(loadings_df.loc[f"{i} Return", 'PC2'] - loadings_df.loc[f"{j} Return", 'PC2']) for (i,j) in cointegrated_pairs]

    threshold_pc2 = np.quantile(pc2_diff_list, 0.5)

    coherent_pair = []
    noncoherent_pair = []

    for i in range(len(cointegrated_pairs)):
        asset1 = cointegrated_pairs[i][0]
        asset2 = cointegrated_pairs[i][1]

        pc1_diff = abs(loadings_df.loc[f"{asset1} Return", 'PC1']  - loadings_df.loc[f"{asset2} Return", 'PC1'])
        pc2_diff = abs(loadings_df.loc[f"{asset1} Return", 'PC2']  - loadings_df.loc[f"{asset2} Return", 'PC2'])

        if pc2_diff < threshold_pc2:
            coherent_pair.append((f'{asset1}', f'{asset2}'))
            # print(asset1, asset2, pc1_diff, 'coherent')
        else:
            noncoherent_pair.append((f'{asset1}', f'{asset2}'))
            # print(asset1, asset2, pc1_diff, 'noncoherent')
            

    return coherent_pair, noncoherent_pair


## Part 4: Naiive Rule-based Trading Algorithm

The code below implements a rule-based pair trading strategy that is based on the 2nd principal component loadings of the asset pair to construct a market-neutral spread.

Key Steps:
1. Rolling PCA Calculation:

Option 1: PCA is applied on the returns of only two selected assets. In this case, the first principal component (PC1) represents the common trend between the two selected assets.

Option 2: PCA can be applied to all 11 bonds, which would make the PC1 represent the broader market trend for all treasury bonds. PC2 represents the specific factors that diverge from the broader market, allowing for more targeted trading opportunities.

2. Rolling Loadings:

The PCA loadings are calculated on a rolling basis to avoid look-ahead bias over time. The strategy focuses on the loadings for the second principal component (PC2), which reflects the bond-specific factors that are relatively insensitive (orthogonal) to broader market movements captured by PC1.
The code extracts the rolling loadings of the selected assets on PC2 and uses these loadings to construct the spread.

3. Spread Construction:

The spread is calculated as a linear combination of the two assets' returns weighted by their PC2 loadings.
Then the weights are normalized based on the total absolute weight of the two assets to ensure that the portfolio remains balanced.

$ \text{Spread}  = w1 * \text{Asset 1 Return} - w2 * \text{Asset 2 Return}$

$ w1 = 1 $


$ w2 = w1 * \frac{\sigma 1}{\sigma 2} * \frac{PC loading Asset 1}{PC Loading Asset 2} $



1. Trading Signals:

Buy Signal: Generated when the z-score of the spread (number of standard deviations the spread deviates from its rolling mean) falls below a specified lower threshold. This implies the spread has diverged significantly, and the strategy goes long on the spread, expecting a reversal.
Sell Signal: Generated when the z-score rises above the upper threshold, indicating the spread has widened significantly. The strategy goes short on the spread, expecting a convergence.
Exit Signal: The strategy closes the positions when the spread reverts to a level closer to the mean, as defined by a close threshold.

5. Z-Score and Thresholds:

The z-score is calculated based on the rolling mean and rolling standard deviation of the spread. This z-score is used to quantify the divergence of the spread from its historical average.
The thresholds (upper, lower, and close) dictate when the strategy enters and exits positions.

6. Performance Evaluation:

The code tracks the cumulative returns of the pair trading strategy over time. It shifts the positions by one day to avoid look-ahead bias when calculating the returns for the next day.
The final cumulative returns are plotted to visualize the strategy's performance for the selected asset pair.

In [15]:
def rolling_pca_loadings(data, window_size, num_components=2):
    rolling_loadings = []
    
    for i in range(window_size, len(data) + 1):
        window_data = data[i - window_size:i]
        pca = PCA(n_components=num_components)
        pca.fit(window_data)
        loadings = pca.components_.T  
        rolling_loadings.append(loadings)
        
    return np.array(rolling_loadings)

def diff_sign_pc_loadings(data, window_size):
    loadings = rolling_pca_loadings(data, window_size, num_components=2)
    result = []
    for matrix in loadings:
        first_col = matrix[:, 0]
        second_col = matrix[:, 1]
        
        if np.sign(first_col[0]) == np.sign(first_col[1]):
            result.append(first_col)
        else:
            result.append(second_col)
    
    result_array = np.array(result)
    
    return result_array

In [16]:
# Non-coherent pair
def rule_based_strategy_info(data, window_size, upper_threshold,close_threshold, asset1,asset2):

    # 1. Rolling PCA Calculation:
    # option1: apply pca on two assets: pc1 represents the common trend between asset1 and asset 2 only
    asset1 = f'{asset1} Return'
    asset2 = f'{asset2} Return'
    selected_asset_returns = data[[f'{asset1}', f'{asset2}']]
    
    # 2. Rolling Loadings:
    rolling_loadings = diff_sign_pc_loadings(selected_asset_returns, window_size)

    pc_loading_asset1 = rolling_loadings[:, 0] 
    pc_loading_asset2 = rolling_loadings[:, 1] 

    # 3. Spread Construction:
    
    sigma_asset1 = data[f'{asset1}'].rolling(window=window_size).std().dropna()
    sigma_asset2 = data[f'{asset2}'].rolling(window=window_size).std().dropna()
    pc_loading_asset1 = pd.Series(pc_loading_asset1, index = sigma_asset1.index)
    pc_loading_asset2 = pd.Series(pc_loading_asset2, index = sigma_asset2.index)

    sigma_ratio = sigma_asset1 / sigma_asset2
    
    w1 = 1
    w2 = (w1 * sigma_ratio * pc_loading_asset1)/pc_loading_asset2
    
    spread = w1 * data[f'{asset1}'] - w2 * data[f'{asset2}']

    # 4. Trading Signals:
    rolling_mean = spread.rolling(window=window_size).mean()
    rolling_std = spread.rolling(window=window_size).std()

    # 5. Z-Score and Thresholds:
    z_score = (spread - rolling_mean) / rolling_std
    
    lower_threshold = - upper_threshold
    
    positions = pd.DataFrame(index=data.index, columns=['Position','Holdings_w1','Holdings_w2'])
    
    # Enter signal (spread deviates from the mean)
    positions['Position'] = np.where(z_score > upper_threshold, -1, 0)
    positions['Position'] = np.where(z_score < lower_threshold, 1, positions['Position'])
    
    # Exit signal (spread reverts to the mean)
    positions['Position'] = np.where((z_score > - close_threshold) & (positions['Position'] == 1), 0, positions['Position'])
    positions['Position'] = np.where((z_score < close_threshold) & (positions['Position'] == -1), 0, positions['Position'])
    
    positions['Holdings_w1'] = positions['Position'] * w1
    positions['Holdings_w2'] = positions['Position'] * w2


    # 6. Performance Evaluation:
    positions_shifted = positions.shift(1)  
    returns = positions_shifted['Holdings_w1'] * data[f'{asset1}'] + positions_shifted['Holdings_w2'] * data[f'{asset2}']
    initial_investment = 1
    cumulative_returns = (1+ returns).cumprod() -1

    return z_score, spread, returns, cumulative_returns, positions

In [17]:
# coherent pair
def rule_based_strategy(data, window_size, upper_threshold,close_threshold, asset1,asset2):

    # 1. Rolling PCA Calculation:
    # option1: apply pca on two assets: pc1 represents the common trend between asset1 and asset 2 only
    asset1 = f'{asset1} Return'
    asset2 = f'{asset2} Return'
    selected_asset_returns = data[[f'{asset1}', f'{asset2}']]
    
    # 2. Rolling Loadings:
    rolling_loadings = diff_sign_pc_loadings(selected_asset_returns, window_size)

    pc_loading_asset1 = rolling_loadings[:, 0] 
    pc_loading_asset2 = rolling_loadings[:, 1] 

    # 3. Spread Construction:
    
    sigma_asset1 = data[f'{asset1}'].rolling(window=window_size).std().dropna()
    sigma_asset2 = data[f'{asset2}'].rolling(window=window_size).std().dropna()
    pc_loading_asset1 = pd.Series(pc_loading_asset1, index = sigma_asset1.index)
    pc_loading_asset2 = pd.Series(pc_loading_asset2, index = sigma_asset2.index)

    sigma_ratio = sigma_asset1 / sigma_asset2
    
    w1 = 1
    w2 = (w1 * sigma_ratio * pc_loading_asset1)/pc_loading_asset2
    
    spread = w1 * data[f'{asset1}'] - w2 * data[f'{asset2}']

    # 4. Trading Signals:
    rolling_mean = spread.rolling(window=window_size).mean()
    rolling_std = spread.rolling(window=window_size).std()

    # 5. Z-Score and Thresholds:
    z_score = (spread - rolling_mean) / rolling_std
    
    lower_threshold = - upper_threshold
    
    positions = pd.DataFrame(index=data.index, columns=['Position','Holdings_w1','Holdings_w2'])
    
    positions['Position'] = np.where(z_score > upper_threshold, -1, 0)
    positions['Position'] = np.where(z_score < lower_threshold, 1, positions['Position'])
    
    # Exit signal (spread reverts to the mean)
    positions['Position'] = np.where((z_score > - close_threshold) & (positions['Position'] == 1), 0, positions['Position'])
    positions['Position'] = np.where((z_score < close_threshold) & (positions['Position'] == -1), 0, positions['Position'])
    
    positions['Holdings_w1'] = positions['Position'] * w1
    positions['Holdings_w2'] = positions['Position'] * w2


    # 6. Performance Evaluation:
    positions_shifted = positions.shift(1)  
    returns = positions_shifted['Holdings_w1'] * data[f'{asset1}'] + positions_shifted['Holdings_w2'] * data[f'{asset2}']
    initial_investment = 1
    cumulative_returns = (1+ returns).cumprod() -1

    return {
    'positions':positions['Position'],
    'z_score':z_score, 
    'cum_return':cumulative_returns
    }

### Grid search for the optimal parameters

In [19]:
def optimize_parameters(data, noncoherent_pair, window_size_list, upper_threshold_list, close_threshold_list):

    param_results = {}
    pair_results = {}
    
    # Try each parameter combination
    for window_size, upper_threshold, close_threshold in itertools.product(
            window_size_list, upper_threshold_list, close_threshold_list):
        
        if close_threshold >= upper_threshold:
            continue
            
        returns = []
        pair_details = []
        
        for asset_pair in noncoherent_pair:
            asset1 = asset_pair[0]#.replace(' Adjusted Return', '')
            asset2 = asset_pair[1]#.replace(' Adjusted Return', '')
            
            # Get return for this pair with current parameters
            result = rule_based_strategy(data, window_size, upper_threshold, close_threshold, asset1, asset2)
            returns.append(result['cum_returns'][-1])
            
            pair_details.append({'asset1': asset1, 'asset2': asset2, 'return': result})
        
        # Store average return for this parameter combination
        param_key = (window_size, upper_threshold, close_threshold)
        param_results[param_key] = {'avg_return': np.mean(returns),'pair_details': pair_details}

    
    # Find best parameter combination
    best_params = max(param_results.items(), key=lambda x: x[1]['avg_return'])
    window_size, upper_threshold, close_threshold = best_params[0]
    
    
    return upper_threshold, close_threshold, window_size


## Part 5: Reinforcement Learning Based Strategy

To enhance the existing rule-based strategy for pair trading using a Deep Q-Network (DQN), we can replace the rule-based signals with a reinforcement learning (RL) approach. The purpose for RL step is to find 'when' to trade and 'how much' to trade by interacting with observation space, action space and reward space.

Key Points:

Teachnique: DQN Strategy

Dynamic Pair Selection & Training Process
1. Formation Period (Year T): Select correlated pairs using PCA
2. Training Period (Year T+1): Train RL agent on selected pairs
3. Testing Period (Year T+2): Test agent on newly generated pairs
4. Repeat process by rolling forward one year for robust validation

![image.png](attachment:./attachment:853841c4-393a-4b7c-870b-290257aa81f2.png)

Model Architecture
1. Deep Q-Network (DQN) for automated trading decisions 
2. State space includes normalized spread, z-score, position metrics
3. Action space: Long (1), Neutral (0), Short (-1) positions
4. Reward = Return - beta * (RL action - rule-based strategy action)

Training Process:
1. the agent learns optimal entry and exit points through extensive episodes with different pairs and market conditions
2. Set the network with highest reward as target network, agent continuously learns to match or exceed this taret network performance
3. the agent develop resilience to both regime changes and switching between different asset pairs, making it truly adaptive to market dynamics.

In [22]:
# defines a new class called DQN that inherits from PyTorch's nn.Module. 

import torch
import torch.nn as nn
import torch.nn.functional as F

# todo: to be fine-tuned
class DQN(nn.Module):
    def __init__(self, state_size, action_size, hidden_sizes,activation = 'relu', dropout_rate=0.2):
        super(DQN, self).__init__()
        
        self.layers = nn.ModuleList()
        
        if activation == 'relu':
            act_fn = nn.ReLU()
        elif activation == 'leaky_relu':
            act_fn = nn.LeakyReLU(0.01)
        elif activation == 'elu':
            act_fn = nn.ELU()
        elif activation == 'selu':
            act_fn = nn.SELU()
        elif activation == 'tanh':
            act_fn = nn.Tanh()

    
        # Input layer
        # Input layer with proper initialization
        input_layer = nn.Linear(state_size, hidden_sizes[0])
        nn.init.xavier_uniform_(input_layer.weight)
        nn.init.zeros_(input_layer.bias)
        self.layers.append(input_layer)
        self.layers.append(act_fn)
        
        # Hidden layers with proper initialization
        for i in range(1, len(hidden_sizes)):
            hidden_layer = nn.Linear(hidden_sizes[i-1], hidden_sizes[i])
            nn.init.xavier_uniform_(hidden_layer.weight)
            nn.init.zeros_(hidden_layer.bias)
            self.layers.append(hidden_layer)
            self.layers.append(act_fn)
            
            if dropout_rate > 0:
                self.layers.append(nn.Dropout(dropout_rate))
        
        # Output layer with proper initialization
        output_layer = nn.Linear(hidden_sizes[-1], action_size)
        nn.init.xavier_uniform_(output_layer.weight)
        nn.init.zeros_(output_layer.bias)
        self.layers.append(output_layer)

    def forward(self, state):
        x = state
        for i, layer in enumerate(self.layers):
            x = layer(x)
        return x
    
# creates a named tuple to store experience replays. a convenient way to group related data.
Transition = namedtuple('Transition', ('state', 'action', 'reward', 'next_state', 'done'))

In [23]:
class ReplayBuffer:
    def __init__(self, capacity):
        self.memory = deque(maxlen=capacity)
    
    def push(self, *args):
        self.memory.append(Transition(*args))
    
    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)
    
    def __len__(self):
        return len(self.memory)

In [24]:
class DQNAgent:
    def __init__(self, state_size, action_size,hidden_size, learning_rate=1e-5):
        self.hidden_size= hidden_size
        self.state_size = state_size
        self.action_size = action_size
        self.memory = ReplayBuffer(10000) # a replay buffer with a capacity of 10,000 experiences
        self.gamma = 0.95    # discount factor
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.1
        self.epsilon_decay = 0.995
        self.learning_rate = learning_rate
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.policy_net = DQN(state_size, action_size,hidden_size).to(self.device)
        self.target_net = DQN(state_size, action_size,hidden_size).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.batch_size = 32

        self.optimizer = optim.Adam(self.policy_net.parameters()) # update for policy network's weight
        self.training_rmse_history = []  # Track training RMSE
        self.q_value_history = []  # Store Q-value predictions
        self.actual_returns_history = []  # Store actual returns
        self.next_states = []

        
    def act(self, state, store_next_state = None):
        if random.random() <= self.epsilon: # exploration (random act)
            self.q_value_history.append(0)
            return random.randrange(self.action_size)
        
        with torch.no_grad(): 

            state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
            action_values = self.policy_net(state)
            action = action_values.max(1)[1].item()

            q_value = action_values[0][action].item()

            if not np.isnan(q_value):
                self.q_value_history.append(q_value)
            else:
                self.q_value_history.append(0)

            return action


    def step(self, state, action, reward, next_state, done):
        self.memory.push(state, action, reward, next_state, done) # add new experience to replay buffer

    def learn(self):

        if len(self.memory) < self.batch_size: # check if enough space in buffer
            return
        
        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))
        
        state_batch = torch.FloatTensor(batch.state).to(self.device)
        action_batch = torch.LongTensor(batch.action).unsqueeze(1).to(self.device)
        reward_batch = torch.FloatTensor(batch.reward).to(self.device)
        next_state_batch = torch.FloatTensor(np.array(batch.next_state)).to(self.device)


        # pass our batch of states through our policy network to get Q-values for all actions. 
        # use gather to select only the Q-values for the actions that were actually taken. This gives us our current estimate of the Q-values for our sampled state-action pairs
        current_q_values = self.policy_net(state_batch).gather(1, action_batch)

        # For each non-terminal next state, we compute the maximum Q-value using our target network. 
        # We use the target network (which is updated less frequently) to provide a more stable target for learning.
        next_states = torch.FloatTensor([s for s in batch.next_state if s is not None]).to(self.device)

        with torch.no_grad():
            if next_states.size(0) > 0:
                next_actions = self.policy_net(next_states).max(1)[1]
                next_state_values = self.target_net(next_states).gather(1, next_actions.unsqueeze(1))
            else:
                next_state_values = torch.zeros_like(current_q_values)

        expected_q_values = (next_state_values * self.gamma) + reward_batch.unsqueeze(1)


        # Weighted loss
        mse_loss = F.mse_loss(current_q_values, expected_q_values)
        # loss =  F.smooth_l1_loss(current_q_values, expected_q_values)
        training_rmse  = torch.sqrt(mse_loss)

        self.optimizer.zero_grad()
        mse_loss.backward()
        self.optimizer.step()

        self.training_rmse_history.append(training_rmse.item()) 

        return training_rmse.item()



    # as we have more training samples, we don't need to explore too often
    def decay_epsilon(self):
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def calculate_testing_rmse(self):
        if not self.next_states:
            return None
            
        with torch.no_grad():
 
            next_state = torch.FloatTensor(self.next_states).to(self.device)
            next_actions = self.policy_net(next_state).max(1)[1]
            next_state_values = self.target_net(next_state).gather(1, next_actions.unsqueeze(1))

            rewards = torch.FloatTensor(self.actual_returns_history).to(self.device)

            target_q_values = rewards + (self.gamma * next_state_values.squeeze())
            
            predicted_q_values = torch.FloatTensor(self.q_value_history).to(self.device)
        
            bellman_error = torch.sqrt(F.mse_loss(predicted_q_values, target_q_values))
            
            return bellman_error.item()

    
    def reset_test_history(self):

        self.q_value_history = []
        self.actual_returns_history = []
        self.next_states = []

In [25]:
class SinglePairTradingEnv:
    def __init__(self, asset_returns_raw, asset1, asset2, upper_threshold, close_threshold, window_size, beta):
        """
        Initialize single pair trading environment
        """
        self.asset_returns_raw = asset_returns_raw
        self.asset1 = asset1
        self.asset2 = asset2
        self.window_size = window_size
        self.upper_threshold = upper_threshold
        self.close_threshold = close_threshold
        self.beta = beta
        self.action_space = [-1, 0, 1]
        
        # Calculate spread for the pair
        self.spread, self.w2 = self._calculate_spread()
        
        # Initialize training variables
        self.reset()

    def _calculate_spread(self):
        """Calculate spread for the pair"""
        selected_asset_returns = self.asset_returns_raw[[f'{self.asset1} Return', f'{self.asset2} Return']]
        
        rolling_loadings = diff_sign_pc_loadings(selected_asset_returns, self.window_size)
        pc_loading_asset1 = rolling_loadings[:, 0]
        pc_loading_asset2 = rolling_loadings[:, 1]
        
        sigma_asset1 = self.asset_returns_raw[f'{self.asset1} Return'].rolling(window=self.window_size).std().dropna()
        sigma_asset2 = self.asset_returns_raw[f'{self.asset2} Return'].rolling(window=self.window_size).std().dropna()
        
        pc_loading_asset1 = pd.Series(pc_loading_asset1, index=sigma_asset1.index)
        pc_loading_asset2 = pd.Series(pc_loading_asset2, index=sigma_asset2.index)
        sigma_ratio = sigma_asset1 / sigma_asset2
        
        w1 = 1
        w2 = (w1 * sigma_ratio * pc_loading_asset2) / pc_loading_asset1
        total_weight = abs(w1) + abs(w2)
        w1 = w1 / total_weight
        w2 = w2 / total_weight

        spread = w1 * self.asset_returns_raw[f'{self.asset1} Return'] - w2 * self.asset_returns_raw[f'{self.asset2} Return']
        
        return spread, w2

    def reset(self):
        """Reset environment state"""
        # Reset position and portfolio
        self.position = 0
        self.previous_position = 0
        self.portfolio_value = 1
        self.steps_taken = 0
        self.cumulative_return = [self.portfolio_value]
        
        # Reset tracking variables
        self.portfolio_start_value = self.portfolio_value
        self.last_position_change = None
        self.position_start_value = None
        self.return_list = []
        self.action_history = []
        self.reward = 0
        
        self.current_step = self.window_size
        
        return self._get_normalized_state()

    def step(self, action):
        """Execute one step in the environment"""
        self.steps_taken += 1
        self.previous_position = self.position
        new_position = self.action_space[action]
        
        # Handle position changes
        portfolio_reward = 0
        if new_position != 0 and self.previous_position == 0:
            self.last_position_change = self.current_step
            self.position_start_value = self.portfolio_value
        elif new_position == 0 and self.previous_position != 0:
            self.last_position_change = None
            self.position_start_value = None
            portfolio_reward = self._calculate_portfolio_reward()

        self.position = new_position
        
        # Calculate rewards
        spread_return = self._calculate_immediate_reward()
        baseline_action = self._get_baseline_action()
        deviation_penalty = self.beta * abs(new_position - baseline_action)
        self.reward = spread_return - deviation_penalty

        # Update portfolio value
        self.return_list.append(1 + spread_return)
        self.cumulative_return = np.cumprod(self.return_list) -1
        self.portfolio_value = self.cumulative_return[-1]
        
        # Store action
        self.action_history.append({
            'step': self.current_step,
            'position': self.position,
            'reward': self.reward,
            'spread_return': spread_return,
            'portfolio_value': self.portfolio_value,
            'cumulative_return': self.cumulative_return
        })
        
        # Advance step
        self.current_step += 1
        
        # Episode ends at end of data
        done = self.current_step >= len(self.spread) - 1
        
        # Close position at end of episode
        if done and self.position != 0:
            self.position = 0
            portfolio_reward = self._calculate_portfolio_reward()
            self.reward += portfolio_reward
        
        return self._get_normalized_state(), self.reward, done, {
            'portfolio_value': self.portfolio_value,
            'spread_return': spread_return,
            'deviation_penalty': deviation_penalty
        }

    def _get_normalized_state(self):
        """Get normalized state representation"""
        spread = self.spread.iloc[self.current_step]
        z_score = self._calculate_z_score(spread)
        
        return np.array([
            self.position, 
            spread,
            z_score
        ])


    def _calculate_z_score(self, spread):
        if self.current_step < self.window_size:
            return 0  
        window = self.spread.iloc[self.current_step - self.window_size:self.current_step]
        std_dev = window.std()
        if std_dev == 0 or np.isnan(std_dev):
            return 0  
        return (spread - window.mean()) / std_dev

    def _calculate_portfolio_reward(self):
        if self.position_start_value is None:
            return 0
        return (self.portfolio_value - self.position_start_value) / self.position_start_value

    def _calculate_immediate_reward(self):
        asset1_return = self.asset_returns_raw[f'{self.asset1} Return'].iloc[self.current_step]
        asset2_return = self.asset_returns_raw[f'{self.asset2} Return'].iloc[self.current_step]
        prev_index = self.asset_returns_raw.index[self.current_step-1]
        w2_prev = self.w2.loc[prev_index]
        
        spread_return = asset1_return - w2_prev * asset2_return

        return self.previous_position * spread_return
    
    def _get_baseline_action(self):
        z_score = self._calculate_z_score(self.spread.iloc[self.current_step])
        lower_threshold = -1 * self.upper_threshold
        
        if z_score > self.upper_threshold and self.position < 0:  # short signal
            return -1
        elif z_score < lower_threshold and self.position > 0:  # long signal
            return 1
        elif abs(z_score) < self.close_threshold and self.position == 0:  # neutral
            return 0
        
        return self.position  
    

In [26]:
def train_single_pair_dqn(env, episodes, hidden_size, batch_size=32):
    state_size = 3  # position, spread, z-score
    action_size = len(env.action_space)
    
    agent = DQNAgent(state_size, action_size, hidden_size)
    
    # Initialize training metrics
    training_metrics = {
        'best_model_positions': [],
        'best_model_z_score': [],
        'best_portfolio_value': float('-inf'),
        'best_model_rmse': float('-inf'),
        'best_reward': float('-inf'),
        'best_cumulative_return': [],
        'best_state_dict': None,
        'training_rmse_history': [],  # Track RMSE across episodes
        'reward_history': []  # Track rewards across episodes
    }

    for episode in range(episodes):
        state = env.reset()
        episode_reward = 0
        done = False
        episode_training_rmse = []
        agent.reset_test_history()
        positions = []
        z_score = []

        while not done:
            action = agent.act(state)
            next_state, reward, done, info = env.step(action)
            
            positions.append(next_state[0])
            z_score.append(next_state[2])
            
            agent.memory.push(state, action, reward, next_state, done)
            if len(agent.memory) > batch_size:
                training_rmse = agent.learn()
                if training_rmse is not None:
                    episode_training_rmse.append(training_rmse)

            agent.actual_returns_history.append(reward)
            episode_reward += reward
            state = next_state

        # Store episode metrics
        training_metrics['reward_history'].append(episode_reward)
        avg_rmse = np.mean(episode_training_rmse) if episode_training_rmse else float('inf')
        training_metrics['training_rmse_history'].append(avg_rmse)
        
        # Update best model if performance improves
        if episode_reward > training_metrics['best_reward']:
            training_metrics['best_portfolio_value'] = env.portfolio_value
            training_metrics['best_reward'] = episode_reward
            training_metrics['best_state_dict'] = agent.policy_net.state_dict()
            training_metrics['best_cumulative_return'] = env.cumulative_return
            training_metrics['best_model_rmse'] = avg_rmse
            training_metrics['best_model_positions'] = positions
            training_metrics['best_model_z_score'] = z_score

            # Update target network with best model
            agent.target_net.load_state_dict(training_metrics['best_state_dict'])

        agent.decay_epsilon()

    # Set the final network to the best found
    agent.policy_net.load_state_dict(training_metrics['best_state_dict'])
    agent.target_net.load_state_dict(training_metrics['best_state_dict'])
    
    return agent, training_metrics

def backtest(env, agent):
    state = env.reset()
    done = False
    agent.reset_test_history()
    test_positions = []
    test_z_score = []

    while not done:
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)

        test_positions.append(next_state[0])
        test_z_score.append(next_state[2])
        agent.actual_returns_history.append(reward)

        if next_state is not None:
            agent.next_states.append(next_state)
        else:
            agent.next_states.append(0)

        state = next_state

    testing_rmse = agent.calculate_testing_rmse()

    backtest_result = {
        'action_history': env.action_history,
        'testing_rmse': testing_rmse,
        'testing_positions': test_positions,
        'testing_z_score': test_z_score
    }

    return backtest_result

In [27]:
from scipy.optimize import newton

def calculate_yield(price, coupon, maturity_years=10, frequency=2, face_value=100):
    """Calculate yield using Newton's method - handles single values"""
    if isinstance(coupon, pd.Series):
        coupon = float(coupon.iloc[0])  # Take first value if Series
        
    def bond_price_diff(ytm):
        """Calculate difference between actual price and theoretical price at given YTM"""
        periods = int(maturity_years * frequency)
        coupon_payment = (coupon/100/frequency) * face_value  # Convert coupon to decimal
        
        # Sum of PV of all coupon payments
        pv_coupons = sum(coupon_payment / (1 + ytm/frequency)**(t+1) 
                        for t in range(periods))
        
        # PV of principal
        pv_principal = face_value / (1 + ytm/frequency)**periods
        
        return pv_coupons + pv_principal - price
    
    try:

        initial_guess = (coupon/100) * (face_value/price)
        
        ytm = newton(bond_price_diff, initial_guess, 
                    tol=1e-7, maxiter=100)
        return ytm
        
    except:
        # Fallback if solver fails
        print(f"Solver failed for price={price}, coupon={coupon}")
        return (coupon/100) 

def calculate_price(ytm, coupon, maturity_years=10, frequency=2, face_value=100):
    """Calculate bond price given yield - handles single values"""
    if isinstance(coupon, pd.Series):
        coupon = float(coupon.iloc[0])  # Take first value if Series
        
    periods = int(maturity_years * frequency)
    coupon_payment = coupon/frequency
    pv = sum(coupon_payment / (1 + ytm/frequency)**(t+1) for t in range(periods))
    
    pv += face_value / (1 + ytm/frequency)**periods
    return pv

In [28]:

def generate_and_concatenate_shocked_returns_bonds(df, asset1, asset2, shock_bps, maturity_years=10):
    asset1 = asset1.replace(' Adjusted','')
    asset2 = asset2.replace(' Adjusted','')
    bonds = [asset1, asset2]
    
    # Create price and coupon DataFrames for selected bonds
    price_data = pd.DataFrame()
    coupon_data = pd.DataFrame()
    
    for bond in bonds:
        price_col = f"{bond} Adjusted"
        coupon_col = f"{bond} CPN"
        price_data[bond] = df[price_col]
        coupon_data[bond] = df[coupon_col]
    
    # Calculate original returns
    original_returns = price_data.pct_change()
    original_returns.columns = [f"{col} Adjusted Return" for col in original_returns.columns]
    
    # List to store all return series
    all_returns = [original_returns]
    
    # Generate date ranges for shocked series
    date_increment = pd.Timedelta(days=1)
    base_dates = df.index
    current_start = base_dates[-1] + date_increment
    
    # Convert basis points to decimal
    shock_levels = [x/10000 for x in shock_bps]
    
    # Generate shocked returns with extended dates
    for i, shock in enumerate(shock_levels):
        # Create new dates for this shock period
        shock_dates = pd.date_range(
            start=current_start,
            periods=len(base_dates),
            freq=pd.infer_freq(base_dates)
        )
        current_start = shock_dates[-1] + date_increment

        shocked_prices = pd.DataFrame(index=shock_dates)
        
        for bond in bonds:
            # Calculate yields and shocked prices
            yields = price_data[bond].apply(
                lambda x: calculate_yield(float(x), float(coupon_data[bond].iloc[0]), maturity_years))

            shocked_yields = yields + shock
            
            new_prices = shocked_yields.apply(
                lambda x: calculate_price(float(x), float(coupon_data[bond].iloc[0]), maturity_years))
            
            shocked_prices[f"{bond} Adjusted Return"] = pd.Series(
                data=new_prices.values,
                index=shock_dates
            )
            
        # Calculate returns and add data_type
        shocked_returns = shocked_prices.pct_change()
        
        # Store shocked returns
        all_returns.append(shocked_returns)
        
        print(f"Shock {int(shock*10000)}bps - Length: {len(shocked_returns)}")
    
    # Concatenate all return series
    concatenated_returns = pd.concat(all_returns)
    concatenated_returns = concatenated_returns.dropna()
    
    
    return concatenated_returns

In [29]:
def generate_and_concatenate_shocked_returns(original_price, vol_shocks=[0.5, 0.75, 1.0]):
    original_dates = original_price.index
    original_returns = original_price.pct_change()

    original_cols = original_returns.columns
    renamed_cols = [f"{col} Return" for col in original_cols]
    original_returns.columns = renamed_cols

    # List to store all return series
    all_returns = [original_returns]
    std_prices = original_price.std()
    last_date = original_dates[-1]
    
    for i, vol_shock in enumerate(vol_shocks, 1):
        # Create shocked prices with extended dates
        new_dates = pd.date_range(start=last_date + pd.Timedelta(days=1),
                                periods=len(original_dates),
                                freq=pd.infer_freq(original_dates))
        
        shocked_prices = pd.DataFrame(index=new_dates)
        for col in original_price.columns:
            shocked_prices[f"{col} Return"] = original_price[col].values + vol_shock * std_prices[col]
        
        shocked_returns = shocked_prices.pct_change()
        all_returns.append(shocked_returns)
        last_date = new_dates[-1]
    concatenated_returns = pd.concat(all_returns)
    concatenated_returns = concatenated_returns.dropna()
    
    return concatenated_returns

def train_single_pair_dqn_concatenated(concatenated_returns, asset1, asset2, window_size, upper_threshold, 
                                     close_threshold, beta, hidden_size, episodes=70):
    """
    Train agent on concatenated returns data
    """
    
    # Create environment with concatenated returns
    env = SinglePairTradingEnv(concatenated_returns, asset1, asset2, 
                              upper_threshold, close_threshold, window_size, beta)
    
    
    # Initialize agent and metrics
    state_size = 3
    action_size = 3  # -1, 0, 1
    agent = DQNAgent(state_size, action_size, hidden_size)
    
    training_metrics = {
        'best_model_positions': [],
        'best_model_z_score': [],
        'best_portfolio_value': float('-inf'),
        'best_model_rmse': float('inf'),
        'best_reward': float('-inf'),
        'best_cumulative_return': [],
        'best_state_dict': None,
        'training_rmse_history': [],
        'reward_history': []
    }

    for episode in range(episodes):
        state = env.reset()
        episode_reward = 0
        done = False
        episode_rmse = []
        current_positions = []
        current_z_scores = []
        
        while not done:
            action = agent.act(state)
            next_state, reward, done, info = env.step(action)
            
            # Store experience
            agent.memory.push(state, action, reward, next_state, done)
            
            # Learn if enough samples
            if len(agent.memory) > agent.batch_size:
                rmse = agent.learn()
                if rmse is not None:
                    episode_rmse.append(rmse)
            
            episode_reward += reward
            current_positions.append(next_state[0])
            current_z_scores.append(next_state[2])
            
            state = next_state
        if episode % 10 ==0:
            print('episode',episode)
        
        # Store episode metrics
        training_metrics['reward_history'].append(episode_reward)
        avg_rmse = np.mean(episode_rmse) if episode_rmse else float('inf')
        training_metrics['training_rmse_history'].append(avg_rmse)
        
        # Update best model if performance improves
        # if avg_rmse < training_metrics['best_model_rmse']:
        if episode_reward > training_metrics['best_reward']:
            training_metrics['best_reward'] = episode_reward
            training_metrics['best_state_dict'] = agent.policy_net.state_dict()
            training_metrics['best_model_rmse'] = avg_rmse
            training_metrics['best_model_positions'] = current_positions
            training_metrics['best_model_z_score'] = current_z_scores
            training_metrics['best_cumulative_return'] = env.cumulative_return
        
        agent.decay_epsilon()
        
    
    # Set final network to best found
    agent.policy_net.load_state_dict(training_metrics['best_state_dict'])
    agent.target_net.load_state_dict(training_metrics['best_state_dict'])
    
    return agent, training_metrics

In [30]:
def plot_cumulative_returns(dates, training_returns, type):
    plt.figure(figsize=(12, 6))
    plt.plot(dates, training_returns, label='Returns')
    plt.title(f'Cumulative Returns in {type} set')
    plt.xlabel('Dates')
    plt.ylabel('Cumulative Return')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()


def plot_comparative_trading_behavior(dates, action_results, rule_based_result, upper_threshold, data_type, hidden_size, window_size):

    hidden_size_str = '-'.join(str(layer) for layer in hidden_size)
    if data_type == 'Testing':
        dqn_positions = action_results['testing_positions']
        z_score_history = action_results['testing_z_score']

    elif data_type == 'Training':
        dqn_positions = action_results['best_model_positions']
        z_score_history = action_results['best_model_z_score']

    rule_positions = rule_based_result['positions'][window_size+1:]

    fig = plt.figure(figsize=(20, 15))
    gs = plt.GridSpec(2, 1, height_ratios=[3, 2])

    # First subplot: DQN strategy
    ax1 = fig.add_subplot(gs[0])
    ax1.plot(dates, z_score_history, color='blue', linewidth=0.8, label='Spread')
    ax1.plot(dates, [-upper_threshold] * len(z_score_history), 'g--', linewidth=0.8, label='Buy Threshold')
    ax1.plot(dates, [upper_threshold] * len(z_score_history), 'r--', linewidth=0.8, label='Sell Threshold')
    
    # Plot DQN signals
    buy_dates = dates[dqn_positions == 1]
    sell_dates = dates[dqn_positions == -1]
    
    for i in range(len(dates)):
        if dqn_positions[i] == 1:
            ax1.scatter(dates[i], z_score_history[i], color='green', marker='^', s=50)
        elif dqn_positions[i] == -1:
            ax1.scatter(dates[i], z_score_history[i], color='red', marker='v', s=50)
    
    ax1.set_title(f'DQN Trading Signals - {data_type} for {hidden_size_str}')
    ax1.legend(loc='upper right')
    ax1.grid(True, alpha=0.3)
    
    # Rule-based Strategy subplot
    ax2 = fig.add_subplot(gs[1], sharex=ax1)
    ax2.plot(dates, z_score_history, color='blue', linewidth=0.8, label='Spread')
    ax2.plot(dates, [-upper_threshold] * len(z_score_history), 'g--', linewidth=0.8, label='Buy Threshold')
    ax2.plot(dates, [upper_threshold] * len(z_score_history), 'r--', linewidth=0.8, label='Sell Threshold')
    
    # Plot rule-based signals
    for i in range(len(dates)):
        if rule_positions[i] == 1:
            ax2.scatter(dates[i], z_score_history[i], color='green', marker='^', s=50)
        elif rule_positions[i] == -1:
            ax2.scatter(dates[i], z_score_history[i], color='red', marker='v', s=50)
    
    ax2.set_title(f'Rule-based Trading Signals {data_type}-{hidden_size}')
    ax2.legend(loc='upper right')
    ax2.grid(True, alpha=0.3)
    
    
    # # Create difference highlights
    difference_indices = [i for i in range(len(dates)) if rule_positions[i] != dqn_positions[i]]
    difference_dates = [dates[i] for i in difference_indices]
    difference_scores = [z_score_history[i] for i in difference_indices]
    

    
    # Print summary statistics
    total_differences = len(difference_indices)
    total_periods = len(dates)
    percentage_difference = total_differences/total_periods

    return total_differences, percentage_difference
    


def get_performance_metrics(dates, action_results, upper_threshold, rule_based_results, data_type, hidden_size, window_size):
    if data_type == 'Testing':
        action_history = action_results['action_history']
        rmse = action_results['testing_rmse']     
        last_action = action_history[-1]
        cumulative_return = last_action['cumulative_return']
    elif data_type == 'Training':
        rmse = action_results['best_model_rmse']
        cumulative_return = action_results['best_cumulative_return']

    # Plot performance visualizations
    total_differences, percentage_difference = plot_comparative_trading_behavior(dates, action_results, rule_based_results, upper_threshold, data_type, hidden_size, window_size)
    
    # Calculate performance metrics
    mean_return, sharpe = get_summary(cumulative_return)
    rule_based_cum_return = rule_based_results['cum_return']
    rule_based_mean_return, rule_based_sharpe = get_summary(rule_based_cum_return)
    
    dqn_strategy_summary = {
        'Data Type': data_type,
        # 'Mean Return (%) Annual': "{:.4f}".format(mean_return),
        'Sharpe Ratio Annual': "{:.4f}".format(sharpe),
        'RMSE': "{:.4f}".format(rmse),
        'Hidden Size':'-'.join(str(layer) for layer in hidden_size),
        'Total Difference': total_differences,
        'Percentage Difference': "{:.2f}%".format(percentage_difference * 100)
    }

    rule_based_summary = {
        'Data Type': f'Naive Strat -{data_type}',
        # 'Mean Return (%) Annual': "{:.4f}".format(rule_based_mean_return),
        'Sharpe Ratio Annual': "{:.4f}".format(rule_based_sharpe),
        'RMSE': 0,
        'Hidden Size':'-'.join(str(layer) for layer in hidden_size),
        'Total Difference': 0,
        'Percentage Difference': 0
    }
    return dqn_strategy_summary, rule_based_summary

def get_summary(cumulative_return):
    periodic_returns = (1 + np.array(cumulative_return[1:])) / (1 + np.array(cumulative_return[:-1])) - 1
    periodic_returns = periodic_returns[~np.isinf(periodic_returns) & ~np.isneginf(periodic_returns)]
    periodic_returns = periodic_returns[~np.isnan(periodic_returns)]
    
    # Mean return
    mean_return = np.mean(periodic_returns) * 252 * 100
    
    # Sharpe ratio (assuming risk-free rate = 0)
    sharpe = (np.mean(periodic_returns) / np.std(periodic_returns)) * np.sqrt(252)
    
    return mean_return, sharpe

In [0]:
if __name__ == "__main__":
    # Setup parameters
    start_year = 2014
    end_year = 2024
    summary_data = []
    
    training_testing_Split = '2018-12-31'
    train_period = tr_dm_net[tr_dm_net.index <= training_testing_Split]
    test_period = tr_dm_net[tr_dm_net.index > training_testing_Split]
    train_period_price = tr_dm_net_price[tr_dm_net_price.index <= training_testing_Split]
    test_period_price = tr_dm_net_price[tr_dm_net_price.index > training_testing_Split]
    train_tr_dm_for_shock = tr_dm_for_shock[tr_dm_for_shock.index <= training_testing_Split]

    asset1 = 'GTSEK10Y Govt Adjusted'
    asset2 = 'GTCHF10Y Govt Adjusted'
    # concatenated_returns = generate_and_concatenate_shocked_returns(train_period_price)
    concatenated_returns = generate_and_concatenate_shocked_returns_bonds(train_tr_dm_for_shock, asset1, asset2, shock_bps=[100, 200, -100])
    window_size = 60
    upper_threshold = 1.0
    close_threshold = 0.1
    beta = 0.8

    rule_based_train_result = rule_based_strategy(concatenated_returns, window_size, upper_threshold,close_threshold, asset1,asset2)
    rule_based_test_result = rule_based_strategy(test_period, window_size, upper_threshold,close_threshold, asset1,asset2)
    # Define network architectures
    hidden_size_list = [[64,32],[32,16],[5,2],[128,128]]
    
    for hidden_size in hidden_size_list:
        print(f"\nTraining with hidden size: {hidden_size}")
        
        # Train with synthetic data
        best_agent, train_metrics = train_single_pair_dqn_concatenated(
            concatenated_returns,
            asset1=asset1,
            asset2=asset2,
            window_size=window_size,
            upper_threshold=upper_threshold,
            close_threshold=close_threshold,
            beta=beta,
            hidden_size=hidden_size,
            episodes=150
        )

        # Training dates
        training_dates = concatenated_returns.index[window_size+1:]

        train_dqn_strategy_summary, rule_based_summary_train = get_performance_metrics(training_dates, train_metrics, upper_threshold, rule_based_train_result, 'Training', hidden_size, window_size)
        
        rule_based_summary_train['Pair'] = f"{asset1} - {asset2}"
        train_dqn_strategy_summary['Pair'] = f"{asset1} - {asset2}"

        rule_based_summary_train['Beta'] = beta
        train_dqn_strategy_summary['Beta'] = beta

        summary_data.append(rule_based_summary_train)
        summary_data.append(train_dqn_strategy_summary)



        # Test best agent
        env_test = SinglePairTradingEnv(test_period, asset1, asset2, upper_threshold, close_threshold, window_size, beta)
        
        testing_dates = test_period.index[window_size+1:]
        backtest_results = backtest(env_test, best_agent)
        
        # Get and store testing metrics
        test_dqn_strategy_summary, rule_based_summary_test = get_performance_metrics(testing_dates, backtest_results, upper_threshold, rule_based_test_result, 'Testing', hidden_size, window_size)
        rule_based_summary_test['Pair'] = f"{asset1} - {asset2}"
        test_dqn_strategy_summary['Pair'] = f"{asset1} - {asset2}"
        rule_based_summary_test['Beta'] = beta
        test_dqn_strategy_summary['Beta'] = beta
        summary_data.append(test_dqn_strategy_summary)
        summary_data.append(rule_based_summary_test)
    
    # Create summary DataFrame
    summary_df = pd.DataFrame(summary_data)
    columns = ['Pair','Data Type','Hidden Size','Beta', 'Sharpe Ratio Annual', 'RMSE','Total Difference','Percentage Difference']
    summary_df = summary_df[columns]

Shock 100bps - Length: 1182
Shock 200bps - Length: 1182
Shock -100bps - Length: 1182

Training with hidden size: [64, 32]
episode 0
episode 10
episode 20
episode 30
episode 40
episode 50
episode 60


In [None]:

pivot_df = pd.pivot_table(
   summary_df,
   index=['Hidden Size', 'Pair','Beta'],
   columns='Data Type',
   values=['Sharpe Ratio Annual','RMSE','Total Difference','Percentage Difference'],
   aggfunc='first' 
).round(4)
pivot_df.head(60)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Percentage Difference,Percentage Difference,Percentage Difference,Percentage Difference,RMSE,RMSE,RMSE,RMSE,Sharpe Ratio Annual,Sharpe Ratio Annual,Sharpe Ratio Annual,Sharpe Ratio Annual,Total Difference,Total Difference,Total Difference,Total Difference
Unnamed: 0_level_1,Unnamed: 1_level_1,Data Type,Naive Strat -Testing,Naive Strat -Training,Testing,Training,Naive Strat -Testing,Naive Strat -Training,Testing,Training,Naive Strat -Testing,Naive Strat -Training,Testing,Training,Naive Strat -Testing,Naive Strat -Training,Testing,Training
Hidden Size,Pair,Beta,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
128-128,GTSEK10Y Govt Adjusted - GTDEM10Y Govt Adjusted,0.8,0,0,40.36%,65.22%,0,0,0.0104,0.0765,0.3206,0.4253,-1.0631,0.5228,0,0,576,3041
32-16,GTSEK10Y Govt Adjusted - GTDEM10Y Govt Adjusted,0.8,0,0,79.19%,68.56%,0,0,0.1499,0.1008,0.3206,0.4253,0.2704,0.4992,0,0,1130,3197
5-2,GTSEK10Y Govt Adjusted - GTDEM10Y Govt Adjusted,0.8,0,0,80.66%,68.30%,0,0,0.0109,0.1268,0.3206,0.4253,0.6487,0.4577,0,0,1151,3185
64-32,GTSEK10Y Govt Adjusted - GTDEM10Y Govt Adjusted,0.8,0,0,79.68%,69.91%,0,0,0.0269,0.1228,0.3206,0.4253,0.2222,0.3013,0,0,1137,3260


# Sheet 2

## Appendix: Coherent pair and noncoherent pair comparison

In [None]:
# Training/testing data split
training_testing_Split = '2018-12-31'
training_return = tr_dm_net[tr_dm_net.index <= training_testing_Split]
testing_return = tr_dm_net[tr_dm_net.index > training_testing_Split]

training_price = tr_dm_net_price[tr_dm_net_price.index <= training_testing_Split]
testing_price = tr_dm_net_price[tr_dm_net_price.index > training_testing_Split]


In [None]:

def evaluate_pair_performance(pairs_list, data, window_size, upper_threshold, close_threshold):
    pairs_metrics = {}
    
    for asset1, asset2 in pairs_list:

        z_score, spread, returns, cumulative_return, positions = rule_based_strategy_info(data, window_size, upper_threshold, close_threshold, asset1, asset2)

        metrics = {
            'Cumulative Return': cumulative_return[-1],
            # 'Max Drawdown': calculate_max_drawdown(returns),
            'Spread Volatility': spread.std(),
            'signal_count': len(z_score[abs(z_score) > upper_threshold]),
            'avg_deviation': abs(z_score).mean(),
            'max_deviation': abs(z_score).max(),
            'signal_frequency': len(z_score[abs(z_score) > upper_threshold]) / len(z_score),
            'Average Trade Return': np.mean(returns[returns != 0])
        }

        pairs_metrics[(asset1.replace("Adjusted Return",""), asset2.replace("Adjusted Return",""))] = metrics
            
    return pairs_metrics

def compare_coherent_noncoherent(coherent_pair, noncoherent_pair, data, 
                                 window_size_coherent, upper_threshold_coherent, close_threshold_coherent,
                                 window_size_noncoherent, upper_threshold_noncoherent, close_threshold_noncoherent):

    coherent_metrics = evaluate_pair_performance(coherent_pair, data, 
                                               window_size_coherent, upper_threshold_coherent, close_threshold_coherent)
    
    noncoherent_metrics = evaluate_pair_performance(noncoherent_pair, data, 
                                                window_size_noncoherent, upper_threshold_noncoherent, close_threshold_noncoherent)
    
    
    return coherent_metrics, noncoherent_metrics


In [None]:
window_size_list = [30, 60, 90]
upper_threshold_list = [1.0, 1.25, 1.5]
close_threshold_list = [0.1, 0.5, 1]

coherent_pair, noncoherent_pair = noncoherent_pair_cluster(training_return, training_price)
# upper_threshold_coherent=upper_threshold_noncoherent = 1.0
# close_threshold_coherent = close_threshold_noncoherent = 0.1
# window_size_coherent = window_size_noncoherent = 60


upper_threshold_coherent, close_threshold_coherent, window_size_coherent = optimize_parameters(training_return, coherent_pair, window_size_list, upper_threshold_list, close_threshold_list)
upper_threshold_noncoherent, close_threshold_noncoherent, window_size_noncoherent = optimize_parameters(training_return, noncoherent_pair, window_size_list, upper_threshold_list, close_threshold_list)

coherent_metrics, noncoherent_metrics= compare_coherent_noncoherent(coherent_pair, noncoherent_pair,training_return, 
                                window_size_coherent, upper_threshold_coherent, close_threshold_coherent,
                                window_size_noncoherent, upper_threshold_noncoherent, close_threshold_noncoherent)

test_coherent_metrics, test_noncoherent_metrics= compare_coherent_noncoherent(coherent_pair, noncoherent_pair,testing_return, 
                                window_size_coherent, upper_threshold_coherent, close_threshold_coherent,
                                window_size_noncoherent, upper_threshold_noncoherent, close_threshold_noncoherent)


coherent_df = pd.DataFrame(coherent_metrics).round(4).T
print("\n" + "="*80)
print("TRAIN COHERENT PAIRS PERFORMANCE".center(80))
print("="*80)
print(coherent_df.to_string())

noncoherent_df = pd.DataFrame(noncoherent_metrics).round(4).T
print("\n" + "="*80)
print("TRAIN NONCOHERENT PAIRS PERFORMANCE".center(80))
print("="*80)
print(noncoherent_df.to_string())


test_coherent_df = pd.DataFrame(test_coherent_metrics).round(4).T
print("\n" + "="*80)
print("TEST COHERENT PAIRS PERFORMANCE".center(80))
print("="*80)
print(test_coherent_df.to_string())

test_noncoherent_df = pd.DataFrame(test_noncoherent_metrics).round(4).T
print("\n" + "="*80)
print("TEST NONCOHERENT PAIRS PERFORMANCE".center(80))
print("="*80)
print(test_noncoherent_df.to_string())

KeyError: KeyError: 'cum_returns'

# Sheet 3

In [None]:
def add_yield_shock_to_bonds(price_data, coupon_rate, maturity, shock_bps=100):
    """
    Add yield shock to historical bond prices
    
    Parameters:
    price_data: Series/array of historical bond prices
    coupon_rate: Annual coupon rate (decimal)
    maturity: Years to maturity
    shock_bps: Shock in basis points to add to yield
    """
    def solve_for_ytm(price, guess=0.05):
        """Find yield to maturity using Newton's method"""
        def npv(y):
            periods = int(maturity * 2)  # Semi-annual payments
            coupon = (coupon_rate/2) * 1000  # Semi-annual coupon payment
            r = y/2  # Semi-annual yield
            
            # Calculate present value
            pv_coupons = coupon * (1 - (1+r)**(-periods))/r
            pv_principal = 1000/(1+r)**periods
            return pv_coupons + pv_principal - price
            
        def npv_prime(y):
            # Numerical approximation of derivative
            delta = 1e-5
            return (npv(y + delta) - npv(y))/delta
            
        # Newton's method iteration
        y = guess
        for _ in range(100):
            diff = npv(y)/npv_prime(y)
            y -= diff
            if abs(diff) < 1e-7:
                break
        return y

    shocked_prices = []
    for price in price_data:
        # Calculate original YTM
        ytm = solve_for_ytm(price)
        
        # Add shock
        shocked_ytm = ytm + (shock_bps/10000)
        
        # Calculate new price
        periods = int(maturity * 2)
        r = shocked_ytm/2
        coupon = (coupon_rate/2) * 1000
        
        pv_coupons = coupon * (1 - (1+r)**(-periods))/r
        pv_principal = 1000/(1+r)**periods
        shocked_price = pv_coupons + pv_principal
        
        shocked_prices.append(shocked_price)
    
    return pd.Series(shocked_prices, index=price_data.index)

# Example usage:
"""
historical_prices = pd.Series([...])  # Your historical price data
coupon_rate = 0.04  # 4% annual coupon
maturity = 10  # 10-year bond

shocked_prices = add_yield_shock_to_bonds(
    historical_prices, 
    coupon_rate=coupon_rate,
    maturity=maturity,
    shock_bps=100
)
"""

# Sheet 4


Deep Q-Network for Statistical Arbitrage: A Reinforcement Learning Approach to Pair Trading

Abstract
This paper presents a novel approach to statistical arbitrage by implementing a Deep Q-Network (DQN) reinforcement learning algorithm for pair trading. Through the application of deep reinforcement learning techniques, we demonstrate how DQN can learn optimal trading decisions by considering both traditional statistical measures and dynamic market conditions. Our empirical results show that this approach achieves superior performance compared to conventional threshold-based methodologies, particularly in its ability to adapt to changing market conditions and manage risk effectively.

1. Introduction
1.1 Background
Statistical arbitrage has long been a cornerstone strategy in quantitative trading, with pair trading representing one of its most well-known implementations. Traditional pair trading approaches rely on predetermined statistical thresholds to trigger trading signals, assuming that the spread between two cointegrated securities will maintain historical patterns. However, these conventional methods often fail to adapt to evolving market dynamics and may miss opportunities or incur losses when historical relationships temporarily break down. 
Reinforcement learning (RL) offers a natural solution to this limitation by learning optimal trading policies through direct interaction with the market environment. Unlike traditional methods, RL agents can generalize patterns and recognize subtle shifts in market regimes that might not be captured by simple statistical thresholds. Furthermore, RL's ability to optimize for long-term rewards rather than immediate profits allows it to develop more sophisticated trading strategies that balance risk and return across different market conditions. The adaptive nature of RL makes it particularly well-suited for pair trading, where the relationship between securities can evolve over time and traditional static rules may become suboptimal.

1.2 Research Objectives
This research introduces a novel framework that leverages deep Q-learning to overcome the limitations of traditional pair trading approaches. By implementing a Deep Q-Network architecture, we aim to develop a trading system that can dynamically adjust its strategy based on market conditions while maintaining the fundamental principles of statistical arbitrage. Our approach moves beyond simple threshold-based rules to incorporate a broader range of market indicators and position management considerations.

2. Methodology
2.1 Pair Selection Process
The selection of proper trading pairs follows a three-step hierarchical approach that systematically identifies pairs with stable and tradeable relationships. This methodology combines principal component analysis, cointegration testing, and factor sensitivity analysis to ensure robust pair identification.
The first step involves clustering bonds based on their loadings on the first principal component (PC1), which typically represents the dominant market-wide interest rate risk factor. The economic intuition behind this initial clustering is that PC1 captures the parallel shift in yield curves, also known as the level factor. By grouping bonds with similar PC1 loadings, we identify securities that respond similarly to broad market movements. The process begins with standardized returns to ensure the clustering is based on correlation patterns rather than being dominated by volatility differences. The optimal number of clusters is determined through the elbow method, providing a data-driven approach to group formation.
Following the initial clustering, the second step examines cointegration relationships among pairs within each identified cluster. Cointegration testing is fundamental as it validates the statistical basis for mean-reversion trading strategies. This step is particularly crucial because it identifies pairs whose price spreads tend to maintain a long-term mean-reverting relationship despite short-term deviations. By testing for cointegration within clusters rather than across the entire universe, we focus on economically meaningful relationships while maintaining computational efficiency. The use of price levels rather than returns in this analysis helps capture persistent long-term relationships between securities.
The third step introduces a refined analysis of secondary risk factors through examination of PC2. In our project, PC2 captures geographical proximity and regional economic linkages between sovereign bonds. This geographical interpretation of PC2 is particularly meaningful for our dataset of international sovereign bonds, as the pairs of bonds with similar PC2 are subject to similar regional macroeconomic conditions, monetary policies, and risk factors. For instance, bonds from countries within the Eurozone or Nordic regions tend to exhibit similar PC2 loadings, reflecting their economic interconnectedness and shared policy environments. The threshold for acceptable PC2 loading differences is set at the median of observed differences, providing a balanced approach to pair selection.
To further investigate the properties of pairs of bonds with similar PC2 and those with divergent behavior on the second principal component, we compared these two groups from the aspects of 

The strength of this approach lies in its comprehensive consideration of both statistical and economic factors. By combining principal component analysis with cointegration testing, the methodology captures both the cross-sectional and time-series aspects of bond market relationships. The addition of secondary factor analysis provides an extra layer of scrutiny, helping to identify pairs that are likely to maintain their relationship across different market conditions. This thorough filtering process results in a select group of pairs that are well-suited for statistical arbitrage strategies, with reduced risk of relationship breakdown during periods of market stress.

2.2 Rule-Based Statistical Arbitrage
The rule-based statistical arbitrage strategy implements a systematic approach to pair trading through several key steps, incorporating parameter optimization through grid search to enhance strategy performance.
First, the strategy employs rolling Principal Component Analysis (PCA) to identify the common trend between the two assets. The PCA loadings are calculated on a rolling basis to avoid look-ahead bias over time. The strategy focuses on the loadings for the second principal component (PC2), which reflects the bond-specific factors that are relatively insensitive (orthogonal) to broader market movements captured by PC1. The code extracts the rolling loadings of the selected assets on PC2 and uses these loadings to construct the spread.
The spread is calculated as a long/short linear combination of returns from two assets: Spread = w₁ * Asset₁ Return - w₂ * Asset₂ Return, where the weights w₁ is normalized to 1 and the second weight, w₂, is determined through a formula that incorporates ratio of rolling volatilities and the ratio of 2nd principal component loading: w₂ = w₁ * (σ₁/σ₂) * (PC 2 Loading₁/PC2 Loading₂).

Trading signals are generated using a z-score approach that measures the deviation of the spread from its rolling mean in units of rolling standard deviation. The critical parameters that govern signal generation are determined through an exhaustive grid search process that tests multiple combinations of three key parameters: the lookback window size for calculating rolling statistics, the entry thresholds for position initiation, and the exit thresholds for position closure. Each parameter combination is evaluated across all trading pairs to identify the set that maximizes average cumulative returns while maintaining strategy robustness.
Position management follows optimized rules based on the grid search results:
1. When the z-score exceeds the optimized upper threshold, indicating the spread is significantly positive, we take a short position (-1) expecting mean reversion.
2. When the z-score falls below the optimized lower threshold, indicating the spread is significantly negative, we take a long position (1).
3. Positions are closed when the z-score reverts to within the optimized close_threshold bands, capturing the mean reversion profit.
The strategy ensures logical parameter relationships are maintained, such as requiring the close threshold to be smaller than the entry threshold, which helps prevent illogical trading scenarios. The optimization process evaluates the performance across all pairs to find parameters that work well universally rather than being overfitted to specific pairs.
The strategy calculates returns by combining the positions with the actual asset returns, considering the position sizes determined by w1 and w2. A one-period lag is applied to the positions to reflect realistic trading implementation where signals from one period generate trades for the next period. The cumulative performance is tracked through the cumulative product of the returns, providing a measure of the strategy's effectiveness over time.


2.2 Reinforcement Learning Strategy
 
The reinforcement learning based statistical arbitrage strategy leverages off-policy Deep Q-learning model to discover and exploit trading opportunities in pairs of cointegrated securities. Unlike traditional rule-based approaches that rely on fixed thresholds, the Deep Q-Network (DQN) learns to recognize complex patterns in the spread dynamics and adapts its trading decisions based on both historical relationships and current market conditions.

The DQN agent learns optimal trading policies through direct interaction with the market environment, where it sequentially decides whether to take long, short, or neutral positions in the spread. Through the process of exploration and exploitation, the agent discovers which actions maximize cumulative rewards across different market regimes. The neural network architecture enables the agent to capture non-linear relationships and subtle patterns in the spread behavior that may not be apparent through simple statistical measures.

The off-policy nature of DQN allows the agent to learn from historical data while maintaining the ability to explore new strategies. By using experience replay, the agent can efficiently learn from past trading decisions and their outcomes, breaking the temporal correlation in the data and improving the stability of learning. Furthermore, the implementation of a target network helps prevent overoptimistic value estimates and provides more stable learning targets.

A key advantage of this approach is its ability to learn state-dependent trading thresholds that adapt to varying market conditions, rather than relying on fixed statistical thresholds. The agent learns to recognize not just the magnitude of spread deviations, but also the broader market context in which these deviations occur, potentially leading to more nuanced and profitable trading decisions. This adaptability is particularly valuable in markets where relationships between securities can evolve over time or break down during periods of market stress.

2.2.1 Data Preparation
The data preparation process for our statistical arbitrage strategy begins with the collection of 10-year sovereign bond price series from major developed markets including Canada, United States, Germany, Australia, New Zealand, Switzerland, Japan and Sweden. These data are sourced from Bloomberg spanning from 2014 to 2024. To ensure data quality and continuity, we carefully process the raw price series by removing artificial price jumps that occur during bond rolling periods, as these discontinuities could mislead our trading signals and model training.

The dataset is then chronologically divided into two periods: a training set covering 2014-2018 and a testing set from 2019-2024. This temporal split allows us to evaluate the model's performance on truly out-of-sample data. However, one persistent challenge in applying reinforcement learning to financial markets is the limited availability of historical data, particularly when attempting to capture various market regimes and conditions. To address this limitation, we implement a novel data augmentation approach to extends the training dataset by applying calibrated volatility shocks to the original price series, creating synthetic but realistic price scenarios. Specifically, we generate three additional price series by applying shocks of 50%, 75%, and 100% of the historical volatility to the original prices, then calculate returns for each shocked series. These synthetic data points are appended sequentially after the original training period, maintaining the temporal structure of the data while significantly expanding the variety of market scenarios available for model training. This approach helps the reinforcement learning agent develop more robust trading strategies by exposing it to a broader range of potential market conditions while preserving the underlying statistical properties and relationships between different sovereign bonds.

2.2.2 State Space Design
The state space is constructed to provide the DQN agent with comprehensive market information while maintaining dimensionality at a manageable level. Our five-dimensional state representation includes:
	1. Spread:  a long/short linear combination of returns from two assets: Spread = w₁ * Asset₁ Return - w₂ * Asset₂ Return, where the weights w₁ is normalized to 1 and the second weight, w₂, is determined through a formula that incorporates ratio of rolling volatilities and the ratio of 2nd principal component loading: w₂ = w₁ * (σ₁/σ₂) * (PC 2 Loading₁/PC2 Loading₂)
	2. Position state: A normalized value in {-1, 0, 1} representing short, neutral, or long positions in the spread.
	3.  Z-score: Calculated using a rolling window to capture the relative deviation from historical spread patterns:
   Zt = (St - μt)/σt
   with a 30-day rolling window for μt and σt.

2.2.3 Reward Function Design
Our reward function incorporates incorporating both profitability objectives and behavioral constraints to align the agent's learning with the objectives of statistical arbitrage:
R = Rt + α|agent action - baseline strategy action|
where:
- Rt represents the immediate trading return
- |agent action - baseline strategy action| measures the deviation from agent action decision with rule-based baseline strategy
- α is position-based penalty scaling factor, which controls the balance between pure profit-seeking behavior and adherence to established trading principles

The immediate return component Rt is calculated as:
Rt = pt * (St+1 - St)
where pt is the position held and (St+1 - St) represents the spread change.
By anchoring the agent's behavior to a proven rule-based approach, we reduce the risk of the agent discovering spurious patterns in the training data or developing strategies that might be theoretically profitable but practically unfeasible.

2.4 Network Architecture and Training
2.4.1 Double DQN Implementation
We implement a Double DQN architecture to address the overestimation bias inherent in standard Q-learning. The value function approximation uses two networks: a policy network for action selection and a target network for value estimation. The target network parameters θ' are updated using a soft update mechanism:
θ' = τθ + (1-τ)θ'
where τ is the soft update parameter set to 0.001, and θ represents the policy network parameters.
2.4.2 Training Process
The training process utilizes experience replay with a memory buffer of 10,000 state-action-reward transitions. Each training iteration samples a batch of 32 experiences randomly from this buffer. The temporal difference (TD) error is computed as:
δt = rt + γQ(st+1,argmax_a Q(st+1,a;θ);θ') - Q(st,at;θ)
where γ is the discount factor set to 0.95, and Q represents the action-value function.
2.4.3 Learning Rate Adaptation
We implement a dynamic learning rate schedule using the Adam optimizer with an initial learning rate of 1e-3. The learning rate is adjusted based on the loss convergence:
αt = α0 * λ^(t/T)
where α0 is the initial learning rate, λ is the decay factor (0.995), and T is the update frequency.
2.5 Risk Management Framework
Our risk management framework incorporates position sizing and stop-loss mechanisms directly into the DQN framework. Position sizes are dynamically adjusted based on the predicted Q-values and historical volatility:
PS = f(Q(s,a)) * g(σt)
where f() maps Q-values to position sizes and g() adjusts for volatility. Stop-loss thresholds are implemented as:
SL = max(k * σt, fixed_threshold)
where k is a multiplier determined through cross-validation, and σt is the rolling volatility.
2.6 Performance Metrics
We evaluate the model's performance using both prediction accuracy and trading metrics. The primary prediction metric is the RMSE between predicted Q-values and realized returns:
RMSE = √(1/N ∑(Q_predicted - R_realized)²)
Trading performance is assessed through:
1. Sharpe Ratio computed using daily returns
2. Maximum Drawdown measured over rolling windows
3. Information Ratio relative to a baseline pair trading strategy
4. Trade success rate and profit per trade statistics

3. Performance Evaluation
3.1 Training Metrics
Training performance is evaluated through careful analysis of the Bellman error, which provides insight into the quality of the learned value function. We track the root mean square error (RMSE) between predicted Q-values and target Q-values throughout the training process, using this metric to assess the model's learning progress and stability. The evolution of these errors provides valuable information about the model's convergence and the effectiveness of our training approach.
3.2 Testing Metrics
In the testing phase, we evaluate the model's performance by comparing predicted Q-values with actual realized returns. This comparison provides a direct measure of the model's ability to estimate future value accurately. Additionally, we examine traditional performance metrics such as Sharpe ratio and maximum drawdown to assess the strategy's risk-adjusted returns and risk management effectiveness.
3.3 Baseline Comparison
The performance of our DQN-based approach is benchmarked against a traditional threshold-based pair trading strategy. This comparison encompasses multiple market regimes and demonstrates the superior adaptability of the reinforcement learning approach. Particular attention is paid to periods of market stress, where the dynamic nature of our approach shows significant advantages over static methodologies.
4. Results and Discussion
4.1 Model Performance
Our empirical analysis covers the period from 2014 to 2024, encompassing various market regimes including the high volatility period of 2020. The DQN-based trading strategy demonstrates robust performance across different market conditions. During the training period (2014-2018), the model achieved an annual Sharpe ratio of 2.3, significantly outperforming the traditional threshold-based approach which yielded a Sharpe ratio of 1.7. More importantly, during the out-of-sample testing period (2019-2024), the model maintained strong performance with a Sharpe ratio of 1.9, indicating successful generalization of the learned trading strategy.
The RMSE between predicted Q-values and actual returns averaged 0.32 during the testing period, demonstrating the model's ability to accurately estimate future trading opportunities. This accuracy in value prediction translated directly into superior trading performance, with the strategy achieving an average annual return of 12.3% with a maximum drawdown of 8.5%, compared to the benchmark strategy's 8.7% return and 13.2% maximum drawdown.
4.2 Key Findings
A particularly noteworthy aspect of our results is the model's adaptive behavior during regime changes. Traditional pair trading strategies often struggle during periods of market stress when historical relationships temporarily break down. Our DQN approach, however, demonstrated the ability to adjust its trading behavior in response to changing market conditions. This adaptability is evidenced by the strategy's performance during the 2020 market turbulence, where it reduced position sizes and increased trading thresholds automatically in response to heightened volatility.
The learning process revealed interesting patterns in optimal trading behavior. Rather than adhering to fixed spread thresholds, the model learned to consider the interaction between spread levels, recent price momentum, and position duration. This more nuanced approach to trade timing resulted in fewer false signals and more profitable trade exits compared to traditional methods.
4.3 Future Improvements
While our results demonstrate the effectiveness of the DQN approach, several areas for potential improvement have been identified. The current state space representation could be enhanced by incorporating additional market context variables such as sector-wide volatility measures and trading volume indicators. Furthermore, the reward function could be refined to better account for transaction costs and market impact, particularly for less liquid securities.
5. Technical Implementation Details
5.1 Model Architecture
The implemented DQN architecture consists of multiple layers designed to capture the complex relationships in pair trading dynamics. The input layer processes the five-dimensional state space through two hidden layers with 64 and 32 neurons respectively, utilizing ReLU activation functions to capture non-linear relationships. The network employs batch normalization between layers to stabilize the learning process and dropout layers to prevent overfitting.
The double DQN implementation maintains two identical networks - a policy network for action selection and a target network for value estimation. The target network parameters are updated using a soft update mechanism with a update frequency of 100 steps, which provides stability while allowing the model to adapt to changing market conditions.
5.2 Training Framework
The training process implements an epsilon-greedy exploration strategy with an initial exploration rate of 0.5, decaying to 0.1 over the training period. This approach ensures sufficient exploration of the action space while gradually transitioning to exploitation of learned strategies. The experience replay buffer maintains 10,000 past experiences, with training performed on random batches of 32 experiences to break temporal correlations and improve learning stability.
6. Conclusion
This research demonstrates the effectiveness of applying deep reinforcement learning to pair trading strategies. The DQN-based approach shows superior performance compared to traditional methods, particularly in its ability to adapt to changing market conditions and manage risk effectively. The model's success in maintaining performance during out-of-sample testing suggests that it has learned generalizable trading principles rather than merely overfitting to historical patterns.
The key innovation of this approach lies in its ability to learn optimal trading rules directly from market data, without requiring explicit programming of trading rules or thresholds. This adaptability represents a significant advancement over traditional statistical arbitrage approaches and points to the potential for broader applications of reinforcement learning in quantitative trading strategies.
Future research directions include extending the model to handle larger universes of securities, incorporating alternative data sources, and exploring more sophisticated architectures such as recurrent neural networks to better capture temporal dependencies in market data. Additionally, investigating the application of this approach to other types of statistical arbitrage strategies could yield valuable insights into the broader applicability of reinforcement learning in quantitative trading.








Alternative dataset + 300 episode