## Import Packages

In [1]:
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

import argparse
import datetime
import math

import pandas as pd
import numpy as np

from matplotlib import pyplot as plt

# The above could be sent to an independent module
import backtrader as bt
import backtrader.feeds as btfeeds
import backtrader.indicators as btind

import statsmodels.tsa.stattools as smts
import statsmodels.api as sm

import datetime as dt

import glob
import os

## Read data

In [2]:
nyse_csv_paths = []
nasdaq_csv_paths = []

# get file names of all csv files with nyse stock prices 
nyse_csv_paths = sorted(glob.glob("../ib-data/nyse-daily-tech/*.csv"))
# nasdaq_csv_paths = sorted(glob.glob("../ib-data/nyse/*.csv"))
csv_paths = nyse_csv_paths + nasdaq_csv_paths

N_STOCKS = len(nyse_csv_paths)

In [3]:
data = {}
MIN_SIZE = 252*4
N_STOCKS_TAKEN = 0

for path in csv_paths:
    # get the file names
    filename = os.path.basename(path)
    filename_without_ext = os.path.splitext(filename)[0]
    
    # read the csv file as dataframe
    df = pd.read_csv(path)
    df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d %H:%M:%S')
    
    # if price history is long enough, we take it
    if len(df) >= MIN_SIZE:
        data[filename_without_ext] = df 
        N_STOCKS_TAKEN += 1

In [4]:
print("N_STOCKS = " + str(N_STOCKS))
print("N_STOCKS_TAKEN = " + str(N_STOCKS_TAKEN))

N_STOCKS = 170
N_STOCKS_TAKEN = 116


## Ensure all prices have same start date

In [5]:
# get max starting date
MAX_DATE = pd.Timestamp.min

for key in data.keys():
    # reset index 
    data[key] = data[key].reset_index(drop=True)
    
    # max
    MAX_DATE = max(MAX_DATE, data[key]['date'][0])
    
# take subset of all dataframes
for key in data.keys():
    data[key] = data[key][data[key]['date'] >= MAX_DATE]
    
    # reset index 
    data[key] = data[key].reset_index(drop=True)

## Aggregate prices

In [6]:
# aggregate all close prices
close_price_df = pd.DataFrame()

for key in data.keys():
    close_price_df[key] = data[key]['close']

### Verify

In [7]:
# check if there are null values
close_price_df.isnull().values.any()

False

In [8]:
# close_price_df.columns[close_price_df.isnull().any()]

In [9]:
# close_price_df = close_price_df.dropna(axis='columns')

In [10]:
# data.pop('ARCH', None)
# data.pop('NOA', None)
# data.pop('OBE', None)
# data.pop('WLL', None)

## Get K best pairs

Assuming we take the top 1% pairs ... 

In [11]:
from pair_selector import *

In [12]:
N = len(data.keys())
print(int(N*(N-1)/2), 'pairs')

K = int(0.01 * N * (N-1) / 2)
print(K)

6670 pairs
66


In [13]:
TRAIN_PERIOD = 60

good_pairs = select_pairs_for_all_combin(train_df=close_price_df[0:TRAIN_PERIOD], test_df=None,
        config={'n': K, 'score_function': distance_score, 'series_transform': distance_transform}, plot=False)

good_pairs

{'n': 66, 'score_function': <function distance_score at 0x00000233AFE0D488>, 'series_transform': <function distance_transform at 0x00000233AFE0D2F0>}
True


[('FDS', 'NOW'),
 ('AER', 'MAN'),
 ('AER', 'NSP'),
 ('AER', 'FDS'),
 ('NSP', 'STM'),
 ('AER', 'ELLI'),
 ('AL', 'ASGN'),
 ('AL', 'FDS'),
 ('AER', 'ASGN'),
 ('AL', 'NOW'),
 ('AER', 'AL'),
 ('ASGN', 'FDS'),
 ('HUBS', 'LXFT'),
 ('ELLI', 'LXFT'),
 ('ELLI', 'NSP'),
 ('ELLI', 'PANW'),
 ('ASGN', 'WIT'),
 ('LXFT', 'SQNS'),
 ('ASGN', 'STM'),
 ('EPAM', 'LXFT'),
 ('ELLI', 'FDS'),
 ('ELLI', 'STM'),
 ('AER', 'LXFT'),
 ('AER', 'IPG'),
 ('FDS', 'PAYC'),
 ('AER', 'RHI'),
 ('FDS', 'RHI'),
 ('AER', 'STM'),
 ('STM', 'WIT'),
 ('ASGN', 'ELLI'),
 ('AL', 'MAN'),
 ('FDS', 'GWRE'),
 ('AL', 'RHI'),
 ('HUBS', 'SQNS'),
 ('LXFT', 'MAN'),
 ('AL', 'WIT'),
 ('ASGN', 'NSP'),
 ('JNPR', 'NOW'),
 ('HUBS', 'IPG'),
 ('ELLI', 'HUBS'),
 ('ELLI', 'IPG'),
 ('ASGN', 'RHI'),
 ('LXFT', 'PANW'),
 ('AMN', 'AYR'),
 ('GLOB', 'KAI'),
 ('FDS', 'IPG'),
 ('LXFT', 'TWTR'),
 ('FDS', 'MAN'),
 ('AER', 'TWTR'),
 ('GWRE', 'RHI'),
 ('IPG', 'LXFT'),
 ('FDS', 'WIT'),
 ('ASGN', 'MAN'),
 ('ELLI', 'MODN'),
 ('NSP', 'WIT'),
 ('LXFT', 'MODN'),
 ('EPAM'

## Additional Analyzer

In [14]:
class ReturnStd(bt.Analyzer):
    params = dict(
        period=10,
    )

    def __init__(self):
        super(ReturnStd, self).__init__()
        self.ls = []

    def start(self):
        pass

    def next(self):
        if min(len(self.strategy.data0), len(self.strategy.data1)) >= self.p.period:
            self.ls.append(self.strategy.broker.getvalue())

    def stop(self):
        returns = pd.DataFrame(self.ls).diff()[1:]
        self.std = returns.std()

    def get_std(self):
        return self.std.values[0]

## Strategy

In [15]:
class SinglePair(bt.Strategy):
    params = dict (
        period=84,
        enter_threshold_size = 2,
        exit_threshold_size = 0.5,
        loss_limit = -0.015,
        print_bar = True,
        print_msg = False,
        print_transaction = False,
    )

    def __init__(self):
        self.orderid = None
        
        # Strategy params
        self.period = self.p.period
        self.enter_threshold_size = self.p.enter_threshold_size
        self.exit_threshold_size = self.p.exit_threshold_size
        self.exposure = 200000
        
        # Parameters for printing
        self.print_bar = self.p.print_bar
        self.print_msg = self.p.print_msg
        self.print_transaction = self.p.print_transaction
            
        # signals
        self.zscore = None
        self.adf_pvalue = None
        self.intercept = None
        self.slope = None
        self.resid_mean = None
        self.resid_std = None
        self.spread = None
        
        # temporary variables to keep track of trades
        self.status = 0
        self.qty0 = 0
        self.qty1 = 0
        self.initial_price_data0 = None
        self.initial_price_data1 = None
        self.initial_cash = None
        self.initial_long_pv = None
        self.initial_short_pv = None
        self.upper_limit = None
        self.lower_limit = None
        self.up_medium = None
        self.low_medium = None
        
    def log(self, txt, dt=None):        
        dt = dt or self.data.datetime[0]
        dt = bt.num2date(dt)
        print('%s, %s' % (dt.isoformat(), txt))

    def notify_order(self, order):
        if order.status in [bt.Order.Submitted, bt.Order.Accepted]:
            return  # Await further notifications

        if order.status == order.Completed:
            if order.isbuy():
                buytxt = 'BUY COMPLETE, %.2f' % order.executed.price
                
                if self.print_transaction:
                    self.log(buytxt, order.executed.dt)
                    
                self.incur_commission(order.executed.price, order.executed.size)
            else:
                selltxt = 'SELL COMPLETE, %.2f' % order.executed.price
                
                if self.print_transaction:
                    self.log(selltxt, order.executed.dt)
                    
                self.incur_commission(order.executed.price, order.executed.size)

        elif order.status in [order.Expired, order.Canceled, order.Margin]:
            if self.print_transaction:
                self.log('%s ,' % order.Status[order.status])

        # Allow new orders
        self.orderid = None

    def next(self):
        if min(len(self.data0), len(self.data1)) < self.period:
            return
        
        if self.orderid:
            return  # if an order is active, no new orders are allowed
        
        ##################################################################################################
        # COMPUTE UPPER AND LOWER LIMITS                                                                 #
        ##################################################################################################
        Y = pd.Series(self.data0.get(size=self.period)[0:-1])
        X = pd.Series(self.data1.get(size=self.period)[0:-1])
        
        self.spread = (self.data0[0] - self.data1[0])
        
        if self.status == 0:
            self.spread_mean = (Y - X).mean()
            self.spread_std = (Y - X).std()

            self.upper_limit = self.spread_mean + self.enter_threshold_size * self.spread_std
            self.lower_limit = self.spread_mean - self.enter_threshold_size * self.spread_std
            self.up_medium = self.spread_mean + self.exit_threshold_size * self.spread_std
            self.low_medium = self.spread_mean - self.exit_threshold_size * self.spread_std
    
        ##################################################################################################
        # STRATEGY LOGIC                                                                                 #
        ##################################################################################################
        if self.status == 0:
            # "NO position" status
            
            if self.spread > self.upper_limit:
                self.short_spread()
            elif self.spread < self.lower_limit:
                self.long_spread()
     
        elif self.status == 1:
            # "SHORT the spread" status
            # short data0, long data1
            
            if self.spread < self.lower_limit:
                self.long_spread()
                
            elif self.spread < self.up_medium:
                self.exit_spread()
            
            else:
                long_pv = self.long_portfolio_value(self.data1.close, self.qty1)
                short_pv = self.short_portfolio_value(self.initial_price_data0, self.data0.close, self.qty0)
                net_gain_long = long_pv - self.initial_long_pv
                net_gain_short = short_pv - self.initial_short_pv

                return_of_current_trade = (net_gain_long + net_gain_short) / self.initial_cash

                # if losing too much money, exit
                if return_of_current_trade < self.p.loss_limit or short_pv <= 0:
                    self.exit_spread()
        
        elif self.status == 2:
            # "LONG the spread" status
            # short data1, long data0
            
            if self.spread > self.upper_limit:
                self.short_spread()
                
            elif self.spread > self.low_medium:
                self.exit_spread()
            
            else:
                long_pv = self.long_portfolio_value(self.data0.close, self.qty0)
                short_pv = self.short_portfolio_value(self.initial_price_data1, self.data0.close, self.qty1)
                net_gain_long = long_pv - self.initial_long_pv
                net_gain_short = short_pv - self.initial_short_pv

                return_of_current_trade = (net_gain_long + net_gain_short) / self.initial_cash

                # if losing too much money, exit
                if return_of_current_trade < self.p.loss_limit or short_pv <= 0:
                    self.exit_spread()
    
    def long_portfolio_value(self, price, qty):
        return price * qty
        
    def short_portfolio_value(self, price_initial, price_final, qty):
        return qty * (1.5 * price_initial - price_final)
    
    def short_spread(self):
        x = int((2 * self.broker.getvalue() / 3.0) / (self.data0.close))  
        y = int((2 * self.broker.getvalue() / 3.0) / (self.data1.close))  

        # Placing the order
        self.sell(data=self.data0, size=(x + self.qty0))  # Place an order for buying y + qty2 shares
        self.buy(data=self.data1, size=(y + self.qty1))  # Place an order for selling x + qty1 shares

        # Updating the counters with new value
        self.qty0 = x  
        self.qty1 = y  
        
        # update flags
        self.status = 1
        
        # keep track of trade variables
        self.initial_cash = self.qty1 * self.data1.close + 0.5 * self.qty0 * self.data0.close
        self.initial_long_pv = self.long_portfolio_value(self.qty1, self.data1.close)
        self.initial_short_pv = 0.5 * self.data0.close * self.qty0
        self.initial_price_data0, self.initial_price_data1 = self.data0.close, self.data1.close
    
    def long_spread(self):
        # Calculating the number of shares for each stock
        x = int((2 * self.broker.getvalue() / 3.0) / (self.data0.close)) 
        y = int((2 * self.broker.getvalue() / 3.0) / (self.data1.close)) 
    

        # Place the order
        self.buy(data=self.data0, size=(x + self.qty0))  # Place an order for buying x + qty1 shares
        self.sell(data=self.data1, size=(y + self.qty1))  # Place an order for selling y + qty2 shares

        # Updating the counters with new value
        self.qty0 = x 
        self.qty1 = y 
        
        # update flags
        self.status = 2  
        
        # keep track of trade variables
        self.initial_cash = self.qty0 * self.data0.close + 0.5 * self.qty1 * self.data1.close
        self.initial_long_pv = self.long_portfolio_value(self.qty0, self.data0.close)
        self.initial_short_pv = 0.5 * self.data1.close * self.qty1
        self.initial_price_data0, self.initial_price_data1 = self.data0.close, self.data1.close
    
    def exit_spread(self):
        # Exit position
        self.close(self.data0)
        self.close(self.data1)
        
        # update counters
        self.qty0 = 0
        self.qty1 = 0
        
        # update flags
        self.status = 0
        self.initial_cash = None
        self.initial_long_pv, self.initial_short_pv = None, None
        self.initial_price_data0, self.initial_price_data1 = None, None
        
    def incur_commission(self, price, qty):
        qty = abs(qty)
        commission = min(max(1, 0.005*qty), 0.01*price*qty)
        self.broker.add_cash(-1*commission)
    
    def stop(self):
        if self.print_bar:
            print("-", end="")
        
        if self.print_msg:
            print('==================================================')
            print('Starting Value: %.2f' % self.broker.startingcash)
            print('Ending   Value: %.2f' % self.broker.getvalue())
            print('Number of timeouts: %.2f' % self.n_timeouts)
            print('==================================================')

## Datafeed for pandas

In [16]:
class PandasData(bt.feed.DataBase):
    '''
    The ``dataname`` parameter inherited from ``feed.DataBase`` is the pandas
    DataFrame
    '''

    params = (
        ('datetime', 0),
        ('open', -1),
        ('high', -1),
        ('low', -1),
        ('close', -1),
        ('volume', -1),
        ('openinterest', -1),
    )

## Bulk test

In [17]:
import uuid
import itertools

In [18]:
# global param
TEST_PERIOD = 252*2

# strategy params
lookback_values = [10, 15, 20, 25, 30]
enter_threshold_size = [2, 2.5, 3]
exit_threshold_size = [0.5, 0.75, 1.0]
loss_limit = [-0.005, -0.01, -0.02]

# combinations of parameters
param_combinations = list(itertools.product(lookback_values, enter_threshold_size, exit_threshold_size, loss_limit))

# list to store MACRO results
macro_results = []

for i, params in enumerate(param_combinations, 1):
    # set params
    print("Backtesting all pairs using parameters " + str(params))
    # list to store MICRO results
    results = []

    for pair in good_pairs:
        # get names of both stock
        stk0, stk1 = pair

        # get data of both stock
        stk0_df, stk1_df = data[stk0], data[stk1]
        stk0_df_test, stk1_df_test = stk0_df[TRAIN_PERIOD:TRAIN_PERIOD + TEST_PERIOD], stk1_df[TRAIN_PERIOD:TRAIN_PERIOD + TEST_PERIOD]

        # Create a cerebro
        cerebro = bt.Cerebro()

        # Create data feeds
        data0 = bt.feeds.PandasData(dataname=stk0_df_test, timeframe=(bt.TimeFrame.Days), datetime=0)
        data1 = bt.feeds.PandasData(dataname=stk1_df_test, timeframe=(bt.TimeFrame.Days), datetime=0)

        # add data feeds to cerebro
        cerebro.adddata(data0)
        cerebro.adddata(data1)

        # Add the strategy
        cerebro.addstrategy(SinglePair, 
                            period=params[0], 
                            enter_threshold_size=params[1], 
                            exit_threshold_size=params[2], 
                            loss_limit=params[3])

        # Add analyzers
        cerebro.addanalyzer(bt.analyzers.SharpeRatio, _name='mysharpe')
        cerebro.addanalyzer(ReturnStd, period=params[0], _name='returnStd')

        # Add the commission - only stocks like a for each operation
        cerebro.broker.setcash(1000000)

        # And run it
        strat = cerebro.run()

        # get MICRO metrics
        sharperatio = strat[0].analyzers.mysharpe.get_analysis()['sharperatio']
        returnstd = strat[0].analyzers.returnStd.get_std()
        startcash = cerebro.getbroker().startingcash
        endcash = cerebro.getbroker().getvalue()
        profit = (endcash - startcash) / startcash
        
        results.append((stk0 + "-" + stk1, sharperatio, profit, returnstd))
    
    # convert to dataframe
    results_df = pd.DataFrame(results)
    results_df.columns = ['pair', 'sharpe_ratio', 'overall_return', 'returns_std']
    
    # save as csv
    uuid_str = str(uuid.uuid4())
    path = "../backtest-results/distance/nyse-tech-daily/" + str(uuid_str) + ".csv" 
    results_df.to_csv(path_or_buf=path, index=False)
    
    # calculate MACRO attributes
    avg_sharpe_ratio = results_df['sharpe_ratio'].mean()
    avg_overall_return = results_df['overall_return'].mean()
    overall_return_std = results_df['overall_return'].std()
    
    macro_results.append((params[0], 
                          params[1], 
                          params[2], 
                          params[3],
                          avg_sharpe_ratio, 
                          avg_overall_return,
                          overall_return_std,
                          uuid_str
                         ))
    
    # nextline
    print("")
    print("Completed " + str(i) + "/" + str(len(param_combinations)) + " sets of parameters.")

macro_results_df = pd.DataFrame(macro_results)
macro_results_df.columns = ['lookback', 
                            'enter_threshold_size', 
                            'exit_threshold_size',
                            'loss_limit', 
                            'avg_sharpe_ratio', 
                            'avg_overall_return', 
                            'overall_return_std',
                            'uuid']
macro_results_df.to_csv('../backtest-results/distance/nyse-tech-daily/summary.csv', index=False)

Backtesting all pairs using parameters (10, 2, 0.5, -0.005)
------------------------------------------------------------------
Completed 1/135 sets of parameters.
Backtesting all pairs using parameters (10, 2, 0.5, -0.01)
------------------------------------------------------------------
Completed 2/135 sets of parameters.
Backtesting all pairs using parameters (10, 2, 0.5, -0.02)
------------------------------------------------------------------
Completed 3/135 sets of parameters.
Backtesting all pairs using parameters (10, 2, 0.75, -0.005)
------------------------------------------------------------------
Completed 4/135 sets of parameters.
Backtesting all pairs using parameters (10, 2, 0.75, -0.01)
------------------------------------------------------------------
Completed 5/135 sets of parameters.
Backtesting all pairs using parameters (10, 2, 0.75, -0.02)
------------------------------------------------------------------
Completed 6/135 sets of parameters.
Backtesting all pairs u

------------------------------------------------------------------
Completed 51/135 sets of parameters.
Backtesting all pairs using parameters (15, 3, 1.0, -0.005)
------------------------------------------------------------------
Completed 52/135 sets of parameters.
Backtesting all pairs using parameters (15, 3, 1.0, -0.01)
------------------------------------------------------------------
Completed 53/135 sets of parameters.
Backtesting all pairs using parameters (15, 3, 1.0, -0.02)
------------------------------------------------------------------
Completed 54/135 sets of parameters.
Backtesting all pairs using parameters (20, 2, 0.5, -0.005)
------------------------------------------------------------------
Completed 55/135 sets of parameters.
Backtesting all pairs using parameters (20, 2, 0.5, -0.01)
------------------------------------------------------------------
Completed 56/135 sets of parameters.
Backtesting all pairs using parameters (20, 2, 0.5, -0.02)
--------------------

------------------------------------------------------------------
Completed 101/135 sets of parameters.
Backtesting all pairs using parameters (25, 3, 0.5, -0.02)
------------------------------------------------------------------
Completed 102/135 sets of parameters.
Backtesting all pairs using parameters (25, 3, 0.75, -0.005)
------------------------------------------------------------------
Completed 103/135 sets of parameters.
Backtesting all pairs using parameters (25, 3, 0.75, -0.01)
------------------------------------------------------------------
Completed 104/135 sets of parameters.
Backtesting all pairs using parameters (25, 3, 0.75, -0.02)
------------------------------------------------------------------
Completed 105/135 sets of parameters.
Backtesting all pairs using parameters (25, 3, 1.0, -0.005)
------------------------------------------------------------------
Completed 106/135 sets of parameters.
Backtesting all pairs using parameters (25, 3, 1.0, -0.01)
-----------

In [19]:
macro_results_df

Unnamed: 0,lookback,enter_threshold_size,exit_threshold_size,loss_limit,avg_sharpe_ratio,avg_overall_return,overall_return_std,uuid
0,10,2.0,0.50,-0.005,-0.086510,0.147709,0.454268,cc2ae13f-f97c-45ba-a470-aff62595022a
1,10,2.0,0.50,-0.010,0.072428,0.144182,0.458315,48a6573a-c257-4dec-af78-e2acf093cc60
2,10,2.0,0.50,-0.020,-0.043103,0.106423,0.576931,57f987ea-f457-41a9-a149-b5331c10807c
3,10,2.0,0.75,-0.005,0.054286,0.137921,0.441905,e559859e-c843-43cf-b8cf-a71c374d9b36
4,10,2.0,0.75,-0.010,0.116371,0.133411,0.434172,6f37581f-cc90-4381-affd-74fcaa816ff0
5,10,2.0,0.75,-0.020,-0.037396,0.083439,0.494682,d5dd662f-77cd-43e6-8488-309cde598974
6,10,2.0,1.00,-0.005,-0.005426,0.173065,0.541303,fb34b743-7f83-4322-93b1-776d188a9618
7,10,2.0,1.00,-0.010,0.104021,0.164405,0.513355,2150af37-6537-4e73-96fa-634c90ddea9d
8,10,2.0,1.00,-0.020,-0.069713,0.132370,0.522934,4478b707-4100-4dbb-8888-f9429b4d6854
9,10,2.5,0.50,-0.005,0.089673,0.188130,0.525188,ce682f64-e510-4ed9-a202-52a39216b6ff


In [20]:
macro_results_df[macro_results_df['avg_overall_return'] == max(macro_results_df['avg_overall_return'])]

Unnamed: 0,lookback,enter_threshold_size,exit_threshold_size,loss_limit,avg_sharpe_ratio,avg_overall_return,overall_return_std,uuid
82,25,2.0,0.5,-0.01,0.66828,0.249257,0.453489,111434e3-d064-4989-ab85-d759b026671c
