In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from statsmodels.tsa.stattools import adfuller

ImportError: cannot import name 'datetools'

In [None]:
# Load one DataFrame per currency from the CSV files under ../data.
AUD, EUR, GBP, NZD = (
    pd.read_csv("../data/{}.csv".format(code))
    for code in ("AUD", "EUR", "GBP", "NZD")
)

In [None]:
# Preview only the first rows instead of dumping the whole table into the output.
NZD.head()

In [None]:
# Join the four tables on their shared "Date" column, one merge at a time,
# then give the resulting columns short currency names.
df = AUD
for other in (EUR, GBP, NZD):
    df = df.merge(other, on='Date')
df.columns = ["Date", "AUD", "EUR", "GBP", "NZD"]

In [None]:
# Keep only the four price columns (drops "Date").
prices_df = df.loc[:, ["AUD", "EUR", "GBP", "NZD"]]

# Pairs selection & Cointegration test

In [None]:
def find_cointegrated_pairs(dataframe, significance=0.05):
    """Run a cointegration test on every pair of columns of a price table.

    Each unordered pair of columns (i < j) is passed to
    ``sm.tsa.stattools.coint`` and the p-value (element 1 of its result)
    is recorded.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        One price series per column.
    significance : float, optional
        A pair is reported when its p-value is strictly below this level.
        Defaults to 0.05, the threshold that used to be hard-coded.

    Returns
    -------
    pvalue_matrix : numpy.ndarray
        ``(n, n)`` array, ``n`` being the number of columns. Entry ``[i, j]``
        with ``i < j`` holds the p-value of that pair; the diagonal and the
        lower triangle keep their initial value of 1.
    pairs : list of tuple
        ``(name_i, name_j, pvalue)`` for every pair below ``significance``.
    """
    # Number of columns (series), not the number of rows.
    n = dataframe.shape[1]
    # Start from all ones so untested cells are easy to recognise later.
    pvalue_matrix = np.ones((n, n))
    # Column labels, used to look up each series and to name the pairs.
    keys = dataframe.keys()
    pairs = []
    for i in range(n):
        # Only j > i: each unordered pair is tested once.
        for j in range(i + 1, n):
            stock1 = dataframe[keys[i]]
            stock2 = dataframe[keys[j]]
            result = sm.tsa.stattools.coint(stock1, stock2)
            pvalue = result[1]
            pvalue_matrix[i, j] = pvalue
            if pvalue < significance:
                pairs.append((keys[i], keys[j], pvalue))
    return pvalue_matrix, pairs

In [None]:
# Test every pair of columns in prices_df: `pvalues` is the p-value matrix,
# `pairs` lists the pairs whose p-value fell below 0.05.
pvalues, pairs = find_cointegrated_pairs(prices_df)

In [None]:
# Take the tick labels from prices_df itself so they always match the row and
# column order of the p-value matrix computed from that same frame.
stock_list = list(prices_df.columns)

In [None]:
# Plot 1 - p so that low p-values give high scores; cells whose p-value is
# exactly 1 (the matrix's initial fill value) are masked out.
heatmap_options = dict(
    xticklabels=stock_list,
    yticklabels=stock_list,
    cmap='RdYlGn_r',
    mask=(pvalues == 1),
)
sns.heatmap(1 - pvalues, **heatmap_options)
plt.show()

In [None]:
# The pair studied in the rest of the notebook.
stock_df1 = prices_df["AUD"]
stock_df2 = prices_df["EUR"]

# Both price series on one set of axes.
fig, ax = plt.subplots()
for series in (stock_df1, stock_df2):
    ax.plot(series)
ax.set_xlabel("Time")
ax.set_ylabel("Price")
ax.legend(["AUD", "EUR"], loc='best')
plt.show()

In [None]:
# Least-squares fit of EUR on AUD:  EUR ~= gamma * AUD + u
X = np.array(stock_df1).reshape(-1,1)
y = np.array(stock_df2)
reg = LinearRegression().fit(X, y)
# coef_ is an array with one entry per feature; with a single regressor take
# that entry so gamma is a plain scalar wherever it is reused as the slope.
gamma = float(reg.coef_[0])
u = reg.intercept_
print("r square:", reg.score(X, y))
print("gamma:{}".format(gamma))
print("mu:{}".format(u))

In [None]:
# Second regression method: statsmodels OLS of EUR (y) on AUD (x).
x, y = stock_df1, stock_df2
# add_constant supplies the intercept column that OLS does not add by itself.
X = sm.add_constant(x)
result = sm.OLS(endog=y, exog=X).fit()
print(result.summary())

In [None]:
# Scatter of the raw observations with the fitted OLS line on top.
fig, ax = plt.subplots(figsize=(8,6))
ax.plot(x, y, 'o', label="data")
ax.plot(x, result.fittedvalues, 'r', label="OLS")
# Label the axes so the figure can be read on its own: x is the AUD series
# and y the EUR series used in the regression.
ax.set_xlabel("AUD")
ax.set_ylabel("EUR")
ax.set_title("OLS fit of EUR on AUD")
ax.legend(loc='best')
plt.show()

In [None]:
# Use the fitted slope rather than a hand-copied constant (previously 0.8864,
# presumably the rounded regression slope) so the plot follows the data.
# np.ravel(...)[0] accepts gamma as a scalar or as a 1-element array.
hedge_ratio = float(np.ravel(gamma)[0])
stationary_series = hedge_ratio*stock_df1 - stock_df2
plt.plot(stationary_series);
plt.axhline(stationary_series.mean(), color="red", linestyle="--")
plt.xlabel("Time"); plt.ylabel("Stationary Series")
plt.legend(["Stationary Series", "Mean"])
plt.show()

# Tests for the stationarity of $\hat{\epsilon}_t$ (Augmented Dickey-Fuller test)

In [None]:
# Regression residual: e_t = EUR_t - (gamma * AUD_t + u).
# The intercept must be subtracted together with the slope term; the previous
# expression "- AUD*gamma + u" added it back instead.
# np.ravel(...)[0] accepts gamma as a scalar or as a 1-element array.
e = prices_df["EUR"] - (prices_df["AUD"]*float(np.ravel(gamma)[0]) + u)

In [None]:
# Augmented Dickey-Fuller test on the residual series e.
result = adfuller(e)
adf_statistic, adf_pvalue, critical_values = result[0], result[1], result[4]
print('ADF Statistic: %f' % adf_statistic)
print('p-value: %f' % adf_pvalue)
print('Critical Values:')
for level in critical_values:
    print('\t%s: %.3f' % (level, critical_values[level]))

# Trading strategy design - baseline

In [None]:
def zscore(series):
    """Standardise a series: subtract its mean and divide by ``np.std`` of it."""
    centred = series - series.mean()
    scale = np.std(series)
    return centred / scale

In [None]:
# Spread EUR - gamma*AUD, built without the intercept u; zscore() subtracts
# the series mean when this is standardised.
z = prices_df["EUR"] - prices_df["AUD"]*gamma

### z-score method

In [None]:
# Standardise once and reuse the result for the curve and its mean line.
z_zscore = zscore(z)
plt.plot(z_zscore)
plt.axhline(z_zscore.mean(), color="black")
# Dashed guides at the +1 / -1 thresholds.
for level, colour in ((1.0, "red"), (-1.0, "green")):
    plt.axhline(level, color=colour, linestyle="--")
plt.legend(["z-score", "mean", "+1", "-1"])
plt.show()

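Total profit = profit of each trade × number of trades
* the profit of each trade is $s_0$, the entry threshold on the z-score: a position is opened when the z-score reaches $\pm s_0$ and closed when it returns to zero;
* the number of trades is related to the zero crossings of the z-score, which can be analyzed theoretically as well as empirically.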

In [None]:
# Baseline threshold strategy on the standardised spread.
#   * flat and z-score < -s0  -> go long the traded series
#   * flat and z-score > +s0  -> go short the traded series
#   * in a position and the z-score has returned to / crossed zero -> close
# `profit` holds the cumulative *realised* P&L: it only changes when a
# position is closed, by (exit price - entry price) for a long and the
# opposite for a short. Previously the closing leg was booked with the same
# sign as the opening leg, so entry and exit prices were summed, not netted.
s0 = 1  # entry threshold, in z-score units

# np.ravel(...)[0] accepts gamma as a scalar or as a 1-element array.
hedge_ratio = float(np.ravel(gamma)[0])
spread = -prices_df["EUR"] + prices_df["AUD"]*hedge_ratio

# Positional arrays: index i always means "i-th observation", whatever index
# labels the underlying Series carry.
zs = np.asarray(z_zscore, dtype=float)
# z_zscore standardises z = EUR - gamma*AUD, which is -spread; that is the
# series actually bought and sold.
traded = -np.asarray(spread, dtype=float)

profit = [0]*len(zs)
position = [0]*len(zs)
cum_profit = 0
cur_pos = 0
entry_price = 0.0
for i in range(1, len(zs)):
    if cur_pos == 0:
        # Entries are only considered while flat.
        if zs[i] < -s0:
            cur_pos = 1
            entry_price = traded[i]
        elif zs[i] > s0:
            cur_pos = -1
            entry_price = traded[i]
    elif cur_pos*zs[i] >= 0:
        # A long was opened below zero and a short above it, so this is true
        # exactly when the z-score has come back to or crossed zero. Using
        # elif also prevents opening and closing on the same bar.
        cum_profit += cur_pos*(traded[i] - entry_price)
        cur_pos = 0

    profit[i] = cum_profit
    position[i] = cur_pos

In [None]:
# Three stacked panels: standardised spread, position held, cumulative P&L.
panel_fig, (ax_z, ax_pos, ax_pnl) = plt.subplots(3, 1, figsize=(16,9))

z_zscore = zscore(z)
ax_z.plot(z_zscore)
ax_z.axhline(z_zscore.mean(), color="black")
ax_z.axhline(1.0, color="red", linestyle="--")
ax_z.axhline(-1.0, color="green", linestyle="--")
ax_z.legend(["z-score", "mean", "+1", "-1"])
ax_z.set_ylabel("Spread")

ax_pos.plot(position, "m")
ax_pos.set_ylabel("Position")

ax_pnl.plot(profit, "r")
ax_pnl.set_ylabel("Cum P&L")
plt.show()

The problem with the LS regression is that it assumes that $\mu$ and $\gamma$ are constant.