In [25]:
import os
import glob

import pandas as pd
import numpy as np

import statsmodels.tsa.stattools as smts
import statsmodels.api as sm    

from matplotlib import pyplot as plt

In [10]:
def price_spread_is_stationary(p1, p2, sig_level = 0.05):
    """Check whether the raw price spread ``p1 - p2`` passes an ADF test.

    Both inputs are wrapped in ``pd.Series`` before subtracting, so the
    difference is aligned on the Series index.

    Parameters
    ----------
    p1, p2 : array-like
        Price observations of the two instruments.
    sig_level : float, optional
        Threshold the ADF p-value is compared against (default 0.05).

    Returns
    -------
    bool
        True when the ADF p-value of the spread is strictly below
        ``sig_level``.
    """
    spread = pd.Series(p1) - pd.Series(p2)

    # Element 1 of the adfuller result is the p-value.
    adf_result = smts.adfuller(spread)
    return adf_result[1] < sig_level

def log_price_spread_is_stationary(p1, p2, sig_level = 0.05):
    """Check whether the log-price spread ``log(p1) - log(p2)`` passes an ADF test.

    Each input is log-transformed and wrapped in ``pd.Series`` before
    subtracting, so the difference is aligned on the Series index.

    Parameters
    ----------
    p1, p2 : array-like
        Price observations of the two instruments.
    sig_level : float, optional
        Threshold the ADF p-value is compared against (default 0.05).

    Returns
    -------
    bool
        True when the ADF p-value of the log spread is strictly below
        ``sig_level``.
    """
    log_p1 = pd.Series(np.log(p1))
    log_p2 = pd.Series(np.log(p2))
    log_spread = log_p1 - log_p2

    # Element 1 of the adfuller result is the p-value.
    adf_result = smts.adfuller(log_spread)
    return adf_result[1] < sig_level

In [61]:
# Load the per-ticker feature tables: df1 is GOOGL, df2 is GOOG.
df1, df2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../ib-data/iex-features/GOOGL.csv',
        '../ib-data/iex-features/GOOG.csv',
    )
)

In [9]:
# price_spread_is_stationary(df1['close'][0:2000], df2['close'][0:2000])
# log_price_spread_is_stationary(df1['close'][0:2000], df2['close'][0:2000])

In [62]:
# Model: log Y = intercept + log X + c  (unit slope on log X; c is presumably the zero-mean residual spread)
def compute_intercept_and_pvalue(p1, p2):
    """Estimate the log-spread intercept and its ADF p-value.

    The log spread is ``log(p1) - log(p2)``, computed on ``pd.Series``
    wrappers of the inputs (so it is aligned on the Series index).

    Parameters
    ----------
    p1, p2 : array-like
        Price observations of the two instruments.

    Returns
    -------
    tuple
        ``(intercept, pvalue)`` where ``intercept`` is the mean of the log
        spread and ``pvalue`` is element 1 of ``adfuller``'s result on it.
    """
    log_spread = pd.Series(np.log(p1)) - pd.Series(np.log(p2))

    intercept = np.mean(log_spread)
    pvalue = smts.adfuller(log_spread)[1]
    return (intercept, pvalue)

def generate_pair_df(df1, df2, training_period = 84*20):
    """Build the pair-feature table for two instruments, aligned on ``date``.

    The two frames are inner-joined on their ``date`` column so that every
    output row pairs prices observed at the same ``date`` value. Slicing each
    frame positionally from a common start date is not enough: pandas aligns
    on index labels, so frames that start at different offsets (or that miss
    a bar somewhere) would pair different timestamps or produce NaN spreads.

    The first ``training_period`` aligned rows are used only to estimate the
    log-spread intercept and ADF p-value (via ``compute_intercept_and_pvalue``);
    the remaining aligned rows form the returned table.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Feature tables of the two instruments. Both need ``date``, ``open``,
        ``high``, ``low`` and ``close`` columns; ``df1`` additionally supplies
        the datetime feature columns copied into the result.
        NOTE(review): rows are assumed to be in chronological order with
        unique ``date`` values -- confirm against the data source.
    training_period : int, optional
        Number of leading aligned rows reserved for estimating the intercept
        and p-value (default ``84*20``).

    Returns
    -------
    pandas.DataFrame
        One row per aligned ``date`` after the training window, with columns
        ``date``, the datetime features, ``spread``, ``pvalue``,
        ``open1``..``close1`` and ``open2``..``close2``.

    Raises
    ------
    ValueError
        If the two frames have no ``date`` value in common.
    """
    datetime_cols = [
        'year', 'monthOfYear', 'dayOfMonth', 'hourOfDay', 'minuteOfHour',
        'dayOfWeek', 'dayOfYear', 'weekOfYear',
        'isHoliday', 'prevDayIsHoliday', 'nextDayIsHoliday',
    ]
    price_cols = ['open', 'high', 'low', 'close']
    price_cols1 = [col + '1' for col in price_cols]
    price_cols2 = [col + '2' for col in price_cols]

    # Suffix the price columns up front so the join cannot produce name clashes.
    left = df1[['date'] + datetime_cols + price_cols].rename(
        columns=dict(zip(price_cols, price_cols1)))
    right = df2[['date'] + price_cols].rename(
        columns=dict(zip(price_cols, price_cols2)))

    # The inner join keeps only dates present in both frames (in df1's row
    # order), which also discards any leading rows before the common start.
    aligned = left.merge(right, on='date', how='inner')
    if aligned.empty:
        raise ValueError("df1 and df2 share no common 'date' values")

    train = aligned.iloc[:training_period]
    test = aligned.iloc[training_period:].copy()

    intercept, pvalue = compute_intercept_and_pvalue(train['close1'], train['close2'])

    # Spread is the de-meaned log price ratio; pvalue is constant per pair.
    test['spread'] = np.log(test['close1']) - np.log(test['close2']) - intercept
    test['pvalue'] = pvalue

    return test[['date'] + datetime_cols + ['spread', 'pvalue'] + price_cols1 + price_cols2]

In [63]:
# Sanity check: intercept and ADF p-value over the first 100 rows of each frame.
compute_intercept_and_pvalue(df1['close'].iloc[:100], df2['close'].iloc[:100])

(0.010596148788664595, 0.098684168451024346)

In [64]:
# Build the GOOGL/GOOG pair features with the default training window and
# write them out without the index column.
df3 = generate_pair_df(df1, df2)
df3.to_csv("../ib-data/pair-features/GOOGL-GOOG.csv", index=False)