# Pair Trading
1. 종목군 선택(코스피 ETF & 코스닥 ETF)
2. 날짜 선택(시작, 끝 날짜)
3. 비율 선정(회귀 분석의 계수를 통해 비율 선정)
4. Spread 산정
5. Spread의 정상성(Stationarity) 확인(ADF Test를 통한 Cointegration 확인)

- KODEX 코스피 & KODEX 코스닥 150

In [20]:
import pandas as pd
from statsmodels.tsa.stattools import adfuller
import plotly.graph_objects as go
from statsmodels import regression

def adf_test(df, critical_value):
    """Run an Augmented Dickey-Fuller test and report the verdict.

    Prints the p-value followed by a one-line stationarity verdict.

    Args:
        df: Time-series data handed unchanged to ``adfuller``.
        critical_value: Threshold the p-value is compared against.

    Returns:
        True when the p-value is strictly below ``critical_value``
        (reported as stationary), otherwise False.
    """
    # Element 1 of the adfuller result is the p-value.
    p_value = adfuller(df)[1]

    if p_value < critical_value:
        verdict, is_stationary = 'TimeSeries Data is Stationary', True
    else:
        verdict, is_stationary = 'TimeSeries Data is Non-Stationary', False

    print('P-value of Spread : {}'.format(p_value))
    print(verdict)
    return is_stationary


In [7]:
# Load the ETF table; the first spreadsheet column becomes the index.
df = pd.read_excel('../Data/df_etf.xlsx', index_col=0)
# Work on a copy so the loaded frame itself is left untouched.
df_copy = df.copy()

# Keep only the columns whose name contains 'KODEX'.
condition = [column for column in df_copy.columns if 'KODEX' in column]
df_copy_kodex = df_copy.loc[:, condition]

## 1. Pair 선정(KODEX 코스피 & KODEX 코스닥 150)

In [None]:
# The pair under study and the sample window.
etf_1, etf_2 = 'KODEX 코스피', 'KODEX 코스닥150'
start_date, end_date = '2021-01-01', '2021-08-01'

# Slice rows by label between the two dates, keep the two pair columns,
# and drop any row where either column is missing.
pair_columns = [etf_1, etf_2]
df_copy_kodex_sample = df_copy_kodex.loc[start_date:end_date, pair_columns].dropna()
df_copy_kodex_sample

Unnamed: 0_level_0,KODEX 코스피,KODEX 코스닥150
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-01-04,29391.0,15250.0
2021-01-05,29873.0,15260.0
2021-01-06,29730.0,15131.0
2021-01-07,30355.0,15211.0
2021-01-08,31463.0,15320.0
...,...,...
2021-07-26,32455.0,14850.0
2021-07-27,32530.0,14780.0
2021-07-28,32580.0,14585.0
2021-07-29,32545.0,14700.0


In [18]:
## Determine the ratio with a regression model (a machine-learning model could be used as well)

# statsmodels takes (y, x) - the reverse of sklearn's argument order.
# NOTE(review): only the etf_2 series is passed as the regressor, so no
# intercept column appears to be included - confirm this is intended.
dependent = df_copy_kodex_sample[etf_1]
explanatory = df_copy_kodex_sample[etf_2]
regression_model = regression.linear_model.OLS(dependent, explanatory)

# Unlike sklearn, fit() returns a separate results object, so summary() is available.
fitted_model = regression_model.fit()

# First (and only) fitted coefficient: the ratio between the two ETFs.
fitted_model.params.iloc[0]


np.float64(2.195190500310976)

In [19]:
# Use the regression coefficient as the ratio and compute the spread:
# spread = etf_1 - ratio * etf_2
hedge_ratio = fitted_model.params.iloc[0]
hedged_leg = hedge_ratio * df_copy_kodex_sample[etf_2]
spread = df_copy_kodex_sample[etf_1] - hedged_leg


In [21]:
# Check whether the spread is stationary with adf_test; a stationary spread
# is taken here as evidence that the two series are cointegrated.
significance_level = 0.05
adf_test(spread, critical_value=significance_level)

P-value of Spread : 0.015095139812408447
TimeSeries Data is Stationary


True

In [24]:
# Visual check: plot the spread together with its sample mean.

# Compute the mean once and repeat it, rather than calling spread.mean()
# again for every data point.
spread_mean = spread.mean()
data_mean = [spread_mean] * spread.shape[0]

trace1 = go.Scatter(x=spread.index,
                    y=spread,
                    mode='lines',
                    name='spread')

# Flat reference line at the spread's mean, drawn over the same dates.
trace2 = go.Scatter(x=spread.index,
                    y=data_mean,
                    mode='lines',
                    name='Spread Mean')

layout = go.Layout(title='Spread of {} & {}'.format(etf_1, etf_2),
                   yaxis={'tickformat': ','})  # thousands separator on the y axis
fig = go.Figure([trace1, trace2], layout)

fig.update_layout(template='plotly_dark')

fig.show()


## 2. Pair 선정(KODEX 코스피 & KODEX 반도체)
- 코스피의 구성요소에 반도체가 많다!!!

In [36]:
# The pair under study and the sample window.
etf_1 = 'KODEX 코스피'
etf_2 = 'KODEX 반도체'

start_date = '2020-01-01'
end_date = '2020-08-01'

# Rows inside the window for the two ETFs, dropping rows with a missing value.
df_copy_kodex_sample = df_copy_kodex.loc[start_date:end_date, [etf_1, etf_2]].dropna()

## Determine the ratio with a regression model (a machine-learning model could be used as well)

# statsmodels takes (y, x) - the reverse of sklearn's argument order.
regression_model = regression.linear_model.OLS(df_copy_kodex_sample[etf_1],
                                               df_copy_kodex_sample[etf_2])
fitted_model = regression_model.fit()

# Bind the coefficient once; a bare expression in the middle of a cell is
# neither displayed nor stored.
hedge_ratio = fitted_model.params.iloc[0]

# Use the regression coefficient as the ratio and compute the spread.
spread = df_copy_kodex_sample[etf_1] - hedge_ratio * df_copy_kodex_sample[etf_2]

print(adf_test(spread, critical_value=0.05))

# Visual check: plot the spread together with its sample mean.

# Compute the mean once and repeat it, rather than calling spread.mean()
# again for every data point.
spread_mean = spread.mean()
data_mean = [spread_mean] * spread.shape[0]

trace1 = go.Scatter(x=spread.index,
                    y=spread,
                    mode='lines',
                    name='spread')

# Flat reference line at the spread's mean, drawn over the same dates.
trace2 = go.Scatter(x=spread.index,
                    y=data_mean,
                    mode='lines',
                    name='Spread Mean')

layout = go.Layout(title='Spread of {} & {}'.format(etf_1, etf_2),
                   yaxis={'tickformat': ','})  # thousands separator on the y axis
fig = go.Figure([trace1, trace2], layout)

fig.update_layout(template='plotly_dark',
                  width=1000,
                  height=600)

fig.show()


P-value of Spread : 0.022628213839828434
TimeSeries Data is Stationary
True


## 3. Pair 선정(TIGER 미국달러선물레버리지 & TIGER 일본엔선물)

In [37]:
# Keep only the columns whose name contains 'TIGER'.
condition = [column for column in df_copy.columns if 'TIGER' in column]
df_copy_tiger = df_copy.loc[:, condition]

# The pair under study and the sample window.
etf_1 = 'TIGER 미국달러선물레버리지'
etf_2 = 'TIGER 일본엔선물'

start_date = '2020-01-01'
end_date = '2020-08-01'

# Rows inside the window for the two ETFs, dropping rows with a missing value.
df_copy_tiger_sample = df_copy_tiger.loc[start_date:end_date, [etf_1, etf_2]].dropna()

## Determine the ratio with a regression model (a machine-learning model could be used as well)

# statsmodels takes (y, x) - the reverse of sklearn's argument order.
regression_model = regression.linear_model.OLS(df_copy_tiger_sample[etf_1],
                                               df_copy_tiger_sample[etf_2])
fitted_model = regression_model.fit()

# Bind the coefficient once; a bare expression in the middle of a cell is
# neither displayed nor stored.
hedge_ratio = fitted_model.params.iloc[0]

# Use the regression coefficient as the ratio and compute the spread.
spread = df_copy_tiger_sample[etf_1] - hedge_ratio * df_copy_tiger_sample[etf_2]

print(adf_test(spread, critical_value=0.05))

# Visual check: plot the spread together with its sample mean.

# Compute the mean once and repeat it, rather than calling spread.mean()
# again for every data point.
spread_mean = spread.mean()
data_mean = [spread_mean] * spread.shape[0]

trace1 = go.Scatter(x=spread.index,
                    y=spread,
                    mode='lines',
                    name='spread')

# Flat reference line at the spread's mean, drawn over the same dates.
trace2 = go.Scatter(x=spread.index,
                    y=data_mean,
                    mode='lines',
                    name='Spread Mean')

layout = go.Layout(title='Spread of {} & {}'.format(etf_1, etf_2),
                   yaxis={'tickformat': ','})  # thousands separator on the y axis
fig = go.Figure([trace1, trace2], layout)

fig.update_layout(template='plotly_dark',
                  width=1000,
                  height=600)

fig.show()


P-value of Spread : 0.005258360529205269
TimeSeries Data is Stationary
True
