In [1]:
import pandas as pd
import numpy as np
# from yahoo_fin import options
import yfinance as yf
from datetime import datetime, timedelta
from datetime import date
from dateutil.parser import parse # 데이트 형식 자동변환
from copy import copy
from scipy.spatial import distance
from dateutil.relativedelta import relativedelta

## 1. functions

In [2]:
# 주가, 거래량 데이터 받는 함수
def get_fin_data(ticker, period):
    # yahoo finance에서 데이터 불러오기
    df = pd.DataFrame(yf.download(tickers=ticker, period=period)[['Adj Close','Volume']])
    df.rename(columns = {'Adj Close':'price'},inplace=True)

    return df

In [3]:
# 과거 대비 수익률 데이터 & 미래 수익률 데이터
def cal_return(df, window_sizes, future):
    df_return =copy(df)
    train_col = [] # train data 들어갈 것
    test_col = [] # 실제 비교할 data에 들어갈 것

    # window 사이즈만큼 이동한 주가 데이터
    for window in window_sizes:
        df_return[f'p_b{window}'] = df['price'].shift(window)

    # 과거 대비 로그 평균 수익률 데이터
    for window in window_sizes:
        df_return[f'return_b{window}'] = (np.log(df['price']) - np.log(df_return[f'p_b{window}'])) / window
        train_col.append(f'return_b{window}')

    # 미래만큼 이동한 데이터
    for after in future:
        df_return[f'p_a{after}'] = df['price'].shift(-after)

    # 미래 일반(로그x) 수익률
    for after in future:
        df_return[f'return_a{after}'] = (df_return[f'p_a{after}']) / (df_return['price']) -1
        test_col.append(f'return_a{after}')

    train_df = df_return[train_col]
    test_df = df_return[test_col]

    return train_df, test_df

In [4]:
# 거래량 z-score 구하기
# (data - mean(data, axis=0)) / std(data, axis=0)
def vol_zscore(df, vol_len, scale):
    vol_df = copy(df)
    vol_col = []
    for window in scale:
        vol_df[f'vol_mean{window}'] = df['Volume'].rolling(window).mean()
        vol_df[f'vol_std{window}'] = df['Volume'].rolling(window).std()
        vol_df[f'vol_z{window}'] = (df['Volume'].rolling(vol_len).median() - vol_df[f'vol_mean{window}']) / vol_df[f'vol_std{window}']
        vol_col.append(f'vol_z{window}')

    vol_z_df = vol_df[vol_col]

    return vol_z_df

# 거래량의 스케일된 점수
# 최근 N 기간의 거래량대비 최근 k일간의 거래량으로 scale down
# (data - mean(data, axis=0)) / std(data, axis=0)
def vol_scaled_zscore(df, scale = [20, 40, 60, 80, 100, 120], std_scale = 240, base_N = 1000):
    vol_df = copy(df)
    vol_col = []
    for window in scale:
        vol_mean_scaled = df['Volume'].rolling(window).mean() / df['Volume'].rolling(base_N).mean() # 이 값은 평균 1에서 오실레이트 할 것임.
        vol_df[f'vol_z{window}'] = vol_mean_scaled / vol_mean_scaled.rolling(std_scale).std() # 스케일된 vol_mean을 위에서 오실레이트 하는 수준의 std로 나누기
        vol_col.append(f'vol_z{window}')

    vol_z_df = vol_df[vol_col]

    return vol_z_df

In [5]:
# 유사도 구하는 방식 L1, L2
def calculate_l2_distance(start_value, target_value):
    squared_diff = np.power(start_value - target_value, 2)
    sum_squared_diff = np.sum(squared_diff)
    l2_distance = np.sqrt(sum_squared_diff)
    return l2_distance

def calculate_l1_distance(start_value, target_value):
    abs_diff = np.abs(start_value - target_value)
    l1_distance = np.sum(abs_diff)
    return l1_distance

In [6]:
def extract_closest_indices(df, train_df, raw_target_index, future_window, n=30, distance_metric='l1'):
    df_index = pd.to_datetime(train_df.index) # train_df의 index를 datatime 형식으로.
    distances = {}
    #raw_target_index = pd.to_datetime(raw_target_index)

    # raw target index를 df에 있는 인덱스로 변환
    if raw_target_index not in df_index:
        new_index = df_index[df_index <= raw_target_index][-1]
        print(f"your target date {raw_target_index} is replaced to {new_index}")
        target_index = new_index
    else:
        target_index = copy(raw_target_index)#.copy()

    target_value = train_df.loc[target_index].values#[0]

    # L1 & L2 중 선택한 방법 방식 불러오기
    if distance_metric == 'l1':
        distance_function = calculate_l1_distance
    elif distance_metric == 'l2':
        distance_function = calculate_l2_distance
    else:
        raise ValueError("Invalid distance metric. Use 'l1' or 'l2'.")

    # target index와 과거의 index와 거리 계산
    for index in train_df.index:
        if index != target_index and index < target_index:
            distance = distance_function(train_df.loc[index].values, target_value)
            distances[index] = distance
    # 거리 기준 n 개 추출
    closest_indices = sorted(distances, key=distances.get)[:n]
    returns = np.zeros((len(closest_indices), len(future_window)))

    # 추출된 인덱스의 수익률 계산
    for i, index in enumerate(closest_indices):
      for j, window in enumerate(future_window):
          return_value = (df.at[train_df.index[train_df.index >= index][window], 'price'] - df.at[index, 'price']) / df.at[index, 'price']
          returns[i, j] = return_value

    # 수정 전
    # for i, index in enumerate(closest_indices):
    #     for j, window in enumerate(future_window):
    #         if index + pd.DateOffset(days=window) in train_df.index:
    #             return_value = (df.at[index + pd.DateOffset(days=window), 'price'] - df.at[index, 'price']) / df.at[index, 'price']
    #         else:
    #             closest_date = df_index[df_index <= index][-1]
    #             next_index = df_index[df_index <= (closest_date + pd.DateOffset(days=window))][-1]
    #             return_value = (df.at[next_index, 'price'] - df.at[closest_date, 'price']) / df.at[closest_date, 'price']
    #         returns[i, j] = return_value

    # 각 컬럼의 mean, std

    returns_df = pd.DataFrame(returns, columns=['return_{}'.format(window) for window in future_window])
    mean = returns_df.mean()
    std = returns_df.std()

    results = []
    for col in returns_df.columns:
        results.extend([mean[col], std[col]])

    real_results = []
    for window in future_window:
        # next_date = df_index[df_index <= (target_index + pd.DateOffset(days=window))][-1]
        next_date = train_df.index[train_df.index >= target_index][window]
        # real_return = (df.at[next_date, 'price'] - df.at[target_index, 'price']) / df.at[target_index, 'price']
        real_return = (df.at[next_date, 'price'] / df.at[target_index, 'price']) -1
        real_results.append(real_return)

    # for window in future_window:
    #     # next_date = df_index[df_index <= (target_index + pd.DateOffset(days=window))][-1]
    #     next_date = train_df.index[train_df.index >= target_index][window]
    #     # real_return = (df.at[next_date, 'price'] - df.at[target_index, 'price']) / df.at[target_index, 'price']
    #     real_return = (df.at[next_date, 'price'] / df.at[target_index, 'price']) -1
    #     results.append(real_return)


    return results, real_results

In [7]:
def calculate_returns(start_date, end_date, df, train_df, future_window):
    result_columns = []
    for window in future_window:
        result_columns.extend(['return_{}'.format(window), 'mean_{}'.format(window), 'std_{}'.format(window)])

    result_df = pd.DataFrame(columns=result_columns)

    df_index = pd.to_datetime(df.index)
    start_date = pd.to_datetime(start_date)
    end_date = pd.to_datetime(end_date)

    if start_date not in df_index:
        closest_start_date = df_index[df_index <= start_date][-1]
        start_date = df_index[df_index == closest_start_date][0]

    if end_date not in df_index:
        closest_end_date = df_index[df_index >= end_date][0]
        end_date = df_index[df_index == closest_end_date][0]

    current_index = start_date

    while current_index <= end_date:
        results, real_results = extract_closest_indices(df, train_df, current_index, future_window)
        for i, window in enumerate(future_window):
            result_df.loc[current_index, 'return_{}'.format(window)] = real_results[i]
            result_df.loc[current_index, 'mean_{}'.format(window)] = results[i*2]
            result_df.loc[current_index, 'std_{}'.format(window)] = results[i*2-1]
            # result_df.loc[current_index, 'return_{}'.format(window)] = results[-(4-i)]
            # result_df.loc[current_index, 'mean_{}'.format(window)] = results[2*i]
            # result_df.loc[current_index, 'std_{}'.format(window)] = results[2*i+1]

        current_index = df_index[df_index >= current_index][0]
        current_index += pd.DateOffset(days=1)

    result_df = result_df[result_df.index.isin(df.index)]

    return result_df

In [8]:
# 실제 수익률이랑 같은 방향인지 체크
def check_sign(x, y):
    if (x * y) > 0:
        return True
    else:
        return False

In [9]:
# 실제 미래 수익률값과 비교해봤을 때 얼마나 잘 맞췄는지 확인
def check_result(return_df, future_window):
    result_dict = {}

    for window in future_window:
        result_dict['result_{}'.format(window)] = {
            'good_{}'.format(window): 0,
            'bad_{}'.format(window): 0,
            'soso_{}'.format(window): 0,
            'large_std_{}'.format(window): 0,
            'plus_error_len'.format(window):0,
            'plus_error_mean'.format(window):0,
            'plus_error_std'.format(window):0,
            'minus_error_len'.format(window):0,
            'minus_error_mean'.format(window):0,
            'minus_error_std'.format(window):0,
            'total_error_mean'.format(window):0,
            'total_error_std'.format(window):0
            }


    for window in future_window:
        # bad, real_return > mean_return
        plus_error = []
        # bad, real_return < mean_return
        minus_error = []
        # total bad
        total_error = []
        for i in range(len(return_df)):
            real_return = return_df['return_{}'.format(window)][i]
            mean_return = return_df['mean_{}'.format(window)][i]
            std = return_df['std_{}'.format(window)][i]
            # mean - std: minus_std, mean + std: plus_std
            minus_std = mean_return - std
            plus_std = mean_return + std

            # 표준편차가 클 때(5% 기준)
            if std >= 0.05:
                result_dict['result_{}'.format(window)]['large_std_{}'.format(window)] += 1
            # 방향성 같은지 체크
            elif check_sign(real_return, mean_return):
                # 방향성 같고, range 안에 있을 때
                if (real_return > minus_std) and (mean_return < plus_std):
                    result_dict['result_{}'.format(window)]['good_{}'.format(window)] += 1
                # 방향성 같은데, range 밖에 있을 때
                else:
                    result_dict['result_{}'.format(window)]['soso_{}'.format(window)] += 1
            else:
                result_dict['result_{}'.format(window)]['bad_{}'.format(window)] += 1
                if real_return> mean_return:
                    plus_error.append(real_return - mean_return)
                else:
                    minus_error.append(real_return - mean_return)
                total_error.append(real_return - mean_return)

        result_dict['result_{}'.format(window)]['plus_error_len'.format(window)]=len(plus_error)
        result_dict['result_{}'.format(window)]['plus_error_mean'.format(window)]=np.mean(plus_error)
        result_dict['result_{}'.format(window)]['plus_error_std'.format(window)]=np.std(plus_error)
        result_dict['result_{}'.format(window)]['minus_error_len'.format(window)]=len(minus_error)
        result_dict['result_{}'.format(window)]['minus_error_mean'.format(window)]=np.mean(minus_error)
        result_dict['result_{}'.format(window)]['minus_error_std'.format(window)]=np.std(minus_error)
        result_dict['result_{}'.format(window)]['total_error_mean'.format(window)]=np.mean(total_error)
        result_dict['result_{}'.format(window)]['total_error_std'.format(window)]=np.std(total_error)

    return result_dict

## 2. data preprocessing

### price data

In [10]:
# download data
ticker = "^IXIC"
period = "max" # period: max, 1y, 5y etc.

nasdaq = get_fin_data(ticker, period)
nasdaq

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,price,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
1971-02-05,100.000000,0
1971-02-08,100.839996,0
1971-02-09,100.760002,0
1971-02-10,100.690002,0
1971-02-11,101.449997,0
...,...,...
2023-06-12,13461.919922,4722680000
2023-06-13,13573.320312,5522100000
2023-06-14,13626.480469,5772550000
2023-06-15,13782.820312,5667520000


In [11]:
nasdaq.index = pd.to_datetime(nasdaq.index)

In [12]:
# parameters
today = nasdaq.index[-1]#.date()
year = today - relativedelta(years=20)
window_sizes = [20, 40, 60, 80, 100]
future_window = [5, 10] # future 이후 수익률
sort_num = 30 # 유사도 sorting 할 길이

In [13]:
# window size만큼 과거 대비 수익률, future 만큼 미래 수익률
train_df, test_df = cal_return(nasdaq, window_sizes, future_window)

display(train_df)
test_df

Unnamed: 0_level_0,return_b20,return_b40,return_b60,return_b80,return_b100
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1971-02-05,,,,,
1971-02-08,,,,,
1971-02-09,,,,,
1971-02-10,,,,,
1971-02-11,,,,,
...,...,...,...,...,...
2023-06-12,0.004575,0.002618,0.002313,0.001364,0.002059
2023-06-13,0.004661,0.002754,0.002575,0.001691,0.002237
2023-06-14,0.004946,0.002860,0.002575,0.001812,0.002014
2023-06-15,0.004882,0.003137,0.002504,0.002272,0.001929


Unnamed: 0_level_0,return_a5,return_a10
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
1971-02-05,0.020500,-0.003200
1971-02-08,0.013388,-0.011107
1971-02-09,0.009726,-0.001191
1971-02-10,0.007250,0.005363
1971-02-11,-0.007393,-0.001084
...,...,...
2023-06-12,,
2023-06-13,,
2023-06-14,,
2023-06-15,,


### volume data

In [14]:
scale = [100, 200] # 이동평균 날짜 길이
vol_len = 5 # z-score 분자 데이터 이동평균할 날짜 길이
# scale 이동평균 대비 거래량 z-score 구하기
nasdaq_vol_df = vol_zscore(nasdaq, vol_len, scale)
nasdaq_vol_df

Unnamed: 0_level_0,vol_z100,vol_z200
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
1971-02-05,,
1971-02-08,,
1971-02-09,,
1971-02-10,,
1971-02-11,,
...,...,...
2023-06-12,-0.399590,-0.228246
2023-06-13,-0.411607,-0.236615
2023-06-14,-0.410190,-0.244606
2023-06-15,0.786760,0.870344


In [15]:
# train_df에 거래량 feature도 추가
train_df = pd.merge(train_df, nasdaq_vol_df, left_index=True, right_index=True)
train_df

Unnamed: 0_level_0,return_b20,return_b40,return_b60,return_b80,return_b100,vol_z100,vol_z200
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1971-02-05,,,,,,,
1971-02-08,,,,,,,
1971-02-09,,,,,,,
1971-02-10,,,,,,,
1971-02-11,,,,,,,
...,...,...,...,...,...,...,...
2023-06-12,0.004575,0.002618,0.002313,0.001364,0.002059,-0.399590,-0.228246
2023-06-13,0.004661,0.002754,0.002575,0.001691,0.002237,-0.411607,-0.236615
2023-06-14,0.004946,0.002860,0.002575,0.001812,0.002014,-0.410190,-0.244606
2023-06-15,0.004882,0.003137,0.002504,0.002272,0.001929,0.786760,0.870344


In [16]:
# 최근 20년 데이터만 추출
train_20df = train_df[year:]

train_20df

Unnamed: 0_level_0,return_b20,return_b40,return_b60,return_b80,return_b100,vol_z100,vol_z200
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2003-06-16,0.003997,0.003906,0.002872,0.002808,0.002037,0.663094,0.744751
2003-06-17,0.005563,0.003954,0.002666,0.002656,0.001838,0.938166,1.039445
2003-06-18,0.005879,0.003615,0.003374,0.002971,0.002228,0.913812,1.023758
2003-06-19,0.005063,0.002933,0.002832,0.002694,0.002183,1.025191,1.150806
2003-06-20,0.004354,0.003026,0.002835,0.002905,0.002033,1.015324,1.145912
...,...,...,...,...,...,...,...
2023-06-12,0.004575,0.002618,0.002313,0.001364,0.002059,-0.399590,-0.228246
2023-06-13,0.004661,0.002754,0.002575,0.001691,0.002237,-0.411607,-0.236615
2023-06-14,0.004946,0.002860,0.002575,0.001812,0.002014,-0.410190,-0.244606
2023-06-15,0.004882,0.003137,0.002504,0.002272,0.001929,0.786760,0.870344


## backtest

In [None]:
# 실제 future_window 이후 수익률, 추출값 future_window만큼 이후 수익률 평균 표준편차 순으로 df 형성 함수
backtest_df = calculate_returns('2016-01-01', '2019-12-31', nasdaq, train_20df, future_window)
backtest_df

your target date 2016-01-01 00:00:00 is replaced to 2015-12-31 00:00:00
your target date 2016-01-09 00:00:00 is replaced to 2016-01-08 00:00:00
your target date 2016-01-16 00:00:00 is replaced to 2016-01-15 00:00:00
your target date 2016-01-23 00:00:00 is replaced to 2016-01-22 00:00:00
your target date 2016-01-30 00:00:00 is replaced to 2016-01-29 00:00:00
your target date 2016-02-06 00:00:00 is replaced to 2016-02-05 00:00:00
your target date 2016-02-13 00:00:00 is replaced to 2016-02-12 00:00:00
your target date 2016-02-20 00:00:00 is replaced to 2016-02-19 00:00:00
your target date 2016-02-27 00:00:00 is replaced to 2016-02-26 00:00:00
your target date 2016-03-05 00:00:00 is replaced to 2016-03-04 00:00:00
your target date 2016-03-12 00:00:00 is replaced to 2016-03-11 00:00:00
your target date 2016-03-19 00:00:00 is replaced to 2016-03-18 00:00:00
your target date 2016-03-25 00:00:00 is replaced to 2016-03-24 00:00:00
your target date 2016-04-02 00:00:00 is replaced to 2016-04-01 0

Unnamed: 0,return_5,mean_5,std_5,return_10,mean_10,std_10
2015-12-31,-0.072648,-0.001854,0.040078,-0.103644,-0.007613,0.032475
2016-01-05,-0.042014,0.012074,0.032068,-0.085811,0.015535,0.021486
2016-01-06,-0.064044,0.004257,0.039821,-0.07521,-0.006035,0.021386
2016-01-07,-0.015872,0.004823,0.022516,-0.020951,0.006968,0.015269
2016-01-08,-0.033424,-0.006674,0.035277,-0.026949,-0.016048,0.02803
...,...,...,...,...,...,...
2019-12-19,0.013435,0.005915,0.02469,0.020732,0.004566,0.016073
2019-12-20,0.002356,0.00056,0.023421,0.016092,0.002104,0.022563
2019-12-24,0.01556,-0.003577,0.026604,0.027985,0.00278,0.034412
2019-12-27,0.0072,0.003214,0.026558,0.029679,0.011159,0.018845


## performance evaluation

In [None]:
check_result(backtest_df, future_window)

{'result_5': {'good_5': 397,
  'bad_5': 323,
  'soso_5': 15,
  'large_std_5': 52,
  'plus_error_len': 144,
  'plus_error_mean': 0.01859223955313741,
  'plus_error_std': 0.011692265425748867,
  'minus_error_len': 179,
  'minus_error_mean': -0.02220969692592402,
  'minus_error_std': 0.017326319355739493,
  'total_error_mean': -0.004019359919778984,
  'total_error_std': 0.025271045915738786},
 'result_10': {'good_10': 436,
  'bad_10': 328,
  'soso_10': 23,
  'large_std_10': 0,
  'plus_error_len': 151,
  'plus_error_mean': 0.029731692850430515,
  'plus_error_std': 0.019135102590517778,
  'minus_error_len': 177,
  'minus_error_mean': -0.032433765347043564,
  'minus_error_std': 0.02242906477237712,
  'total_error_mean': -0.0038149111158893396,
  'total_error_std': 0.03741790026471409}}

## Backtest return

In [17]:
future_window=[10]

In [18]:
backtest_df = calculate_returns('2020-01-01', '2023-04-30', nasdaq, train_20df, future_window)
backtest_df

your target date 2020-01-01 00:00:00 is replaced to 2019-12-31 00:00:00
your target date 2020-01-04 00:00:00 is replaced to 2020-01-03 00:00:00
your target date 2020-01-11 00:00:00 is replaced to 2020-01-10 00:00:00
your target date 2020-01-18 00:00:00 is replaced to 2020-01-17 00:00:00
your target date 2020-01-25 00:00:00 is replaced to 2020-01-24 00:00:00
your target date 2020-02-01 00:00:00 is replaced to 2020-01-31 00:00:00
your target date 2020-02-08 00:00:00 is replaced to 2020-02-07 00:00:00
your target date 2020-02-15 00:00:00 is replaced to 2020-02-14 00:00:00
your target date 2020-02-22 00:00:00 is replaced to 2020-02-21 00:00:00
your target date 2020-02-29 00:00:00 is replaced to 2020-02-28 00:00:00
your target date 2020-03-07 00:00:00 is replaced to 2020-03-06 00:00:00
your target date 2020-03-14 00:00:00 is replaced to 2020-03-13 00:00:00
your target date 2020-03-21 00:00:00 is replaced to 2020-03-20 00:00:00
your target date 2020-03-28 00:00:00 is replaced to 2020-03-27 0

Unnamed: 0,return_10,mean_10,std_10
2019-12-31,0.031886,0.010743,0.025915
2020-01-03,0.040814,0.015175,0.031971
2020-01-07,0.034756,-0.015415,0.038227
2020-01-08,0.02993,-0.015062,0.038907
2020-01-09,0.012113,-0.019457,0.047943
...,...,...,...
2023-04-21,0.013498,-0.00106,0.021881
2023-04-25,0.032239,0.00315,0.022534
2023-04-26,0.038137,0.001877,0.021694
2023-04-27,0.015341,-0.001415,0.025211


In [None]:
def backtest_return(df, window, nasdaq, ):
