In [1]:
import pandas as pd
import numpy as np
# from yahoo_fin import options
import yfinance as yf
from datetime import datetime, timedelta
from datetime import date
from dateutil.parser import parse # 데이트 형식 자동변환
from copy import copy
from scipy.spatial import distance
from dateutil.relativedelta import relativedelta

## 1. functions

In [2]:
# 주가, 거래량 데이터 받는 함수
def get_fin_data(ticker, period):
    # yahoo finance에서 데이터 불러오기
    df = pd.DataFrame(yf.download(tickers=ticker, period=period)[['Adj Close','Volume']])
    df.rename(columns = {'Adj Close':'price'},inplace=True)
    
    return df

In [3]:
# 과거 대비 수익률 데이터 & 미래 수익률 데이터
def cal_return(df, window_sizes, future):
    df_return =copy(df)
    train_col = [] # train data 들어갈 것
    test_col = [] # 실제 비교할 data에 들어갈 것
    
    # window 사이즈만큼 이동한 주가 데이터
    for window in window_sizes:
        df_return[f'p_b{window}'] = df['price'].shift(window)
    
    # 과거 대비 로그 평균 수익률 데이터 
    for window in window_sizes:
        df_return[f'return_b{window}'] = (np.log(df['price']) - np.log(df_return[f'p_b{window}'])) / window  
        train_col.append(f'return_b{window}')
        
    # 미래만큼 이동한 데이터
    for after in future:
        df_return[f'p_a{after}'] = df['price'].shift(-after)
        
    # 미래 일반(로그x) 수익률
    for after in future:
        df_return[f'return_a{after}'] = (df_return[f'p_a{after}']) / (df_return['price']) -1
        test_col.append(f'return_a{after}')
    
    train_df = df_return[train_col]
    test_df = df_return[test_col]
    
    return train_df, test_df

In [4]:
# 거래량 z-score 구하기
# (data - mean(data, axis=0)) / std(data, axis=0)
def vol_zscore(df, vol_len, scale):
    vol_df = copy(df)
    vol_col = []
    for window in scale:
        vol_df[f'vol_mean{window}'] = df['Volume'].rolling(window).mean()
        vol_df[f'vol_std{window}'] = df['Volume'].rolling(window).std()
        vol_df[f'vol_z{window}'] = (df['Volume'].rolling(vol_len).median() - vol_df[f'vol_mean{window}']) / vol_df[f'vol_std{window}']
        vol_col.append(f'vol_z{window}')
    
    vol_z_df = vol_df[vol_col]
    
    return vol_z_df

# 거래량의 스케일된 점수
# 최근 N 기간의 거래량대비 최근 k일간의 거래량으로 scale down
# (data - mean(data, axis=0)) / std(data, axis=0)
def vol_scaled_zscore(df, scale = [20, 40, 60, 80, 100, 120], std_scale = 240, base_N = 1000):
    vol_df = copy(df)
    vol_col = []
    for window in scale:
        vol_mean_scaled = df['Volume'].rolling(window).mean() / df['Volume'].rolling(base_N).mean() # 이 값은 평균 1에서 오실레이트 할 것임.
        vol_df[f'vol_z{window}'] = vol_mean_scaled / vol_mean_scaled.rolling(std_scale).std() # 스케일된 vol_mean을 위에서 오실레이트 하는 수준의 std로 나누기
        vol_col.append(f'vol_z{window}')
    
    vol_z_df = vol_df[vol_col]
    
    return vol_z_df

In [5]:
# 유사도 구하는 방식 L1, L2
def calculate_l2_distance(start_value, target_value):
    squared_diff = np.power(start_value - target_value, 2)
    sum_squared_diff = np.sum(squared_diff)
    l2_distance = np.sqrt(sum_squared_diff)
    return l2_distance

def calculate_l1_distance(start_value, target_value):
    abs_diff = np.abs(start_value - target_value)
    l1_distance = np.sum(abs_diff)
    return l1_distance

In [20]:
def extract_closest_indices(df, nasdaq, raw_target_index, future_window, n=30, distance_metric='l1'):
    df_index = pd.to_datetime(df.index)
    distances = {}
    raw_target_index = pd.to_datetime(raw_target_index)
    
    if raw_target_index not in df_index:
        new_index = df_index[df_index <= raw_target_index][-1]
        print(f"your target date {raw_target_index} is replaced to {new_index}")
        target_index = new_index
    else:
        target_index = copy(raw_target_index)#.copy()
    
    target_value = df.loc[target_index].values[0]
    
    # L1 & L2 중 선택한 방법 방식 불러오기
    if distance_metric == 'l1':
        distance_function = calculate_l1_distance
    elif distance_metric == 'l2':
        distance_function = calculate_l2_distance
    else:
        raise ValueError("Invalid distance metric. Use 'l1' or 'l2'.")
    
    for index in df.index:
        if index != target_index and index < target_index:
            distance = abs(df.loc[index].values[0] - target_value)
            distances[index] = distance

    closest_indices = sorted(distances, key=distances.get)[:n]
    returns = np.zeros((len(closest_indices), len(future_window)))

    for i, index in enumerate(closest_indices):
        for j, window in enumerate(future_window):
            if index + pd.DateOffset(days=window) in df.index:
                return_value = (nasdaq.at[index + pd.DateOffset(days=window), 'price'] - nasdaq.at[index, 'price']) / nasdaq.at[index, 'price']
            else:
                closest_date = df_index[df_index <= index][-1]
                next_index = df_index[df_index <= (closest_date + pd.DateOffset(days=window))][-1]
                return_value = (nasdaq.at[next_index, 'price'] - nasdaq.at[closest_date, 'price']) / nasdaq.at[closest_date, 'price']
            returns[i, j] = return_value

    returns_df = pd.DataFrame(returns, columns=['return_{}'.format(window) for window in future_window])
    mean = returns_df.mean()
    std = returns_df.std()
    
    results = []
    for col in returns_df.columns:
        results.extend([mean[col], std[col]])
    
    for window in future_window:
        next_date = df_index[df_index <= (target_index + pd.DateOffset(days=window))][-1]
        return_ = (nasdaq.at[next_date, 'price'] - nasdaq.at[target_index, 'price']) / nasdaq.at[target_index, 'price']
        results.append(return_)


    return results

In [18]:
def calculate_returns(start_date, end_date, df, train_df, future_window):
    result_columns = []
    for window in future_window:
        result_columns.extend(['return_{}'.format(window), 'mean_{}'.format(window), 'std_{}'.format(window)])

    result_df = pd.DataFrame(columns=result_columns)
    
    df_index = pd.to_datetime(df.index)
    start_date = pd.to_datetime(start_date)
    end_date = pd.to_datetime(end_date)
    
    if start_date not in df_index:
        closest_start_date = df_index[df_index <= start_date][-1]
        start_date = df_index[df_index == closest_start_date][0]
        
    if end_date not in df_index:
        closest_end_date = df_index[df_index >= end_date][0]
        end_date = df_index[df_index == closest_end_date][0]
    
    current_index = start_date

    while current_index <= end_date:
        results = extract_closest_indices(train_df, df, current_index, future_window)
        for i, window in enumerate(future_window):
            next_date = current_index + pd.DateOffset(days=window)
            result_df.loc[current_index, 'return_{}'.format(window)] = results[-(4-i)]
            result_df.loc[current_index, 'mean_{}'.format(window)] = results[2*i]
            result_df.loc[current_index, 'std_{}'.format(window)] = results[2*i+1]

        current_index = df_index[df_index >= current_index][0]
        current_index += pd.DateOffset(days=1)
    
    result_df = result_df[result_df.index.isin(nasdaq.index)]

    return result_df

In [8]:
# 실제 수익률이랑 같은 방향인지 체크
def check_sign(x, y):
    if (x * y) > 0:
        return True
    else:
        return False

In [9]:
# 실제 미래 수익률값과 비교해봤을 때 얼마나 잘 맞췄는지 확인
def check_result(return_df, future_window):
    result_dict = {}
    
    for window in future_window:
        result_dict['result_{}'.format(window)] = {
            'good_{}'.format(window): 0, 
            'bad_{}'.format(window): 0, 
            'soso_{}'.format(window): 0, 
            'large_std_{}'.format(window): 0
            }

    for i in range(len(return_df)):
        for window in future_window:
            real_return = return_df['return_{}'.format(window)][i]
            mean_return = return_df['mean_{}'.format(window)][i]
            std = return_df['std_{}'.format(window)][i]
            # mean - std: minus_std, mean + std: plus_std
            minus_std = mean_return - std
            plus_std = mean_return + std
            
            # 표준편차가 클 때(5% 기준)
            if std >= 0.05:
                result_dict['result_{}'.format(window)]['large_std_{}'.format(window)] += 1
            # 방향성 같은지 체크
            elif check_sign(real_return, mean_return):
                # 방향성 같고, range 안에 있을 때
                if (real_return > minus_std) and (mean_return < plus_std):
                    result_dict['result_{}'.format(window)]['good_{}'.format(window)] += 1
                # 방향성 같은데, range 밖에 있을 때
                else:
                    result_dict['result_{}'.format(window)]['soso_{}'.format(window)] += 1
            else:
                result_dict['result_{}'.format(window)]['bad_{}'.format(window)] += 1

    return result_dict

## 2. data preprocessing

### price data

In [10]:
# download data
ticker = "^IXIC"
period = "max" # period: max, 1y, 5y etc.

nasdaq = get_fin_data(ticker, period)
nasdaq

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,price,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
1971-02-05,100.000000,0
1971-02-08,100.839996,0
1971-02-09,100.760002,0
1971-02-10,100.690002,0
1971-02-11,101.449997,0
...,...,...
2023-06-05,13229.429688,4344280000
2023-06-06,13276.419922,4810910000
2023-06-07,13104.900391,5270600000
2023-06-08,13238.519531,4280160000


In [11]:
nasdaq.index = pd.to_datetime(nasdaq.index)

In [12]:
# parameters
today = nasdaq.index[-1]#.date()
year = today - relativedelta(years=20)
window_sizes = [20, 40, 60, 80, 100]
future_window = [5, 10] # future 이후 수익률
sort_num = 30 # 유사도 sorting 할 길이

In [13]:
# window size만큼 과거 대비 수익률, future 만큼 미래 수익률
train_df, test_df = cal_return(nasdaq, window_sizes, future_window)

display(train_df)
test_df

Unnamed: 0_level_0,return_b20,return_b40,return_b60,return_b80,return_b100
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1971-02-05,,,,,
1971-02-08,,,,,
1971-02-09,,,,,
1971-02-10,,,,,
1971-02-11,,,,,
...,...,...,...,...,...
2023-06-05,0.003905,0.002256,0.002571,0.001313,0.002082
2023-06-06,0.003995,0.002352,0.002926,0.001485,0.001943
2023-06-07,0.003661,0.002136,0.002634,0.001398,0.001750
2023-06-08,0.003650,0.002603,0.002451,0.001341,0.001781


Unnamed: 0_level_0,return_a5,return_a10
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
1971-02-05,0.020500,-0.003200
1971-02-08,0.013388,-0.011107
1971-02-09,0.009726,-0.001191
1971-02-10,0.007250,0.005363
1971-02-11,-0.007393,-0.001084
...,...,...
2023-06-05,,
2023-06-06,,
2023-06-07,,
2023-06-08,,


### volume data

In [14]:
scale = [100, 200] # 이동평균 날짜 길이
vol_len = 5 # z-score 분자 데이터 이동평균할 날짜 길이
# scale 이동평균 대비 거래량 z-score 구하기
nasdaq_vol_df = vol_zscore(nasdaq, vol_len, scale)
nasdaq_vol_df

Unnamed: 0_level_0,vol_z100,vol_z200
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
1971-02-05,,
1971-02-08,,
1971-02-09,,
1971-02-10,,
1971-02-11,,
...,...,...
2023-06-05,-0.613145,-0.368921
2023-06-06,-0.606274,-0.369561
2023-06-07,-0.602718,-0.373887
2023-06-08,-0.860234,-0.631097


In [15]:
# train_df에 거래량 feature도 추가 
train_df = pd.merge(train_df, nasdaq_vol_df, left_index=True, right_index=True)
train_df

Unnamed: 0_level_0,return_b20,return_b40,return_b60,return_b80,return_b100,vol_z100,vol_z200
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1971-02-05,,,,,,,
1971-02-08,,,,,,,
1971-02-09,,,,,,,
1971-02-10,,,,,,,
1971-02-11,,,,,,,
...,...,...,...,...,...,...,...
2023-06-05,0.003905,0.002256,0.002571,0.001313,0.002082,-0.613145,-0.368921
2023-06-06,0.003995,0.002352,0.002926,0.001485,0.001943,-0.606274,-0.369561
2023-06-07,0.003661,0.002136,0.002634,0.001398,0.001750,-0.602718,-0.373887
2023-06-08,0.003650,0.002603,0.002451,0.001341,0.001781,-0.860234,-0.631097


In [16]:
# 최근 20년 데이터만 추출
train_20df = train_df[year:]

train_20df

Unnamed: 0_level_0,return_b20,return_b40,return_b60,return_b80,return_b100,vol_z100,vol_z200
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2003-06-09,0.002684,0.004022,0.002987,0.002830,0.000934,2.627074,2.751617
2003-06-10,0.002723,0.004513,0.003237,0.003029,0.001233,2.613307,2.746539
2003-06-11,0.003339,0.004317,0.002790,0.002853,0.001451,1.056183,1.136145
2003-06-12,0.003725,0.004323,0.002768,0.002568,0.001836,0.821179,0.895860
2003-06-13,0.002364,0.003843,0.002534,0.002475,0.001758,0.678736,0.754648
...,...,...,...,...,...,...,...
2023-06-05,0.003905,0.002256,0.002571,0.001313,0.002082,-0.613145,-0.368921
2023-06-06,0.003995,0.002352,0.002926,0.001485,0.001943,-0.606274,-0.369561
2023-06-07,0.003661,0.002136,0.002634,0.001398,0.001750,-0.602718,-0.373887
2023-06-08,0.003650,0.002603,0.002451,0.001341,0.001781,-0.860234,-0.631097


## backtest

In [21]:
# 실제 future_window 이후 수익률, 추출값 future_window만큼 이후 수익률 평균 표준편차 순으로 df 형성 함수
backtest_df = calculate_returns('2023-01-01', '2023-01-15', nasdaq, train_20df, future_window)
backtest_df

your target date 2022-12-31 00:00:00 is replaced to 2022-12-30 00:00:00
your target date 2023-01-07 00:00:00 is replaced to 2023-01-06 00:00:00
your target date 2023-01-14 00:00:00 is replaced to 2023-01-13 00:00:00


Unnamed: 0,return_5,mean_5,std_5,return_10,mean_10,std_10
2022-12-30,0.004448,0.004706,0.037794,0.05829,0.004448,0.05829
2023-01-04,0.003551,0.002253,0.028429,0.036664,0.003551,0.036664
2023-01-05,0.005369,-0.000304,0.023302,0.031935,0.005369,0.031935
2023-01-06,0.007648,0.005598,0.019563,0.031777,0.007648,0.031777
2023-01-10,-0.004396,-0.005233,0.027027,0.035054,-0.004396,0.035054
2023-01-11,-0.005275,-0.000603,0.0241,0.032702,-0.005275,0.032702
2023-01-12,-0.00866,-0.006049,0.023066,0.034409,-0.00866,0.034409
2023-01-13,-0.001875,0.000895,0.015754,0.024633,-0.001875,0.024633


## performance evaluation

In [22]:
check_result(backtest_df, future_window)

{'result_5': {'good_5': 6, 'bad_5': 2, 'soso_5': 0, 'large_std_5': 0},
 'result_10': {'good_10': 3, 'bad_10': 4, 'soso_10': 0, 'large_std_10': 1}}