In [1]:
import pandas as pd
import numpy as np
# from yahoo_fin import options
import yfinance as yf
from datetime import datetime, timedelta
from datetime import date
from dateutil.parser import parse # 데이트 형식 자동변환
from copy import copy
from scipy.spatial import distance
from dateutil.relativedelta import relativedelta

In [2]:
def inequal_neg_mae(y_true, y_pred):
    gamma = 0.95
    sign_y_true = np.sign(y_true)
    loss =  sign_y_true * np.where(
        sign_y_true * (y_pred - y_true/2) >0, 
        (1 - gamma) * sign_y_true * np.abs(y_pred - y_true),
        - y_pred + (1 - gamma/2) * y_true 
        ) 
    # capped_loss = np.where(loss > 0.05, 0.05, loss)
    return 1 * loss

In [3]:
def show_score(func):
    l_pred_val = np.linspace(-0.2, 0.2, 1000)
    l_high_val = -0.1 * np.ones(1000)
    l_str_val = - 0.05 * np.ones(1000)
    l_med_val =  0.1 * np.ones(1000)
    l_weak_val = 0.05 * np.ones(1000)
    l_val = [l_high_val, l_str_val, l_med_val, l_weak_val]
    l_cols = ['red', 'darkorange', 'limegreen', 'lightseagreen']
    l_names= ['true val = ' +str(i[0]) for i in l_val]

    fig = plt.subplots(figsize=(4.5,4))
    # ax.axis('off')
    plt.axvline(x=0, color='k', linewidth=1)
    plt.axhline(y=0, color='k', linewidth=1)
    plt.xlim(-0.15, 0.15)
    plt.ylim(-0.15, 0.15)
    for true_type_, col_, name_ in zip(l_val, l_cols, l_names):
        plt.plot(l_pred_val, func(true_type_, l_pred_val), color = col_, label=name_)
        plt.xlabel('predict', fontsize=20)
        plt.ylabel(func.__name__, fontsize=20)
    #     ax.plot(l_pred_val, inequal_neg_mae(-true_type_, l_pred_val), color = col_)
    plt.legend()
    plt.show()
    return

##### functions

In [4]:
# 주가, 거래량 데이터 받는 함수
def get_fin_data(ticker, period):
    # yahoo finance에서 데이터 불러오기
    df = pd.DataFrame(yf.download(tickers=ticker, period=period)[['Adj Close','Volume']])
    df.rename(columns = {'Adj Close':'price'},inplace=True)
    
    return df

In [5]:
# 과거 대비 수익률 데이터 & 미래 수익률 데이터
def cal_return(df, window_sizes, future):
    df_return =copy(df)
    train_col = [] # train data 들어갈 것
    test_col = [] # 실제 비교할 data에 들어갈 것
    
    # window 사이즈만큼 이동한 주가 데이터
    for window in window_sizes:
        df_return[f'p_b{window}'] = df['price'].shift(window)
    
    # 과거 대비 로그 평균 수익률 데이터 
    for window in window_sizes:
        df_return[f'return_b{window}'] = (np.log(df['price']) - np.log(df_return[f'p_b{window}'])) / window  
        train_col.append(f'return_b{window}')
        
    # 미래만큼 이동한 데이터
    for after in future:
        df_return[f'p_a{after}'] = df['price'].shift(-after)
        
    # 미래 일반(로그x) 수익률
    for after in future:
        df_return[f'return_a{after}'] = (df_return[f'p_a{after}']) / (df_return['price']) -1
        test_col.append(f'return_a{after}')
    
    train_df = df_return[train_col]
    test_df = df_return[test_col]
    
    return train_df, test_df

In [6]:
# 거래량 z-score 구하기
# (data - mean(data, axis=0)) / std(data, axis=0)
def vol_zscore(df, vol_len, scale):
    vol_df = copy(df)
    vol_col = []
    for window in scale:
        vol_df[f'vol_mean{window}'] = df['Volume'].rolling(window).mean()
        vol_df[f'vol_std{window}'] = df['Volume'].rolling(window).std()
        vol_df[f'vol_z{window}'] = (df['Volume'].rolling(vol_len).median() - vol_df[f'vol_mean{window}']) / vol_df[f'vol_std{window}']
        vol_col.append(f'vol_z{window}')
    
    vol_z_df = vol_df[vol_col]
    
    return vol_z_df


In [7]:
def eucli_sort_print(df, train_df, test_df, sort_num):
    for i in random_lst:
        # 마지막 행 날짜 추출
        globals()[f"last_date{i}"] = df.dropna().tail(end).index[i]
        # 마지막 행 데이터 추출
        globals()[f"last_row{i}"] = train_df.loc[eval(f"last_date{i}")]
        # 가장 마지막 행과 다른 행들 간의 유클리디안 디스턴스 계산
        globals()[f"euclidean_distances{i}"] = train_df.apply(lambda row: distance.euclidean(row, eval(f"last_row{i}")), axis=1)  
        globals()[f"eucli{i}"] = eval(f"euclidean_distances{i}")
        # sorting 하기
        globals()[f"eucli_sort{i}"] = eval(f"eucli{i}.sort_values()")
        # sort_num 만큼 sorting 하기
        globals()[f"eucli_sort_num{i}"] = eval(f"eucli_sort{i}.head(sort_num)")

#### T0 시점에서, 성과와 거래량을 어떻게 쓰기좋은 데이터로 변환하는가? 

##### price

1) price: log(PXt0/PXt-20)/20, log(PXt0/PXt-40)/40, log(PXt0/PXt-60)/60 for price,  
2) volume: mean(Volt0-Volt-20), mean(Volt0-volT-40), mean(Volt0-Volt-60) for volume

In [8]:
ticker = "^IXIC"
period = "max" # period: max, 1y, 5y etc.

nasdaq = get_fin_data(ticker, period)
nasdaq

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,price,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
1971-02-05,100.000000,0
1971-02-08,100.839996,0
1971-02-09,100.760002,0
1971-02-10,100.690002,0
1971-02-11,101.449997,0
...,...,...
2023-05-22,12720.780273,4278920000
2023-05-23,12560.250000,4347440000
2023-05-24,12484.160156,359790000
2023-05-25,12698.089844,4651640000


In [9]:
# parameters
today = nasdaq.index[-1]#.date()
year = today - relativedelta(years=20)
# year20 = today - timedelta(days=5000)
# nasdaq = nasdaq.loc[nasdaq.index >= '2000-01-01']
window_sizes = [20, 40, 60, 80, 100]
future = [10, 20, 30, 40] # future 이후 수익률
sort_num = 30 # 유클리디안 sorting 할 길이

In [10]:
# window size 과거 대비 수익률, future 만큼 미래 수익률
train_df, test_df = cal_return(nasdaq, window_sizes, future)

display(train_df)
test_df

Unnamed: 0_level_0,return_b20,return_b40,return_b60,return_b80,return_b100
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1971-02-05,,,,,
1971-02-08,,,,,
1971-02-09,,,,,
1971-02-10,,,,,
1971-02-11,,,,,
...,...,...,...,...,...
2023-05-22,0.002762,0.001828,0.001834,0.001248,0.002059
2023-05-23,0.003125,0.001627,0.001518,0.000971,0.002068
2023-05-24,0.002588,0.001587,0.001433,0.001142,0.001752
2023-05-25,0.002238,0.001568,0.001827,0.001147,0.001933


Unnamed: 0_level_0,return_a10,return_a20,return_a30,return_a40
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1971-02-05,-0.003200,0.042300,0.053700,0.071700
1971-02-08,-0.011107,0.035403,0.042344,0.066640
1971-02-09,-0.001191,0.033545,0.039103,0.072449
1971-02-10,0.005363,0.035455,0.038832,0.076373
1971-02-11,-0.001084,0.030064,0.035683,0.073140
...,...,...,...,...
2023-05-22,,,,
2023-05-23,,,,
2023-05-24,,,,
2023-05-25,,,,


##### volume

In [11]:
scale = [100, 200]
vol_len = 5
# scale 이동평균 대비 거래량 z-score 구하기
nasdaq_vol_df = vol_zscore(nasdaq, vol_len, scale)
nasdaq_vol_df

Unnamed: 0_level_0,vol_z100,vol_z200
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
1971-02-05,,
1971-02-08,,
1971-02-09,,
1971-02-10,,
1971-02-11,,
...,...,...
2023-05-22,-1.107032,-0.868880
2023-05-23,-1.023263,-0.767167
2023-05-24,-0.879488,-0.756335
2023-05-25,-0.894689,-0.752589


In [12]:
# train_df에 거래량 feature도 추가 
train_df = pd.merge(train_df, nasdaq_vol_df, left_index=True, right_index=True)
train_df

Unnamed: 0_level_0,return_b20,return_b40,return_b60,return_b80,return_b100,vol_z100,vol_z200
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1971-02-05,,,,,,,
1971-02-08,,,,,,,
1971-02-09,,,,,,,
1971-02-10,,,,,,,
1971-02-11,,,,,,,
...,...,...,...,...,...,...,...
2023-05-22,0.002762,0.001828,0.001834,0.001248,0.002059,-1.107032,-0.868880
2023-05-23,0.003125,0.001627,0.001518,0.000971,0.002068,-1.023263,-0.767167
2023-05-24,0.002588,0.001587,0.001433,0.001142,0.001752,-0.879488,-0.756335
2023-05-25,0.002238,0.001568,0.001827,0.001147,0.001933,-0.894689,-0.752589


##### 20년 데이터 뽑기

In [13]:
train_20df = train_df[year:]
# nasdaq_df_y.drop(labels="price", axis=1, inplace=True)

train_20df

Unnamed: 0_level_0,return_b20,return_b40,return_b60,return_b80,return_b100,vol_z100,vol_z200
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2003-05-27,0.003130,0.003201,0.002529,0.002039,0.001532,0.942808,0.659740
2003-05-28,0.003031,0.003830,0.002815,0.002105,0.001212,1.254475,0.949035
2003-05-29,0.003642,0.003884,0.003098,0.002172,0.001270,1.791165,1.459001
2003-05-30,0.004022,0.003333,0.003234,0.002505,0.001159,2.103854,1.806401
2003-06-02,0.002841,0.003254,0.003327,0.002509,0.001054,2.627064,2.382706
...,...,...,...,...,...,...,...
2023-05-22,0.002762,0.001828,0.001834,0.001248,0.002059,-1.107032,-0.868880
2023-05-23,0.003125,0.001627,0.001518,0.000971,0.002068,-1.023263,-0.767167
2023-05-24,0.002588,0.001587,0.001433,0.001142,0.001752,-0.879488,-0.756335
2023-05-25,0.002238,0.001568,0.001827,0.001147,0.001933,-0.894689,-0.752589


In [14]:
# 해당 날짜와 가장 가까운 인덱스 30개추출 abs distance기준
def extract_closest_indices(df, target_index, n=30):
    df_index = pd.to_datetime(df.index)
    distances = {}
    target_index = pd.to_datetime(target_index)
    
    if target_index not in df_index:
        target_index = df_index[df_index <= target_index][-1]
    
    target_value = df.loc[target_index].values[0]
    for index in df.index:
        if index != target_index and index > target_index:
            distance = abs(df.loc[index].values[0] - target_value)
            distances[index] = distance

    closest_indices = sorted(distances, key=distances.get)[:n]
    returns = []
    for index in closest_indices:
        if index + pd.DateOffset(days=10) in df.index:
            return_10 = (nasdaq.loc[index + pd.DateOffset(days=10), 'price'] - nasdaq.loc[index, 'price']) / nasdaq.loc[index, 'price']
        else:
            closest_date = df_index[df_index <= index + pd.DateOffset(days=10)][-1]
            return_10 = (nasdaq.loc[closest_date, 'price'] - nasdaq.loc[index, 'price']) / nasdaq.loc[index, 'price']
        
        if index + pd.DateOffset(days=20) in df.index:
            return_20 = (nasdaq.loc[index + pd.DateOffset(days=20), 'price'] - nasdaq.loc[index, 'price']) / nasdaq.loc[index, 'price']
        else:
            closest_date = df_index[df_index <= index + pd.DateOffset(days=20)][-1]
            return_20 = (nasdaq.loc[closest_date, 'price'] - nasdaq.loc[index, 'price']) / nasdaq.loc[index, 'price']
        
        if index + pd.DateOffset(days=30) in df.index:
            return_30 = (nasdaq.loc[index + pd.DateOffset(days=30), 'price'] - nasdaq.loc[index, 'price']) / nasdaq.loc[index, 'price']
        else:
            closest_date = df_index[df_index <= index + pd.DateOffset(days=30)][-1]
            return_30 = (nasdaq.loc[closest_date, 'price'] - nasdaq.loc[index, 'price']) / nasdaq.loc[index, 'price']
        
        returns.append([return_10, return_20, return_30])

    returns_df = pd.DataFrame(returns, columns=['Return 10', 'Return 20', 'Return 30'])
    mean = returns_df.mean()
    std = returns_df.std()

    return mean['Return 10'], std['Return 10'], mean['Return 20'], std['Return 20'], mean['Return 30'], std['Return 30']


In [15]:
extract_closest_indices(train_20df, '2022-01-01')

(0.014474159712647027,
 0.03595346035037832,
 0.020894153190863175,
 0.04615036281855985,
 0.030406811931159166,
 0.04966496791017634)

In [16]:
def calculate_returns(start_date, end_date, df):
    result_df = pd.DataFrame(columns=['current_date', 'return_10', 'return_20', 'return_30', 'mean_10', 'std_10', 'mean_20', 'std_20', 'mean_30', 'std_30'])
    
    df_index = pd.to_datetime(df.index)
    start_date = pd.to_datetime(start_date)
    end_date = pd.to_datetime(end_date)
    
    if start_date not in df_index:
        closest_start_date = df_index[df_index <= start_date][-1]
        start_date = df_index[df_index == closest_start_date][0]
        
    if end_date not in df_index:
        closest_end_date = df_index[df_index >= end_date][0]
        end_date = df_index[df_index == closest_end_date][0]
    
    current_index = start_date

    while current_index <= end_date:
        mean_10, std_10, mean_20, std_20, mean_30, std_30 = extract_closest_indices(df, current_index, n=30)

        next_date_10 = current_index + pd.DateOffset(days=10)
        next_date_20 = current_index + pd.DateOffset(days=20)
        next_date_30 = current_index + pd.DateOffset(days=30)
        current_index = df_index[df_index >= current_index][0]
        
        if next_date_10 in df.index:
            return_10 = (df.loc[next_date_10, 'price'] - df.loc[current_index, 'price']) / df.loc[current_index, 'price']
        else:
            closest_date = df_index[df_index >= next_date_10][0]
            return_10 = (df.loc[closest_date, 'price'] - df.loc[current_index, 'price']) / df.loc[current_index, 'price']

        if next_date_20 in df.index:
            return_20 = (df.loc[next_date_20, 'price'] - df.loc[current_index, 'price']) / df.loc[current_index, 'price']
        else:
            closest_date = df_index[df_index >= next_date_20][0]
            return_20 = (df.loc[closest_date, 'price'] - df.loc[current_index, 'price']) / df.loc[current_index, 'price']

        if next_date_30 in df.index:
            return_30 = (df.loc[next_date_30, 'price'] - df.loc[current_index, 'price']) / df.loc[current_index, 'price']
        else:
            closest_date = df_index[df_index >= next_date_30][0]
            return_30 = (df.loc[closest_date, 'price'] - df.loc[current_index, 'price']) / df.loc[current_index, 'price']
        
        result_df.loc[current_index] = [current_index, return_10, return_20, return_30, mean_10, std_10, mean_20, std_20, mean_30, std_30]
        
        current_index += pd.DateOffset(days=1)
    

    return result_df


In [17]:
# 실제 10,20,30수익률, 추출값 10,20,30일 수익률 평균 표준편차 순으로 df 형성 함수
calculate_returns('2022-01-01', '2022-01-20', nasdaq)

Unnamed: 0,current_date,return_10,return_20,return_30,mean_10,std_10,mean_20,std_20,mean_30,std_30
2021-12-31,2021-12-31,-0.04488,-0.095299,-0.089811,-0.042986,0.027937,-0.06656,0.03408,-0.082228,0.035108
2022-01-03,2022-01-03,-0.042908,-0.130355,-0.100609,-0.042986,0.027937,-0.06656,0.03408,-0.082228,0.035108
2022-01-04,2022-01-04,-0.046661,-0.113142,-0.111626,-0.039636,0.02945,-0.061566,0.030786,-0.079589,0.035029
2022-01-05,2022-01-05,-0.039289,-0.103368,-0.066367,-0.039892,0.02925,-0.058884,0.030522,-0.079185,0.035256
2022-01-06,2022-01-06,-0.038059,-0.102033,-0.070632,-0.038667,0.031149,-0.056803,0.029593,-0.079606,0.035158
2022-01-07,2022-01-07,-0.028723,-0.105994,-0.061612,-0.039883,0.030404,-0.056647,0.029338,-0.08194,0.035861
2022-01-10,2022-01-10,-0.029173,-0.07845,-0.062047,-0.039883,0.030404,-0.056647,0.029338,-0.08194,0.035861
2022-01-11,2022-01-11,-0.091367,-0.060288,-0.063867,-0.035262,0.031453,-0.05537,0.031947,-0.08429,0.036139
2022-01-12,2022-01-12,-0.087782,-0.055463,-0.091994,-0.032357,0.029878,-0.055542,0.031961,-0.082581,0.036965
2022-01-13,2022-01-13,-0.064273,-0.026289,-0.06861,-0.029964,0.029626,-0.054819,0.032877,-0.082262,0.037131


#### 2021.01~2022.12 백테스트

In [34]:
# 실제 10,20,30수익률, 추출값 10,20,30일 수익률 평균 표준편차 순으로 df 형성 함수
backtest_df = calculate_returns('2021-01-01', '2022-12-30', nasdaq)
backtest_df

Unnamed: 0,current_date,return_10,return_20,return_30,mean_10,std_10,mean_20,std_20,mean_30,std_30
2020-12-31,2020-12-31,0.011495,0.044146,0.039967,0.014554,0.048190,0.004900,0.079237,0.014114,0.076857
2021-01-04,2021-01-04,0.026616,0.065556,0.055514,0.014554,0.048190,0.004900,0.079237,0.014114,0.076857
2021-01-05,2021-01-05,0.014006,0.063736,0.074794,0.009746,0.049546,-0.005296,0.076255,-0.014370,0.077591
2021-01-06,2021-01-06,0.035821,0.069483,0.087554,0.010000,0.057614,-0.006012,0.080403,-0.012608,0.075004
2021-01-07,2021-01-07,0.009925,0.015544,0.070416,0.009848,0.048355,0.006971,0.073432,0.018839,0.075862
...,...,...,...,...,...,...,...,...,...,...
2022-12-23,2022-12-23,-0.010562,0.047937,0.082545,0.033977,0.027831,0.065157,0.025722,0.080131,0.044427
2022-12-27,2022-12-27,0.003260,0.070116,0.097668,0.033977,0.027831,0.065157,0.025722,0.080131,0.044427
2022-12-28,2022-12-28,0.041354,0.086340,0.137901,0.032317,0.029086,0.059537,0.031481,0.073994,0.044073
2022-12-29,2022-12-29,0.015037,0.045707,0.087394,0.032553,0.028916,0.057785,0.033664,0.071273,0.044346


In [29]:
# # 새로운 컬럼 만들기
# backtest_df['result'] = np.NaN
# backtest_df

Unnamed: 0,current_date,return_10,return_20,return_30,mean_10,std_10,mean_20,std_20,mean_30,std_30,result
2020-12-31,2020-12-31,0.011495,0.044146,0.039967,0.014554,0.048190,0.004900,0.079237,0.014114,0.076857,
2021-01-04,2021-01-04,0.026616,0.065556,0.055514,0.014554,0.048190,0.004900,0.079237,0.014114,0.076857,
2021-01-05,2021-01-05,0.014006,0.063736,0.074794,0.009746,0.049546,-0.005296,0.076255,-0.014370,0.077591,
2021-01-06,2021-01-06,0.035821,0.069483,0.087554,0.010000,0.057614,-0.006012,0.080403,-0.012608,0.075004,
2021-01-07,2021-01-07,0.009925,0.015544,0.070416,0.009848,0.048355,0.006971,0.073432,0.018839,0.075862,
...,...,...,...,...,...,...,...,...,...,...,...
2022-12-23,2022-12-23,-0.010562,0.047937,0.082545,0.033977,0.027831,0.065157,0.025722,0.080131,0.044427,
2022-12-27,2022-12-27,0.003260,0.070116,0.097668,0.033977,0.027831,0.065157,0.025722,0.080131,0.044427,
2022-12-28,2022-12-28,0.041354,0.086340,0.137901,0.032317,0.029086,0.059537,0.031481,0.073994,0.044073,
2022-12-29,2022-12-29,0.015037,0.045707,0.087394,0.032553,0.028916,0.057785,0.033664,0.071273,0.044346,


In [23]:
# # 20일 후 수익률 결과 판정
# for index, row in backtest_df.iterrows():
#     mean = row['mean_20']
#     std = row['std_20']
#     real = row['return_20']
    
#     plus_std = mean + std
#     minus_std = mean - std
    
#     # range에 들어온 경우
#     if (mean <= plus_std) and (mean >= minus_std):
#         # 방향성도 맞는 경우
#         if (real * mean) > 0:
#             backtest_df.loc[index, 'result'] = "good"
#         # range 안이지만 방향성이 다른 경우
#         else:
#             backtest_df.loc[index, 'result'] = "inside"
#     else:
#         backtest_df.loc[index, 'result'] = "bad"
        
# backtest_df

Unnamed: 0,current_date,return_10,return_20,return_30,mean_10,std_10,mean_20,std_20,mean_30,std_30,result
2020-12-31,2020-12-31,0.011495,0.044146,0.039967,0.014554,0.048190,0.004900,0.079237,0.014114,0.076857,good
2021-01-04,2021-01-04,0.026616,0.065556,0.055514,0.014554,0.048190,0.004900,0.079237,0.014114,0.076857,good
2021-01-05,2021-01-05,0.014006,0.063736,0.074794,0.009746,0.049546,-0.005296,0.076255,-0.014370,0.077591,inside
2021-01-06,2021-01-06,0.035821,0.069483,0.087554,0.010000,0.057614,-0.006012,0.080403,-0.012608,0.075004,inside
2021-01-07,2021-01-07,0.009925,0.015544,0.070416,0.009848,0.048355,0.006971,0.073432,0.018839,0.075862,good
...,...,...,...,...,...,...,...,...,...,...,...
2022-12-23,2022-12-23,-0.010562,0.047937,0.082545,0.033977,0.027831,0.065157,0.025722,0.080131,0.044427,good
2022-12-27,2022-12-27,0.003260,0.070116,0.097668,0.033977,0.027831,0.065157,0.025722,0.080131,0.044427,good
2022-12-28,2022-12-28,0.041354,0.086340,0.137901,0.032317,0.029086,0.059537,0.031481,0.073994,0.044073,good
2022-12-29,2022-12-29,0.015037,0.045707,0.087394,0.032553,0.028916,0.057785,0.033664,0.071273,0.044346,good


In [26]:
# backtest_df.isnull().sum()

current_date    0
return_10       0
return_20       0
return_30       0
mean_10         0
std_10          0
mean_20         0
std_20          0
mean_30         0
std_30          0
result          0
dtype: int64

In [25]:
# backtest_df.groupby('result').size()

result
good      281
inside    223
dtype: int64

In [31]:
# # 20일 후 수익률 결과 판정22
# backtest_df['result'] = np.NaN

# for index, row in backtest_df.iterrows():
#     mean = row['mean_20']
#     std = row['std_20']
#     real = row['return_20']
    
#     plus_std = mean + std
#     minus_std = mean - std
    
#     # 표준편차가 5% 이상일 때
#     if std >= 0.05:
#         backtest_df.loc[index, 'result'] = "large_std"
#     else:
#         # range에 들어온 경우
#         if (mean <= plus_std) and (mean >= minus_std):
#             # 방향성도 맞는 경우
#             if (real * mean) > 0:
#                 backtest_df.loc[index, 'result'] = "good"
#             # range 안이지만 방향성이 다른 경우
#             else:
#                 backtest_df.loc[index, 'result'] = "inside"
#         else:
#             backtest_df.loc[index, 'result'] = "bad"
        
# backtest_df

Unnamed: 0,current_date,return_10,return_20,return_30,mean_10,std_10,mean_20,std_20,mean_30,std_30,result
2020-12-31,2020-12-31,0.011495,0.044146,0.039967,0.014554,0.048190,0.004900,0.079237,0.014114,0.076857,large_std
2021-01-04,2021-01-04,0.026616,0.065556,0.055514,0.014554,0.048190,0.004900,0.079237,0.014114,0.076857,large_std
2021-01-05,2021-01-05,0.014006,0.063736,0.074794,0.009746,0.049546,-0.005296,0.076255,-0.014370,0.077591,large_std
2021-01-06,2021-01-06,0.035821,0.069483,0.087554,0.010000,0.057614,-0.006012,0.080403,-0.012608,0.075004,large_std
2021-01-07,2021-01-07,0.009925,0.015544,0.070416,0.009848,0.048355,0.006971,0.073432,0.018839,0.075862,large_std
...,...,...,...,...,...,...,...,...,...,...,...
2022-12-23,2022-12-23,-0.010562,0.047937,0.082545,0.033977,0.027831,0.065157,0.025722,0.080131,0.044427,good
2022-12-27,2022-12-27,0.003260,0.070116,0.097668,0.033977,0.027831,0.065157,0.025722,0.080131,0.044427,good
2022-12-28,2022-12-28,0.041354,0.086340,0.137901,0.032317,0.029086,0.059537,0.031481,0.073994,0.044073,good
2022-12-29,2022-12-29,0.015037,0.045707,0.087394,0.032553,0.028916,0.057785,0.033664,0.071273,0.044346,good


In [32]:
# print(backtest_df.isnull().sum())
# backtest_df.groupby('result').size()

current_date    0
return_10       0
return_20       0
return_30       0
mean_10         0
std_10          0
mean_20         0
std_20          0
mean_30         0
std_30          0
result          0
dtype: int64


result
good         154
inside       103
large_std    247
dtype: int64

In [35]:
# 10, 20, 30일 후 수익률 결과 판정
backtest_df['result10'] = np.NaN
backtest_df['result20'] = np.NaN
backtest_df['result30'] = np.NaN
future = [10, 20, 30]

for index, row in backtest_df.iterrows():
    # plus_std, minus_std
    for window in future:
        locals()[f"mean{window}"] = row[f"mean_{window}"]
        locals()[f"std{window}"] = row[f"std_{window}"]
        locals()[f"real{window}"] = row[f"return_{window}"]
        locals()[f"plus_std{window}"] = eval(f"mean{window}") + eval(f"std{window}")
        locals()[f"minus_std{window}"] = eval(f"mean{window}") - eval(f"std{window}")
            # 표준편차가 5% 이상일 때
        if eval(f"std{window}") >= 0.05:
            backtest_df.loc[index, f"result{window}"] = "large_std"
        else:
            # range에 들어온 경우
            if (eval(f"mean{window}") <= eval(f"plus_std{window}")) and (eval(f"mean{window}") >= eval(f"minus_std{window}")):
                # 방향성도 맞는 경우
                if (eval(f"real{window}") * eval(f"mean{window}")) > 0:
                    backtest_df.loc[index, f"result{window}"] = "good"
                # range 안이지만 방향성이 다른 경우
                else:
                    backtest_df.loc[index, f"result{window}"] = "inside"
            else:
                backtest_df.loc[index, f"result{window}"] = "bad"
#     mean = row['mean']    
#     std = row['std']    
#     real = row['return']
#     plus_std = mean + std
#     minus_std = mean - std
    

        
backtest_df

Unnamed: 0,current_date,return_10,return_20,return_30,mean_10,std_10,mean_20,std_20,mean_30,std_30,result10,result20,result30
2020-12-31,2020-12-31,0.011495,0.044146,0.039967,0.014554,0.048190,0.004900,0.079237,0.014114,0.076857,good,large_std,large_std
2021-01-04,2021-01-04,0.026616,0.065556,0.055514,0.014554,0.048190,0.004900,0.079237,0.014114,0.076857,good,large_std,large_std
2021-01-05,2021-01-05,0.014006,0.063736,0.074794,0.009746,0.049546,-0.005296,0.076255,-0.014370,0.077591,good,large_std,large_std
2021-01-06,2021-01-06,0.035821,0.069483,0.087554,0.010000,0.057614,-0.006012,0.080403,-0.012608,0.075004,large_std,large_std,large_std
2021-01-07,2021-01-07,0.009925,0.015544,0.070416,0.009848,0.048355,0.006971,0.073432,0.018839,0.075862,good,large_std,large_std
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-23,2022-12-23,-0.010562,0.047937,0.082545,0.033977,0.027831,0.065157,0.025722,0.080131,0.044427,inside,good,good
2022-12-27,2022-12-27,0.003260,0.070116,0.097668,0.033977,0.027831,0.065157,0.025722,0.080131,0.044427,good,good,good
2022-12-28,2022-12-28,0.041354,0.086340,0.137901,0.032317,0.029086,0.059537,0.031481,0.073994,0.044073,good,good,good
2022-12-29,2022-12-29,0.015037,0.045707,0.087394,0.032553,0.028916,0.057785,0.033664,0.071273,0.044346,good,good,good


In [39]:
print(backtest_df.isnull().sum() ,"\n")
print(backtest_df.groupby('result10').size(),"\n")
print(backtest_df.groupby('result20').size(),"\n")
print(backtest_df.groupby('result30').size(),"\n")

current_date    0
return_10       0
return_20       0
return_30       0
mean_10         0
std_10          0
mean_20         0
std_20          0
mean_30         0
std_30          0
result10        0
result20        0
result30        0
dtype: int64 

result10
good         232
inside       246
large_std     26
dtype: int64 

result20
good         154
inside       103
large_std    247
dtype: int64 

result30
good         115
inside        32
large_std    357
dtype: int64 



In [44]:
# 모두 good 인것만
condition = (backtest_df['result10'] == "good") & (backtest_df['result20'] == "good") & (backtest_df['result30'] == "good")  
print(backtest_df[condition].shape[0])
backtest_df[condition]

44


Unnamed: 0,current_date,return_10,return_20,return_30,mean_10,std_10,mean_20,std_20,mean_30,std_30,result10,result20,result30
2021-02-09,2021-02-09,-0.009512,-0.029903,-0.043478,-0.008073,0.026001,-0.025186,0.044834,-0.025986,0.043751,good,good,good
2021-02-10,2021-02-10,-0.031453,-0.043925,-0.046711,-0.008478,0.027114,-0.022957,0.046473,-0.024505,0.048955,good,good,good
2021-02-11,2021-02-11,-0.03513,-0.073295,-0.040359,-0.008689,0.02496,-0.022671,0.047512,-0.019909,0.040307,good,good,good
2021-02-17,2021-02-17,-0.026971,-0.063848,-0.053722,-0.003881,0.023692,-0.015395,0.042243,-0.023792,0.048653,good,good,good
2021-07-02,2021-07-02,0.006415,0.003092,0.002851,0.002465,0.027355,0.01214,0.039297,0.023835,0.04542,good,good,good
2021-07-06,2021-07-06,0.000955,0.011822,0.001189,0.002465,0.027355,0.01214,0.039297,0.023835,0.04542,good,good,good
2021-07-13,2021-07-13,0.010856,0.000233,0.009444,0.00173,0.02875,0.013806,0.040847,0.023495,0.048284,good,good,good
2021-07-14,2021-07-14,0.013367,0.007944,0.012151,0.004144,0.028305,0.014695,0.041394,0.026827,0.049537,good,good,good
2021-08-30,2021-08-30,0.007103,-0.014537,-0.019384,0.00339,0.037098,-0.008411,0.046729,-0.00724,0.044364,good,good,good
2021-11-04,2021-11-04,-0.005424,-0.005965,-0.044865,-0.022372,0.025496,-0.035233,0.036515,-0.05615,0.048987,good,good,good
