In [1]:
import pandas as pd
import numpy as np
# from yahoo_fin import options
import yfinance as yf
from datetime import datetime, timedelta
from datetime import date
from dateutil.parser import parse # 데이트 형식 자동변환
from copy import copy
from scipy.spatial import distance
from dateutil.relativedelta import relativedelta

In [2]:
def inequal_neg_mae(y_true, y_pred):
    gamma = 0.95
    sign_y_true = np.sign(y_true)
    loss =  sign_y_true * np.where(
        sign_y_true * (y_pred - y_true/2) >0, 
        (1 - gamma) * sign_y_true * np.abs(y_pred - y_true),
        - y_pred + (1 - gamma/2) * y_true 
        ) 
    # capped_loss = np.where(loss > 0.05, 0.05, loss)
    return 1 * loss

In [3]:
def show_score(func):
    l_pred_val = np.linspace(-0.2, 0.2, 1000)
    l_high_val = -0.1 * np.ones(1000)
    l_str_val = - 0.05 * np.ones(1000)
    l_med_val =  0.1 * np.ones(1000)
    l_weak_val = 0.05 * np.ones(1000)
    l_val = [l_high_val, l_str_val, l_med_val, l_weak_val]
    l_cols = ['red', 'darkorange', 'limegreen', 'lightseagreen']
    l_names= ['true val = ' +str(i[0]) for i in l_val]

    fig = plt.subplots(figsize=(4.5,4))
    # ax.axis('off')
    plt.axvline(x=0, color='k', linewidth=1)
    plt.axhline(y=0, color='k', linewidth=1)
    plt.xlim(-0.15, 0.15)
    plt.ylim(-0.15, 0.15)
    for true_type_, col_, name_ in zip(l_val, l_cols, l_names):
        plt.plot(l_pred_val, func(true_type_, l_pred_val), color = col_, label=name_)
        plt.xlabel('predict', fontsize=20)
        plt.ylabel(func.__name__, fontsize=20)
    #     ax.plot(l_pred_val, inequal_neg_mae(-true_type_, l_pred_val), color = col_)
    plt.legend()
    plt.show()
    return

##### functions

In [4]:
# 주가, 거래량 데이터 받는 함수
def get_fin_data(ticker, period):
    # yahoo finance에서 데이터 불러오기
    df = pd.DataFrame(yf.download(tickers=ticker, period=period)[['Adj Close','Volume']])
    df.rename(columns = {'Adj Close':'price'},inplace=True)
    
    return df

In [5]:
# 과거 대비 수익률 데이터 & 미래 수익률 데이터
def cal_return(df, window_sizes, future):
    df_return =copy(df)
    train_col = [] # train data 들어갈 것
    test_col = [] # 실제 비교할 data에 들어갈 것
    
    # window 사이즈만큼 이동한 주가 데이터
    for window in window_sizes:
        df_return[f'p_b{window}'] = df['price'].shift(window)
    
    # 과거 대비 로그 평균 수익률 데이터 
    for window in window_sizes:
        df_return[f'return_b{window}'] = (np.log(df['price']) - np.log(df_return[f'p_b{window}'])) / window  
        train_col.append(f'return_b{window}')
        
    # 미래만큼 이동한 데이터
    for after in future:
        df_return[f'p_a{after}'] = df['price'].shift(-after)
        
    # 미래 일반(로그x) 수익률
    for after in future:
        df_return[f'return_a{after}'] = (df_return[f'p_a{after}']) / (df_return['price']) -1
        test_col.append(f'return_a{after}')
    
    train_df = df_return[train_col]
    test_df = df_return[test_col]
    
    return train_df, test_df

In [6]:
# 거래량 z-score 구하기
# (data - mean(data, axis=0)) / std(data, axis=0)
def vol_zscore(df, vol_len, scale):
    vol_df = copy(df)
    vol_col = []
    for window in scale:
        vol_df[f'vol_mean{window}'] = df['Volume'].rolling(window).mean()
        vol_df[f'vol_std{window}'] = df['Volume'].rolling(window).std()
        vol_df[f'vol_z{window}'] = (df['Volume'].rolling(vol_len).median() - vol_df[f'vol_mean{window}']) / vol_df[f'vol_std{window}']
        vol_col.append(f'vol_z{window}')
    
    vol_z_df = vol_df[vol_col]
    
    return vol_z_df


In [7]:
def eucli_sort_print(df, train_df, test_df, sort_num):
    for i in random_lst:
        # 마지막 행 날짜 추출
        globals()[f"last_date{i}"] = df.dropna().tail(end).index[i]
        # 마지막 행 데이터 추출
        globals()[f"last_row{i}"] = train_df.loc[eval(f"last_date{i}")]
        # 가장 마지막 행과 다른 행들 간의 유클리디안 디스턴스 계산
        globals()[f"euclidean_distances{i}"] = train_df.apply(lambda row: distance.euclidean(row, eval(f"last_row{i}")), axis=1)  
        globals()[f"eucli{i}"] = eval(f"euclidean_distances{i}")
        # sorting 하기
        globals()[f"eucli_sort{i}"] = eval(f"eucli{i}.sort_values()")
        # sort_num 만큼 sorting 하기
        globals()[f"eucli_sort_num{i}"] = eval(f"eucli_sort{i}.head(sort_num)")

#### T0 시점에서, 성과와 거래량을 어떻게 쓰기좋은 데이터로 변환하는가? 

##### price

1) price: log(PXt0/PXt-20)/20, log(PXt0/PXt-40)/40, log(PXt0/PXt-60)/60 for price,  
2) volume: mean(Volt0-Volt-20), mean(Volt0-volT-40), mean(Volt0-Volt-60) for volume

In [8]:
ticker = "^IXIC"
period = "max" # period: max, 1y, 5y etc.

nasdaq = get_fin_data(ticker, period)
nasdaq

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,price,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
1971-02-05,100.000000,0
1971-02-08,100.839996,0
1971-02-09,100.760002,0
1971-02-10,100.690002,0
1971-02-11,101.449997,0
...,...,...
2023-05-15,12365.209961,3979290000
2023-05-16,12343.049805,4067510000
2023-05-17,12500.570312,4501820000
2023-05-18,12688.839844,4532890000


In [9]:
# parameters
today = nasdaq.index[-1]#.date()
year = today - relativedelta(years=20)
# year20 = today - timedelta(days=5000)
# nasdaq = nasdaq.loc[nasdaq.index >= '2000-01-01']
window_sizes = [20, 40, 60, 80, 100]
future = [10, 20, 30, 40] # future 이후 수익률
sort_num = 30 # 유클리디안 sorting 할 길이

In [10]:
# window size 과거 대비 수익률, future 만큼 미래 수익률
train_df, test_df = cal_return(nasdaq, window_sizes, future)

display(train_df)
test_df

Unnamed: 0_level_0,return_b20,return_b40,return_b60,return_b80,return_b100
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1971-02-05,,,,,
1971-02-08,,,,,
1971-02-09,,,,,
1971-02-10,,,,,
1971-02-11,,,,,
...,...,...,...,...,...
2023-05-15,0.000846,0.001531,0.000701,0.001631,0.001591
2023-05-16,0.000774,0.001390,0.000768,0.001281,0.001572
2023-05-17,0.001393,0.001315,0.001402,0.001191,0.001547
2023-05-18,0.002543,0.002093,0.001629,0.001411,0.001916


Unnamed: 0_level_0,return_a10,return_a20,return_a30,return_a40
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1971-02-05,-0.003200,0.042300,0.053700,0.071700
1971-02-08,-0.011107,0.035403,0.042344,0.066640
1971-02-09,-0.001191,0.033545,0.039103,0.072449
1971-02-10,0.005363,0.035455,0.038832,0.076373
1971-02-11,-0.001084,0.030064,0.035683,0.073140
...,...,...,...,...
2023-05-15,,,,
2023-05-16,,,,
2023-05-17,,,,
2023-05-18,,,,


##### volume

In [11]:
scale = [100, 200]
vol_len = 5
# scale 이동평균 대비 거래량 z-score 구하기
nasdaq_vol_df = vol_zscore(nasdaq, vol_len, scale)
nasdaq_vol_df

Unnamed: 0_level_0,vol_z100,vol_z200
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
1971-02-05,,
1971-02-08,,
1971-02-09,,
1971-02-10,,
1971-02-11,,
...,...,...
2023-05-15,-1.260264,-1.056161
2023-05-16,-1.239836,-1.046435
2023-05-17,-1.242831,-1.047692
2023-05-18,-1.231214,-1.045185


In [12]:
# train_df에 거래량 feature도 추가 
train_df = pd.merge(train_df, nasdaq_vol_df, left_index=True, right_index=True)
train_df

Unnamed: 0_level_0,return_b20,return_b40,return_b60,return_b80,return_b100,vol_z100,vol_z200
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1971-02-05,,,,,,,
1971-02-08,,,,,,,
1971-02-09,,,,,,,
1971-02-10,,,,,,,
1971-02-11,,,,,,,
...,...,...,...,...,...,...,...
2023-05-15,0.000846,0.001531,0.000701,0.001631,0.001591,-1.260264,-1.056161
2023-05-16,0.000774,0.001390,0.000768,0.001281,0.001572,-1.239836,-1.046435
2023-05-17,0.001393,0.001315,0.001402,0.001191,0.001547,-1.242831,-1.047692
2023-05-18,0.002543,0.002093,0.001629,0.001411,0.001916,-1.231214,-1.045185


##### 20년 데이터 뽑기

In [13]:
train_20df = train_df[year:]
# nasdaq_df_y.drop(labels="price", axis=1, inplace=True)

train_20df

Unnamed: 0_level_0,return_b20,return_b40,return_b60,return_b80,return_b100,vol_z100,vol_z200
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2003-05-19,0.002345,0.001217,0.001688,0.000907,0.000773,1.430488,1.120602
2003-05-20,0.001350,0.002121,0.002001,0.001316,0.000829,1.340528,0.990653
2003-05-21,0.000802,0.001716,0.001905,0.001463,0.000854,0.996525,0.675817
2003-05-22,0.001697,0.002075,0.002422,0.001452,0.001116,0.987466,0.669566
2003-05-23,0.002566,0.002175,0.002193,0.001326,0.001198,0.916527,0.613600
...,...,...,...,...,...,...,...
2023-05-15,0.000846,0.001531,0.000701,0.001631,0.001591,-1.260264,-1.056161
2023-05-16,0.000774,0.001390,0.000768,0.001281,0.001572,-1.239836,-1.046435
2023-05-17,0.001393,0.001315,0.001402,0.001191,0.001547,-1.242831,-1.047692
2023-05-18,0.002543,0.002093,0.001629,0.001411,0.001916,-1.231214,-1.045185


In [14]:
# 해당 날짜와 가장 가까운 인덱스 30개추출 abs distance기준
def extract_closest_indices(df, target_index, n=30):
    df_index = pd.to_datetime(df.index)
    distances = {}
    target_index = pd.to_datetime(target_index)
    
    if target_index not in df_index:
        target_index = df_index[df_index <= target_index][-1]
    
    target_value = df.loc[target_index].values[0]
    for index in df.index:
        if index != target_index and index > target_index:
            distance = abs(df.loc[index].values[0] - target_value)
            distances[index] = distance

    closest_indices = sorted(distances, key=distances.get)[:n]
    returns = []
    for index in closest_indices:
        if index + pd.DateOffset(days=10) in df.index:
            return_10 = (nasdaq.loc[index + pd.DateOffset(days=10), 'price'] - nasdaq.loc[index, 'price']) / nasdaq.loc[index, 'price']
        else:
            closest_date = df_index[df_index <= index + pd.DateOffset(days=10)][-1]
            return_10 = (nasdaq.loc[closest_date, 'price'] - nasdaq.loc[index, 'price']) / nasdaq.loc[index, 'price']
        
        if index + pd.DateOffset(days=20) in df.index:
            return_20 = (nasdaq.loc[index + pd.DateOffset(days=20), 'price'] - nasdaq.loc[index, 'price']) / nasdaq.loc[index, 'price']
        else:
            closest_date = df_index[df_index <= index + pd.DateOffset(days=20)][-1]
            return_20 = (nasdaq.loc[closest_date, 'price'] - nasdaq.loc[index, 'price']) / nasdaq.loc[index, 'price']
        
        if index + pd.DateOffset(days=30) in df.index:
            return_30 = (nasdaq.loc[index + pd.DateOffset(days=30), 'price'] - nasdaq.loc[index, 'price']) / nasdaq.loc[index, 'price']
        else:
            closest_date = df_index[df_index <= index + pd.DateOffset(days=30)][-1]
            return_30 = (nasdaq.loc[closest_date, 'price'] - nasdaq.loc[index, 'price']) / nasdaq.loc[index, 'price']
        
        returns.append([return_10, return_20, return_30])

    returns_df = pd.DataFrame(returns, columns=['Return 10', 'Return 20', 'Return 30'])
    mean = returns_df.mean()
    std = returns_df.std()

    return mean['Return 10'], std['Return 10'], mean['Return 20'], std['Return 20'], mean['Return 30'], std['Return 30']


In [15]:
extract_closest_indices(train_20df, '2022-01-01')

(0.0133074219342758,
 0.035179160590634896,
 0.01543615898903014,
 0.0430088012045397,
 0.021920759968938882,
 0.045311611687463955)

In [16]:
def calculate_returns(start_date, end_date, df):
    result_df = pd.DataFrame(columns=['current_date', 'return_10', 'return_20', 'return_30', 'mean_10', 'std_10', 'mean_20', 'std_20', 'mean_30', 'std_30'])
    
    df_index = pd.to_datetime(df.index)
    start_date = pd.to_datetime(start_date)
    end_date = pd.to_datetime(end_date)
    
    if start_date not in df_index:
        closest_start_date = df_index[df_index <= start_date][-1]
        start_date = df_index[df_index == closest_start_date][0]
        
    if end_date not in df_index:
        closest_end_date = df_index[df_index >= end_date][0]
        end_date = df_index[df_index == closest_end_date][0]
    
    current_index = start_date

    while current_index <= end_date:
        mean_10, std_10, mean_20, std_20, mean_30, std_30 = extract_closest_indices(df, current_index, n=30)

        next_date_10 = current_index + pd.DateOffset(days=10)
        next_date_20 = current_index + pd.DateOffset(days=20)
        next_date_30 = current_index + pd.DateOffset(days=30)
        current_index = df_index[df_index >= current_index][0]
        
        if next_date_10 in df.index:
            return_10 = (df.loc[next_date_10, 'price'] - df.loc[current_index, 'price']) / df.loc[current_index, 'price']
        else:
            closest_date = df_index[df_index >= next_date_10][0]
            return_10 = (df.loc[closest_date, 'price'] - df.loc[current_index, 'price']) / df.loc[current_index, 'price']

        if next_date_20 in df.index:
            return_20 = (df.loc[next_date_20, 'price'] - df.loc[current_index, 'price']) / df.loc[current_index, 'price']
        else:
            closest_date = df_index[df_index >= next_date_20][0]
            return_20 = (df.loc[closest_date, 'price'] - df.loc[current_index, 'price']) / df.loc[current_index, 'price']

        if next_date_30 in df.index:
            return_30 = (df.loc[next_date_30, 'price'] - df.loc[current_index, 'price']) / df.loc[current_index, 'price']
        else:
            closest_date = df_index[df_index >= next_date_30][0]
            return_30 = (df.loc[closest_date, 'price'] - df.loc[current_index, 'price']) / df.loc[current_index, 'price']
        
        result_df.loc[current_index] = [current_index, return_10, return_20, return_30, mean_10, std_10, mean_20, std_20, mean_30, std_30]
        
        current_index += pd.DateOffset(days=1)
    

    return result_df


In [17]:
# 실제 10,20,30수익률, 추출값 10,20,30일 수익률 평균 표준편차 순으로 df 형성 함수
calculate_returns('2022-01-01', '2022-01-20', nasdaq)

Unnamed: 0,current_date,return_10,return_20,return_30,mean_10,std_10,mean_20,std_20,mean_30,std_30
2021-12-31,2021-12-31,-0.04488,-0.095299,-0.089811,-0.042986,0.027937,-0.06656,0.03408,-0.082228,0.035108
2022-01-03,2022-01-03,-0.042908,-0.130355,-0.100609,-0.042986,0.027937,-0.06656,0.03408,-0.082228,0.035108
2022-01-04,2022-01-04,-0.046661,-0.113142,-0.111626,-0.039636,0.02945,-0.061566,0.030786,-0.079589,0.035029
2022-01-05,2022-01-05,-0.039289,-0.103368,-0.066367,-0.039892,0.02925,-0.058884,0.030522,-0.079185,0.035256
2022-01-06,2022-01-06,-0.038059,-0.102033,-0.070632,-0.038667,0.031149,-0.056803,0.029593,-0.079606,0.035158
2022-01-07,2022-01-07,-0.028723,-0.105994,-0.061612,-0.039883,0.030404,-0.056647,0.029338,-0.08194,0.035861
2022-01-10,2022-01-10,-0.029173,-0.07845,-0.062047,-0.039883,0.030404,-0.056647,0.029338,-0.08194,0.035861
2022-01-11,2022-01-11,-0.091367,-0.060288,-0.063867,-0.035262,0.031453,-0.05537,0.031947,-0.08429,0.036139
2022-01-12,2022-01-12,-0.087782,-0.055463,-0.091994,-0.032357,0.029878,-0.055542,0.031961,-0.082581,0.036965
2022-01-13,2022-01-13,-0.064273,-0.026289,-0.06861,-0.029964,0.029626,-0.054819,0.032877,-0.082262,0.037131


230515 랩미팅
1. 의미있게 나온 날짜가 몇개인지 & 언젠지: 의미있는 기준: 방향성 & 표준편차 참고  
2. sort_num을 30개로 했는데 10개 등등 여러 파라미터 시도 해보기.(10,20,30)  
3. 점끼리 말고 기간 비교,,,
4. 가장 가깝다고 하는 애들이랑 그래프 비교해보기(경로 비교)  
5. 할 수 있으면 무엇 때문에 표준편차가 커지는지 찾아보기  
6. 초기에 한 모델로 가는게 맞나나ㅏㅏㅏ 여쭤보기  
7. 거래량 이동평균 말고 z-score로 바꾸기  
8. daily 수익률 말고 3일 이동평균 수익률이 좋을 수도 있다.  

-코멘트 달기  
-df인지 시리즈인지 등 변수명에 표시해주기   
def test_today # 이런식으로 함수를 여러개 만들어 놓기 -> 스크립트로 하지 x.  
-데이터 불러오는 함수, 엔지니어링 함수(20,40,60 etc.), test_today, 최종 백테스트 등의 함수.   
-과거랑 비교할 때 어느 것을 점수줄지 생각해보기. 같은 방향이면 적게 차이 주거나 etc.  
-L1, L2(유클리디안) 같은 것도 다 고려해보면 좋다.  
-백테스트 과거 전체 해보기.  
-표준편차  
-할 것: 10,20,30,40,50일 상위 sorting 해보기  
-4월 17일만 하지 말고, 이 외에 그냥 10개 선정해서 해보기.  