In [93]:
# 산출속도 향상을 위해 dataframe을 dict or list 구조로 변경하여 처리함

import sys
import pandas as pd
import numpy as np
import time, datetime


# Checkpoint taken at the start of the run; the timing report subtracts it
# from the later checkpoints.
t0 = time.time()

# Input specification for the backtest.
#   start_date  : date string (not read by the code in this cell)
#   init_index  : starting level of the index; the benchmark is rescaled to it
#   init_amt    : starting deposit amount
#   portfolio   : symbol codes; each one maps to a 'price_<code>.xlsx' workbook
#   rebalancing : date string -> percentage weights, one per portfolio symbol
#                 in order (divided by 100 when they are applied)
my_input = {
    'start_date': '2018-01-01',
    'init_index': 1000,
    'init_amt': 1000000,
    'portfolio': ['069500', '261220', '332940', '371450', 'HERO', 'SPY'],
    'rebalancing': {
        '2018-01-01': [15, 15, 20, 25, 15, 10],
        '2019-01-01': [15, 15, 20, 25, 15, 10],
        '2020-01-01': [15, 15, 20, 25, 15, 10],
        '2021-01-01': [15, 15, 20, 25, 15, 10],
    },
}

# Load one price workbook per portfolio symbol and collect the resulting
# date-indexed frames in prc_list.
prc_list = []
for item in my_input['portfolio']:
    # Read the symbol's Excel workbook; every cell is read as text first.
    infile = 'price_' + item + '.xlsx'
    df = pd.read_excel(infile, dtype='unicode').set_index('DATE')

    # Turn the DATE index into timestamps so the frame is a time series.
    df.index = pd.to_datetime(df.index)

    # Rename the 'PRICE' column to the symbol code and convert it to floats.
    df = df.rename(columns={'PRICE': item})
    df[item] = df[item].astype(float)
    prc_list.append(df)

# Checkpoint: price loading finished.
t1 = time.time()

# Benchmark setup: read the benchmark workbook (every cell as text first).
infile = "benchmark.xlsx"
tmp_bm_df = pd.read_excel(infile, dtype='unicode').set_index('DATE')

# Turn the DATE index into timestamps so the frame is a time series.
tmp_bm_df.index = pd.to_datetime(tmp_bm_df.index)

# Rename the 'PRICE' column to 'BM' and convert it to floats.
tmp_bm_df = tmp_bm_df.rename(columns={'PRICE': 'BM'})
tmp_bm_df['BM'] = tmp_bm_df['BM'].astype(float)

# The benchmark is added to prc_list so that missing values that can occur
# on individual dates are removed together with those of the price series.
# For example, a portfolio of Korean symbols with the S&P 500 as benchmark
# would otherwise make the data handling during analysis cumbersome.
prc_list.append(tmp_bm_df)


# Merge all frames in prc_list into one DataFrame on the union of their dates.
prc_df = pd.concat(prc_list, join='outer', axis=1)

# Handle missing values by carrying the previous value forward.
# ffill() gives the same result as the deprecated fillna(method='pad').
# A symbol that was listed late has no earlier value, so its leading rows
# stay N/A (open question: replace them with 0 later).
prc_df = prc_df.ffill()

# Split the benchmark and the portfolio time series.

bm_series = prc_df['BM']  # selecting one column yields a Series
# Back to a DataFrame. If the benchmark has no data at the start, those rows
# take the value of the first date that has data (bfill() replaces the
# deprecated fillna(method='bfill')).
bm_df = bm_series.to_frame(name='BM').bfill()

# Rescale the benchmark so that its first value equals the initial index level.
ratio = my_input['init_index'] / bm_df['BM'].iloc[0]
bm_df['BM1000'] = bm_df['BM'] * ratio

# Remove the benchmark column from the price time series.
prc_df = prc_df.drop(columns=['BM'])

# Writing to EXCEL
# Convert with df.index = df.index.date only after the concat; doing it before
# the concat makes the merge start from the symbols that were listed late.
# prc_df.index = prc_df.index.date

# Daily returns. prc_df was forward-filled when it was built, so only leading
# gaps remain and fill_method=None yields the same values as the deprecated
# implicit pad. Remaining NaN (first row, rows before a listing) become 0.
rtn_df = prc_df.pct_change(fill_method=None).fillna(0)


# Initial weights (rebalancing applied): one row per date, NaN everywhere
# except on the rebalancing rows.
init_wght_df = pd.DataFrame(data=None, columns=rtn_df.columns, index=rtn_df.index)
for key, value in my_input['rebalancing'].items():
    # Position of the first index date on or after the rebalancing date.
    # Index.get_loc(key, method='bfill') was removed in pandas 2.0;
    # get_indexer is the supported equivalent and returns -1 when no such
    # date exists.
    pos = init_wght_df.index.get_indexer([pd.Timestamp(key)], method='bfill')[0]
    if pos == -1:
        # Same exception type the old get_loc call raised; without this guard
        # -1 would silently address the last row.
        raise KeyError(f"no price date on or after rebalancing date {key!r}")
    myindex = init_wght_df.index[pos]
    # Weights are given in percent; store them as fractions.
    init_wght_df.loc[myindex] = [x / 100 for x in value]

# Checkpoint: returns and initial weights ready.
t2 = time.time()

# Index backtest

## Starting levels
my_index = my_input['init_index']
my_deposit = my_input['init_amt']

tot_rtn_list, my_index_list, my_deposit_list = [], [], []

## For a fast loop the frames are converted first (df -> dict -> list) and
## the plain nested lists are iterated.
rtn_dict = rtn_df.to_dict('split')
rtns = rtn_dict['data']

wght_dict = init_wght_df.to_dict('split')
wghts = wght_dict['data']

## Index computation
n_days = len(rtns)
for i in range(n_days):
    day_rtns, day_wghts = rtns[i], wghts[i]

    # Weighted return of the day, normalised by the sum of the weights.
    tot_rtn = sum(r * w for r, w in zip(day_rtns, day_wghts)) / sum(day_wghts)
    growth = 1 + tot_rtn
    my_index = my_index * growth
    my_deposit = my_deposit * growth

    tot_rtn_list.append(tot_rtn)
    my_index_list.append(my_index)
    my_deposit_list.append(my_deposit)

    nxt = i + 1
    if nxt < n_days:
        if np.isnan(wghts[nxt]).any():
            # No rebalancing on the next row: carry today's weights forward,
            # each grown by today's return of its symbol.
            wghts[nxt] = [(1 + r) * w for r, w in zip(day_rtns, day_wghts)]
        else:
            # The next row already holds rebalancing weights; show them.
            print(wghts[nxt])

# Collect the per-day results and the final weights as DataFrames.
index_df = pd.DataFrame(
    {'INDEX': my_index_list, 'TOT_RTN': tot_rtn_list, 'DEPOSIT': my_deposit_list},
    index=rtn_df.index,
)
weight_df = pd.DataFrame(data=wghts, columns=init_wght_df.columns, index=init_wght_df.index)

# Checkpoint: backtest finished.
t3 = time.time()

# Placeholder heading: fetch the benchmark and align it with the index dates
# (no code follows under this heading in this cell).


print("백테스트 완료!!")

# Durations of the three stages, taken between consecutive checkpoints.
load_sec, prep_sec, backtest_sec = t1 - t0, t2 - t1, t3 - t2

elapsed_time = load_sec
print(f"종목별가격 로드 : {elapsed_time} sec")

elapsed_time = prep_sec
print(f"수익률계산 및 비중 초기화 : {elapsed_time} sec")

elapsed_time = backtest_sec
print(f"지수 백테스트 : {elapsed_time} sec")

# Whole run, first checkpoint to last.
elapsed_time = t3 - t0
print(f"총 소요시간 : {elapsed_time} sec")


# Write the backtest result table to an Excel workbook.
index_df.to_excel("myindex_2.xlsx")

[0.15, 0.15, 0.2, 0.25, 0.15, 0.1]
[0.15, 0.15, 0.2, 0.25, 0.15, 0.1]
[0.15, 0.15, 0.2, 0.25, 0.15, 0.1]
백테스트 완료!!
종목별가격 로드 : 0.17155146598815918 sec
수익률계산 및 비중 초기화 : 0.0818016529083252 sec
지수 백테스트 : 0.023905038833618164 sec
총 소요시간 : 0.27725815773010254 sec


In [94]:
ratio

3.056888698682481

In [96]:
index_df

Unnamed: 0_level_0,INDEX,TOT_RTN,DEPOSIT
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-01-02,1000.000000,0.000000,1.000000e+06
2018-01-03,1000.371254,0.000371,1.000371e+06
2018-01-04,1003.994530,0.003622,1.003995e+06
2018-01-05,1006.262039,0.002258,1.006262e+06
2018-01-08,1006.467584,0.000204,1.006468e+06
...,...,...,...
2021-04-05,1383.708404,0.004421,1.383708e+06
2021-04-06,1379.659944,-0.002926,1.379660e+06
2021-04-07,1380.325737,0.000483,1.380326e+06
2021-04-08,1381.001213,0.000489,1.381001e+06


In [36]:
# Read the benchmark workbook (every cell as text first) and index it by DATE.
infile = "benchmark.xlsx"
bm_df = pd.read_excel(infile, dtype='unicode').set_index('DATE')

# Turn the DATE index into timestamps so the frame is a time series.
bm_df.index = pd.to_datetime(bm_df.index)

# Rename the 'PRICE' column to 'BM' and convert it to floats.
bm_df = bm_df.rename(columns={'PRICE': 'BM'})
bm_df['BM'] = bm_df['BM'].astype(float)


In [37]:
# Merge the index results and the benchmark into one DataFrame on the union
# of their dates, then carry the previous value forward into the gaps.
# ffill() gives the same result as the deprecated fillna(method='pad').
result_df = pd.concat([index_df, bm_df], join='outer', axis=1).ffill()

In [38]:
# Daily benchmark return, computed from the 'BM' column only. Taking
# pct_change() of the whole frame assigns a DataFrame to a single column,
# which fails as soon as bm_df has more than one column (for example when
# this cell is run a second time).
bm_df['RTN'] = bm_df['BM'].pct_change()

# Rescale the benchmark so that its first value equals the initial index level.
ratio = my_input['init_index'] / bm_df['BM'].iloc[0]
bm_df['BM1000'] = bm_df['BM'] * ratio

In [39]:
bm_df

Unnamed: 0_level_0,BM,RTN,BM1000
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-01-02,326.00,,1000.000000
2018-01-03,327.13,0.003466,1003.466258
2018-01-04,324.58,-0.007795,995.644172
2018-01-05,328.97,0.013525,1009.110429
2018-01-08,331.28,0.007022,1016.196319
...,...,...,...
2021-04-05,424.85,0.002620,1303.220859
2021-04-06,425.88,0.002424,1306.380368
2021-04-07,426.70,0.001925,1308.895706
2021-04-08,426.86,0.000375,1309.386503


In [138]:
# Input format.
#   start_date  : date string
#   init_amt    : starting deposit amount
#   portfolio   : symbol codes
#   rebalancing : date string -> percentage weights, one per portfolio symbol
# This version of the spec has no 'init_index' entry.
my_input = {
    'start_date': '2018-01-01',
    'init_amt': 1000000,
    'portfolio': ['069500', '261220', '332940', '371450', 'HERO', 'SPY'],
    'rebalancing': {
        '2018-01-01': [15, 15, 20, 25, 15, 10],
        '2019-01-01': [15, 15, 20, 25, 15, 10],
        '2020-01-01': [15, 15, 20, 25, 15, 10],
    },
}

In [139]:
# Weight table with one row per return date; NaN everywhere except on the
# rebalancing rows filled in below.
wght_t = pd.DataFrame(data=None, columns=rtn_df.columns, index=rtn_df.index)

for key, value in my_input['rebalancing'].items():
    # Position of the first index date on or after the rebalancing date.
    # Index.get_loc(key, method='bfill') was removed in pandas 2.0;
    # get_indexer is the supported equivalent and returns -1 when no such
    # date exists.
    pos = wght_t.index.get_indexer([pd.Timestamp(key)], method='bfill')[0]
    if pos == -1:
        # Same exception type the old get_loc call raised; without this guard
        # -1 would silently address the last row.
        raise KeyError(f"no price date on or after rebalancing date {key!r}")
    myindex = wght_t.index[pos]
    # Weights are given in percent; store them as fractions.
    wght_t.loc[myindex] = [x / 100 for x in value]

In [140]:
wght_t.loc['2019']

Unnamed: 0_level_0,069500,261220,332940,371450,HERO,SPY
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-01-02,0.15,0.15,0.2,0.25,0.15,0.1
2019-01-03,,,,,,
2019-01-04,,,,,,
2019-01-07,,,,,,
2019-01-08,,,,,,
...,...,...,...,...,...,...
2019-12-24,,,,,,
2019-12-26,,,,,,
2019-12-27,,,,,,
2019-12-30,,,,,,


In [135]:
a

[1, 2, 3, 4, 1, 2, 3, 4]

In [81]:
wght_t.index

DatetimeIndex(['2018-01-02', '2018-01-03', '2018-01-04', '2018-01-05',
               '2018-01-08', '2018-01-09', '2018-01-10', '2018-01-11',
               '2018-01-12', '2018-01-15',
               ...
               '2021-03-29', '2021-03-30', '2021-03-31', '2021-04-01',
               '2021-04-02', '2021-04-05', '2021-04-06', '2021-04-07',
               '2021-04-08', '2021-04-09'],
              dtype='datetime64[ns]', name='DATE', length=848, freq=None)

In [83]:
# Convert `a` with pd.to_datetime and keep the result in `dt`.
# NOTE(review): `a` is not assigned in any cell visible here — confirm where
# it is defined before relying on a fresh-kernel run.
dt = pd.to_datetime(a)

In [84]:
dt

Timestamp('2018-01-06 00:00:00')

In [85]:
a

'2018-01-06'

In [102]:
# Example: find the nearest following date in the index

dt = '2018-01-06'

# `method` options - nearest: closest entry, backfill/bfill: next entry,
# pad/ffill: previous entry.
# Index.get_loc no longer accepts `method` (removed in pandas 2.0), so the
# position is looked up once with get_indexer, which returns -1 when there is
# no matching entry.
mm = int(wght_t.index.get_indexer([pd.Timestamp(dt)], method='bfill')[0])
if mm == -1:
    # Same exception type the old get_loc call raised; without this guard
    # -1 would silently address the last entry.
    raise KeyError(f"no index date on or after {dt!r}")
find_index = wght_t.index[mm]

In [103]:
find_index

Timestamp('2018-01-08 00:00:00')

In [68]:
# Write the weight table to the Excel workbook "tttt.xlsx".
wght_t.to_excel("tttt.xlsx")