In [1]:
!pip install pandas_market_calendars



In [2]:
import os
import time
import warnings
from datetime import datetime

import numpy as np
import pandas as pd
import pandas_market_calendars as mcal
from scipy.stats import pearsonr
from sklearn.preprocessing import StandardScaler
from tslearn.metrics import dtw

# Ignore all warnings
warnings.filterwarnings("ignore")

# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)

# pd.set_option('display.max_rows', 10)

In [3]:
class make_dataset:
    """Prepare two raw price tables for pairwise comparison.

    Each input frame must have a ``date`` column and a ``PX_LAST`` column.
    The inputs are kept untouched in ``self.df`` / ``self.df2``; the prepared
    versions (datetime index named ``date``, only the ``PX_LAST`` column) are
    exposed as ``self.df_dt`` / ``self.df2_dt``.
    """

    def __init__(self, df, df2):
        # Raw inputs, stored as given (no longer modified by this class).
        self.df = df
        self.df2 = df2
        # Date-indexed, PX_LAST-only working copies.
        self.df_dt = self.make_px(self.to_datetime_index(df))
        self.df2_dt = self.make_px(self.to_datetime_index(df2))

    def to_datetime_index(self, df):
        """Return a new frame indexed by the parsed ``date`` column.

        ``assign`` builds a new frame, so the caller's ``date`` column keeps
        its original values instead of being overwritten with timestamps.
        """
        df_dt = df.assign(date=pd.to_datetime(df['date'])).set_index('date')
        return df_dt

    def make_px(self, df_dt):
        """Return an independent single-column (``PX_LAST``) frame.

        The explicit copy lets later code assign into the result without
        touching, or warning about, the frame it was selected from.
        """
        df_px = df_dt[['PX_LAST']].copy()
        return df_px

In [4]:
# dataset에서 맨 앞에 NaN값 있을 경우 제거
# => def fill_date에서 씀
def get_first_valid_index(df, name='(티커 입력 바람!)'):
    """Return the index label of the first non-NaN ``PX_LAST`` value.

    Used to drop leading NaN rows from a dataset. Prints a status line
    either way.

    Args:
        df: Frame with a ``PX_LAST`` column; the index labels must support
            ``strftime`` (the message formats the label as a date).
        name: Ticker label used in the printed message.

    Returns:
        The first valid index label, or ``None`` when ``PX_LAST`` has no
        valid value at all.
    """
    first_valid_index = df['PX_LAST'].first_valid_index()

    if first_valid_index is not None:
        start_label = first_valid_index.strftime('%Y-%m-%d')
        print(f"{name}의 유효한 데이터 시작일은 {start_label}입니다.")
        return first_valid_index

    # No usable observation: report it and signal with None.
    print(f"{name}의 유효한 데이터가 없습니다.")
    return None


def non_shared_holidays(market1, market2, start_date, end_date):
    """Return the sorted days on which ``market1`` trades but ``market2`` does not.

    Args:
        market1: Calendar object exposing ``schedule(start_date=, end_date=)``
            whose result has an ``index`` of trading days.
        market2: Calendar object with the same interface.
        start_date: First day of the window, passed to ``schedule`` unchanged.
        end_date: Last day of the window, passed to ``schedule`` unchanged.

    Returns:
        Sorted list of the days present in ``market1``'s schedule index and
        absent from ``market2``'s.
    """
    # The dates go to ``schedule`` as given. The earlier unused ``strftime``
    # conversions only made the function fail for non-datetime inputs.
    market1_days = market1.schedule(start_date=start_date, end_date=end_date).index
    market2_days = market2.schedule(start_date=start_date, end_date=end_date).index
    non_shared_holidays_list = sorted(set(market1_days) - set(market2_days))
    return non_shared_holidays_list


def compare_index(ks_df, rs_df, ks_start_date,ks_end_date,rs_start_date,rs_end_date):
    """Check that both frames hold the same dates over their overlapping window.

    The window runs from the later of the two start dates to the earlier of
    the two end dates. A status line is printed either way.

    Returns:
        ``None`` when the two date sets match, otherwise the tuple
        ``(ks_dates, rs_dates)`` of index-label sets inside the window.
    """
    # Overlap window: latest start, earliest end.
    start_date = ks_start_date if ks_start_date > rs_start_date else rs_start_date
    end_date = rs_end_date if ks_end_date > rs_end_date else ks_end_date

    ks_dates = set(ks_df.loc[start_date:end_date].index)
    rs_dates = set(rs_df.loc[start_date:end_date].index)

    if ks_dates != rs_dates:
        print('데이터셋의 전처리 후 날짜가 잘 맞지 않습니다!!!!!!!')
        return ks_dates, rs_dates

    print('데이터셋의 전처리 후 날짜가 잘 맞습니다')
    return None
    
    
# 결측치 채우기 (원래의 중간 행 결측치, 비 공유 휴장일)
def fill_na(df):
    """Forward-fill missing ``PX_LAST`` values and return the same frame.

    Fills gaps in the middle of the series (original missing rows and rows
    added for non-shared holidays) with the last known price. Leading NaNs
    have no earlier value and stay NaN.

    Args:
        df: Frame with a ``PX_LAST`` column; its column is replaced in place.

    Returns:
        The frame that was passed in.
    """
    # ``Series.ffill()`` replaces ``fillna(method='ffill')``, which is
    # deprecated in pandas 2.x and rejected by newer releases.
    df['PX_LAST'] = df['PX_LAST'].ffill()
    return df


def npdt(str_date):
    """Build a nanosecond-precision ``numpy.datetime64`` at midnight of ``str_date``.

    ``str_date`` is a date string to which a midnight time component is
    appended before parsing.
    """
    midnight_suffix = 'T00:00:00.000000000'
    return np.datetime64(str_date + midnight_suffix)

In [5]:
class tlag_0_corr_dtw_rmse_len:
    """Align two ``PX_LAST`` price frames and compute similarity metrics.

    On construction the two frames are trimmed to their first valid price,
    padded with the days only the other market trades, forward-filled,
    shifted against each other by ``timezone`` trading-day positions and
    rescaled so each series starts at 1. ``get_corr_dtw_rmse_len`` then
    reports Pearson correlation, DTW distance and RMSE between them.

    Args:
        kospi_df: Date-indexed frame with a ``PX_LAST`` column (KOSPI side).
        russell_df: Date-indexed frame with a ``PX_LAST`` column (Russell side).
        ks_ticker: Label of the KOSPI series, used in printed messages.
        rs_ticker: Label of the Russell series, used in printed messages.
        timezone: Offset, in positions of the combined trading-day list,
            added to a KOSPI position to get the paired Russell position.
            ``-1`` additionally enables the KRX/NYSE holiday padding.

    Raises:
        ValueError: If either frame has no valid ``PX_LAST`` value.
    """

    def __init__(self, kospi_df, russell_df, ks_ticker, rs_ticker, timezone=-1):

        # Working frames; fill_date() replaces them with aligned copies.
        self.ks_df = kospi_df
        self.rs_df = russell_df

        # Ticker labels.
        self.ks_ticker = ks_ticker
        self.rs_ticker = rs_ticker

        # Position offset between the two series.
        self.timezone = timezone

        # Align, fill and normalise the working frames.
        self.fill_date()

    # Uses the words "start" / "end" for the window bounds.
    def fill_date(self):
        """Align both frames on a common trading-day grid and normalise them.

        Rebinds ``self.ks_df`` / ``self.rs_df`` and sets
        ``self.all_trading_day`` (day -> position) and
        ``self.all_trading_idx`` (position -> day).
        """
        # First dates with a valid price.
        ks_start_date = get_first_valid_index(self.ks_df, self.ks_ticker)
        rs_start_date = get_first_valid_index(self.rs_df, self.rs_ticker)
        if ks_start_date is None or rs_start_date is None:
            # Nothing to align; fail with a clear reason instead of a
            # confusing error further down.
            raise ValueError(
                f'No valid PX_LAST data for {self.ks_ticker} and/or {self.rs_ticker}.'
            )

        # Drop leading NaN rows. Copy so later column assignments never
        # write into (or warn about) the caller's frames.
        self.ks_df = self.ks_df.loc[ks_start_date:].copy()
        self.rs_df = self.rs_df.loc[rs_start_date:].copy()

        # Last available dates.
        ks_end_date = self.ks_df.index[-1]
        rs_end_date = self.rs_df.index[-1]

        # Korea-vs-US comparison.
        if self.timezone == -1:

            # Exchange calendars used to find non-shared holidays.
            krx = mcal.get_calendar('XKRX')
            nyse = mcal.get_calendar('NYSE')

            # fill_ks_index: days NYSE trades but KRX does not (and vice versa).
            fill_ks_index = non_shared_holidays(nyse, krx, ks_start_date, ks_end_date)
            fill_rs_index = non_shared_holidays(krx, nyse, rs_start_date, rs_end_date)

            # Add those days as empty rows; they are forward-filled below.
            self.ks_df = self.ks_df.reindex(self.ks_df.index.union(fill_ks_index))
            self.rs_df = self.rs_df.reindex(self.rs_df.index.union(fill_rs_index))

        # Lookup tables between trading days and their positions.
        all_days = sorted(set(self.ks_df.index) | set(self.rs_df.index))
        self.all_trading_day = {day: index for index, day in enumerate(all_days)}
        self.all_trading_idx = dict(enumerate(all_days))

        # Verify that both frames now share the same dates; patch any leftovers.
        compare = compare_index(self.ks_df,self.rs_df,ks_start_date,ks_end_date,rs_start_date,rs_end_date)
        if compare is not None:
            fill_rs_index_2 = compare[0] - compare[1]
            if fill_rs_index_2:
                print(f'{self.rs_ticker}에 {fill_rs_index_2}가 비어있습니다.')
                self.rs_df = self.rs_df.reindex(self.rs_df.index.union(sorted(fill_rs_index_2)))
            fill_ks_index_2 = compare[1] - compare[0]
            if fill_ks_index_2:
                print(f'{self.ks_ticker}에 {fill_ks_index_2}가 비어있습니다.')
                self.ks_df = self.ks_df.reindex(self.ks_df.index.union(sorted(fill_ks_index_2)))
            print('맞지 않는 날짜를 채웠습니다!!!')

        self.ks_df = fill_na(self.ks_df)
        self.rs_df = fill_na(self.rs_df)

        # Trim both series so that position p on the KOSPI side pairs with
        # position p + timezone on the Russell side.
        ks_start_idx = self.all_trading_day[ks_start_date]
        ks_end_idx = self.all_trading_day[ks_end_date]
        rs_start_idx = self.all_trading_day[rs_start_date]
        rs_end_idx = self.all_trading_day[rs_end_date]

        if ks_start_idx > rs_start_idx - self.timezone:
            rs_start_idx = ks_start_idx + self.timezone
        else:
            ks_start_idx = rs_start_idx - self.timezone

        if ks_end_idx + self.timezone > rs_end_idx:
            # Positions, not dates: the Russell end *index* bounds the KOSPI end.
            ks_end_idx = rs_end_idx - self.timezone
        else:
            rs_end_idx = ks_end_idx + self.timezone

        ks_start_date = self.all_trading_idx[ks_start_idx]
        ks_end_date = self.all_trading_idx[ks_end_idx]
        rs_start_date = self.all_trading_idx[rs_start_idx]
        rs_end_date = self.all_trading_idx[rs_end_idx]

        self.ks_df = self.ks_df.loc[ks_start_date:ks_end_date].copy()
        self.rs_df = self.rs_df.loc[rs_start_date:rs_end_date].copy()

        # Rescale so each series starts at 1. ``.iloc[0]`` selects by
        # position; a bare ``[0]`` on a date-indexed Series is a label lookup.
        self.ks_df['PX_LAST'] = self.ks_df['PX_LAST'].to_numpy() / self.ks_df['PX_LAST'].iloc[0]
        self.rs_df['PX_LAST'] = self.rs_df['PX_LAST'].to_numpy() / self.rs_df['PX_LAST'].iloc[0]

        print(f"각각 데이터셋의 길이는 {len(self.ks_df)}, {len(self.rs_df)} 입니다.")
        print(f"{self.ks_ticker} 데이터셋의 날짜는 {self.ks_df.index[0].strftime('%Y-%m-%d')} ~ {self.ks_df.index[-1].strftime('%Y-%m-%d')}로 설정됐습니다.")
        print(f"{self.rs_ticker} 데이터셋의 날짜는 {self.rs_df.index[0].strftime('%Y-%m-%d')} ~ {self.rs_df.index[-1].strftime('%Y-%m-%d')}로 설정됐습니다.")
        print()

    # Uses the words "first" / "last".
    def get_corr_dtw_rmse_len(self):
        """Compute similarity metrics between the two ``PX_LAST`` series.

        Each metric is best-effort: if its computation raises, that metric
        is reported as NaN and the others are still returned.

        Returns:
            Tuple ``(corr, distance, rmse, ks_len, rs_len)`` where ``corr``
            is the Pearson correlation (set to 0 when its p-value exceeds
            0.05), ``distance`` is the DTW distance, ``rmse`` is the root
            mean squared difference, and the last two are the row counts.
        """
        try:
            corr, p_value = pearsonr(self.ks_df['PX_LAST'], self.rs_df['PX_LAST'])
            # Treat statistically insignificant correlations as zero.
            if p_value > 0.05:
                corr = 0
        except Exception:
            corr = np.nan

        try:
            distance = dtw(np.array(self.ks_df['PX_LAST']), np.array(self.rs_df['PX_LAST']))
        except Exception:
            distance = np.nan

        try:
            # Square the difference, not just the second series.
            diff = np.array(self.ks_df['PX_LAST']) - np.array(self.rs_df['PX_LAST'])
            rmse = np.sqrt(np.mean(diff ** 2))
        except Exception:
            rmse = np.nan

        return corr, distance, rmse, len(self.ks_df), len(self.rs_df)

In [6]:
path = '/Users/admin/Desktop/Life'

# Sector membership tables for both indices, read with cp949 encoding.
kospi_members_df = pd.read_csv(f'{path}/kospi200_sector.csv', encoding='cp949')
russell_members_df = pd.read_csv(f'{path}/russell3000_sector.csv', encoding='cp949')

# Full sector list.
sector_list = [
    'Health Care',
    'Financials',
    'Industrials',
    'Information Technology',
    'Consumer Discretionary',
    'Real Estate',
    'Energy',
    'Materials',
    'Communication Services',
    'Consumer Staples',
    'Utilities',
]
# KOSPI has no Real Estate constituents, so that sector is left out.
sector_kospi = [sector_name for sector_name in sector_list if sector_name != 'Real Estate']

In [2]:
# Requires the configuration cell (path, *_members_df, sector_kospi) and the
# class/function cells to have been run first.

# Make sure the output directory exists before the long computation starts,
# so the final to_csv calls cannot fail on a missing folder.
os.makedirs('detection_result', exist_ok=True)

for sector in sector_kospi[0:1]:

    # Tickers of this sector on each side (computed once per sector).
    kospi_tickers = kospi_members_df[kospi_members_df['Sector']==sector]['Ticker_Bloomberg']
    russell_tickers = russell_members_df[russell_members_df['Sector']==sector]['Ticker_Bloomberg']

    all_pair_num = len(kospi_tickers) * len(russell_tickers)

    # 1-based progress counter over all pairs of the sector.
    i = 1

    result_list_for_df = []
    fail_list_for_df = []

    for kospi_ticker in kospi_tickers:
        for russell_ticker in russell_tickers:

            start_time = time.time()

            df = pd.read_csv(f'{path}/datasets-total-ver23060812/datasets-KOSPI200/dataset-KOSPI200-{kospi_ticker}.csv')
            df2 = pd.read_csv(f'{path}/datasets-total-ver23060812/datasets-Russell3000/dataset-Russell3000-{russell_ticker}.csv')

            # error_num records the stage that was running when a failure
            # occurred: 1 = dataset preparation, 2 = alignment, 3 = metrics.
            try:
                error_num = 1
                data = make_dataset(df, df2)

                error_num = 2
                pair = tlag_0_corr_dtw_rmse_len(data.df_dt, data.df2_dt, kospi_ticker, russell_ticker, timezone = -1)

                error_num = 3
                result = pair.get_corr_dtw_rmse_len()

                end_time = time.time()
                excution_time = end_time - start_time

                result_list = [kospi_ticker, russell_ticker, result[0], result[1], result[2], result[3], result[4]]
                result_list_for_df.append(result_list)
                print(f'({i}/{all_pair_num}) {kospi_ticker}|{russell_ticker}는 corr: {result[0]}, dtw: {result[1]}, rmse: {result[2]} 로 결과 기록됨')

                error_num = 0

            # `Exception` rather than a bare except: a failing pair is still
            # logged and skipped, but Ctrl-C / kernel interrupt stops the run.
            except Exception:
                end_time = time.time()
                excution_time = end_time - start_time

                fail_list = [kospi_ticker, russell_ticker, error_num]
                fail_list_for_df.append(fail_list)
                print(f'({i}/{all_pair_num}) {kospi_ticker}|{russell_ticker}는 에러{error_num}로 인해 실패로 기록됨')


            print(excution_time)
            print()
            print('-'*50)
            print()
            i += 1

    # Per-sector outputs: successful metrics and the failure log.
    df = pd.DataFrame(result_list_for_df, columns=['kospi_ticker', 'russell_ticker', 'corr', 'distance', 'rmse', 'ks_len', 'rs_len'])
    df2 = pd.DataFrame(fail_list_for_df, columns=['kospi_ticker', 'russell_ticker', 'error_num'])

    df.to_csv(f'detection_result/{sector}_get_corr_dtw.csv', index=False)
    df2.to_csv(f'detection_result/{sector}_get_corr_dtw_fail.csv', index=False)

NameError: name 'sector_kospi' is not defined