In [1]:
import os
from glob import glob
import pandas as pd
from tqdm import tqdm
from itertools import combinations
from sklearn.preprocessing import StandardScaler
import joblib
from common_utils import make_dirs

### Define functions

In [2]:
# Notebook-wide settings read by the functions and cells below.
CONFIG = dict(
    # Directory whose files are globbed as raw per-pair CSV inputs.
    rawdata_dir='../../../storage/dataset/rawdata/csv/',
    # Root directory for the stored train/test splits and the scaler pickle.
    data_store_dir='../../../storage/dataset/dataset_10m/',
    # Scaled features are clipped to [-threshold, +threshold].
    winsorize_threshold=0.6,
    # Forward returns are built for shifts 1..lookahead_window.
    lookahead_window=10,
    # Fraction of feature rows written to the train split.
    train_ratio=0.7,
)

# Price columns kept from every raw CSV (must stay a list: it is used as a column indexer).
COLUMNS = ['open', 'high', 'low', 'close']


def load_rawdata(file_name):
    """Read one raw CSV and return its COLUMNS with a datetime index.

    Args:
        file_name: Path of the CSV file; the first row is the header and the
            first column becomes the index.

    Returns:
        DataFrame restricted to COLUMNS, indexed by the parsed datetimes.
    """
    frame = pd.read_csv(file_name, index_col=0, header=0)
    selected = frame[COLUMNS]
    selected.index = pd.to_datetime(selected.index)

    return selected

def _build_feature_by_rawdata(rawdata):
    """Derive return and intra-row change features from one raw price frame.

    Two feature families are produced:
      * ``<col>_return``: row-over-row percentage change of each COLUMNS entry.
      * ``<a>_<b>_change``: ``b / a - 1`` within the same row, for every
        pair ``(a, b)`` drawn from COLUMNS.

    Args:
        rawdata: DataFrame containing the COLUMNS price columns.

    Returns:
        DataFrame of all features, index-sorted, without the first row
        (which has no previous row to compute a return from).
    """
    # Row-over-row returns are only meaningful on index-sorted rows, so sort
    # before differencing (the forward-return builder does the same).
    rawdata = rawdata.sort_index()

    returns = rawdata.pct_change(1, fill_method=None).iloc[1:].rename(
        columns={key: key + '_return' for key in COLUMNS}
    )

    # Same values as a column-wise pct_change over the pair: second / first - 1.
    inner_changes = [
        (rawdata[second] / rawdata[first] - 1).rename(f'{first}_{second}_change')
        for first, second in sorted(combinations(COLUMNS, 2))
    ]

    # Align to the returns' index, which lacks the first row.
    inner_changes = pd.concat(inner_changes, axis=1).reindex(returns.index)

    return pd.concat([returns, inner_changes], axis=1).sort_index()

def build_features(file_names):
    """Build the feature table for every raw file and join them column-wise.

    Args:
        file_names: Iterable of raw CSV paths. The text before the first '.'
            of each base name is used as the top-level column key.

    Returns:
        DataFrame with (coin_pair, feature_name) MultiIndex columns, keeping
        only rows that have no missing value in any column.
    """
    features = []
    for file_name in tqdm(file_names):
        # basename handles the platform's path separator instead of assuming '/'.
        coin_pair = os.path.basename(file_name).split('.')[0]

        rawdata = load_rawdata(file_name=file_name)
        feature = _build_feature_by_rawdata(rawdata=rawdata)

        # Label each column in place first, THEN sort columns together with
        # their data. Sorting only the labels would attach names to the wrong
        # series.
        feature.columns = pd.MultiIndex.from_tuples(
            [(coin_pair, column) for column in feature.columns]
        )
        feature = feature.sort_index(axis=1)

        features.append(feature)

    return pd.concat(features, axis=1).dropna()

def _build_fwd_returns_by_rawdata(rawdata, lookahead_window):
    fwd_returns = []
    for column_pair in [('close', 'high'), ('close', 'low')]:
        partial_fwd_returns = []
        for window in range(1, lookahead_window + 1):
            colum_pair_df = rawdata[list(column_pair)].copy().sort_index()
            colum_pair_df.columns = [0, 1]

            colum_pair_df[1] = colum_pair_df[1].shift(-window)
            partial_fwd_return = colum_pair_df.pct_change(1, axis=1, fill_method=None)[1].rename(f'fwd_return({window})')        
            partial_fwd_returns.append(partial_fwd_return)

        partial_fwd_returns = pd.concat(partial_fwd_returns, axis=1).sort_index()
        partial_fwd_returns.columns = ['_'.join(column_pair) + '_' + column for column in partial_fwd_returns.columns]
        fwd_returns.append(partial_fwd_returns)

    return pd.concat(fwd_returns, axis=1).sort_index()

def build_fwd_returns(file_names, lookahead_window):
    """Build the forward-return table for every raw file and join them.

    Args:
        file_names: Iterable of raw CSV paths. The text before the first '.'
            of each base name is used as the top-level column key.
        lookahead_window: Largest forward shift passed to the per-file builder.

    Returns:
        DataFrame with (coin_pair, fwd_return_name) MultiIndex columns,
        keeping only rows that have no missing value in any column.
    """
    total_fwd_returns = []
    for file_name in tqdm(file_names):
        # basename handles the platform's path separator instead of assuming '/'.
        coin_pair = os.path.basename(file_name).split('.')[0]

        rawdata = load_rawdata(file_name=file_name)
        fwd_returns = _build_fwd_returns_by_rawdata(rawdata=rawdata, lookahead_window=lookahead_window)

        # Label each column in place first, THEN sort columns together with
        # their data. Sorting only the labels is wrong: lexicographically
        # 'fwd_return(10)' precedes 'fwd_return(2)', so the window-2 series
        # would be named as window 10.
        fwd_returns.columns = pd.MultiIndex.from_tuples(
            [(coin_pair, column) for column in fwd_returns.columns]
        )
        fwd_returns = fwd_returns.sort_index(axis=1)

        total_fwd_returns.append(fwd_returns)

    return pd.concat(total_fwd_returns, axis=1).dropna()

def build_scaler(features):
    """Fit a fresh StandardScaler on ``features`` and return it.

    Args:
        features: Data passed unchanged to ``StandardScaler.fit``.

    Returns:
        The fitted StandardScaler instance.
    """
    # fit() returns the estimator itself, so the call can be chained.
    return StandardScaler().fit(features)

def preprocess_features(features, scaler, winsorize_threshold):
    """Scale ``features`` with ``scaler`` and clip the result symmetrically.

    Args:
        features: DataFrame to transform; its index and columns are kept.
        scaler: Object exposing ``transform(features)``.
        winsorize_threshold: Clip bound; output lies in [-threshold, +threshold].

    Returns:
        DataFrame of scaled, clipped values with the input's index and columns.
    """
    scaled_values = scaler.transform(features)
    scaled_frame = pd.DataFrame(
        scaled_values,
        index=features.index,
        columns=features.columns,
    )

    # winsorize
    lower_bound, upper_bound = -winsorize_threshold, winsorize_threshold
    return scaled_frame.clip(lower_bound, upper_bound)

def main(
    rawdata_dir=CONFIG['rawdata_dir'],
    data_store_dir=CONFIG['data_store_dir'],
    winsorize_threshold=CONFIG["winsorize_threshold"],
    lookahead_window=CONFIG['lookahead_window'],
    train_ratio=CONFIG['train_ratio'],
):
    """Build, scale and clip the features, then store the splits and scaler.

    Args:
        rawdata_dir: Directory whose entries are all treated as raw CSV files.
        data_store_dir: Root output directory. 'train/X.csv', 'test/X.csv'
            and 'scaler.pkl' are written beneath it.
        winsorize_threshold: Symmetric clip bound applied after scaling.
        lookahead_window: Accepted for interface compatibility; not used by
            this function.
        train_ratio: Fraction of feature rows (taken from the top) written to
            the train split; the remainder goes to the test split.
    """
    # Make dirs
    # (presumably make_dirs creates each listed directory -- defined in common_utils)
    train_data_store_dir = os.path.join(data_store_dir, 'train')
    test_data_store_dir = os.path.join(data_store_dir, 'test')
    make_dirs([data_store_dir, train_data_store_dir, test_data_store_dir])

    # Build features
    file_names = glob(os.path.join(rawdata_dir, '*'))
    features = build_features(file_names)

    # NOTE(review): the scaler is fit on ALL rows, including those later
    # written to the test split, so test statistics leak into the scaling.
    # Confirm whether it should be fit on the train rows only.
    scaler = build_scaler(features)

    features = preprocess_features(features=features, scaler=scaler, winsorize_threshold=winsorize_threshold)

    # Store Artifacts
    boundary_index = int(len(features.index) * train_ratio)
    features.iloc[:boundary_index].to_csv(
        os.path.join(train_data_store_dir, 'X.csv'))
    features.iloc[boundary_index:].to_csv(
        os.path.join(test_data_store_dir, 'X.csv'))

    # Write next to the splits, honouring the data_store_dir argument.
    joblib.dump(scaler, os.path.join(data_store_dir, 'scaler.pkl'))

In [3]:
# Forward-return horizon used by the cells below, taken from the shared config.
lookahead_window = CONFIG['lookahead_window']

In [4]:
# Every entry under the raw-data directory is treated as one coin pair's CSV.
file_names = glob(os.path.join(CONFIG['rawdata_dir'], '*'))
fwd_returns = build_fwd_returns(
    file_names=file_names,
    lookahead_window=lookahead_window,
)

100%|██████████| 30/30 [01:41<00:00,  3.38s/it]
