In [14]:
import os
from functools import reduce

import numpy as np
import pandas as pd

In [82]:
def replace_multiple(literal: str, replacements: dict) -> str:
    """Apply every ``old -> new`` substitution in ``replacements`` to ``literal``.

    Substitutions run one after another in the dict's iteration order, each
    one operating on the result of the previous one. An empty dict returns
    ``literal`` unchanged.
    """
    result = literal
    for old, new in replacements.items():
        result = result.replace(old, new)
    return result


# Multipliers for the magnitude suffix that may end a volume value.
_VOL_MULTIPLIERS = {'K': 1e3, 'M': 1e6, 'B': 1e9}


def _to_float(column: pd.Series) -> pd.Series:
    """Convert a price column to float, removing ',' thousands separators."""
    # read_csv already yields a numeric dtype when no value contains a comma,
    # in which case there is nothing to strip.
    if pd.api.types.is_numeric_dtype(column):
        return column.astype(float)
    return column.astype(str).str.replace(',', '', regex=False).astype(float)


def _parse_volume(column: pd.Series) -> pd.Series:
    """Convert volume values such as '3.28B', '1.5K' or '512' to plain floats.

    A trailing K/M/B scales the number by 1e3/1e6/1e9; values without a suffix
    are taken as-is. Missing entries become NaN; anything else that is not a
    number raises ``ValueError``.
    """
    if pd.api.types.is_numeric_dtype(column):
        return column.astype(float)
    text = column.astype(str).str.strip().str.replace(',', '', regex=False)
    # NOTE(review): '-' is assumed to be the export's "no volume" marker - confirm.
    missing = column.isna() | text.isin(['', '-'])
    multiplier = text.str[-1:].map(_VOL_MULTIPLIERS).astype(float)
    has_suffix = multiplier.notna()
    # Drop the suffix character only where one was recognised.
    number = text.where(~has_suffix, text.str[:-1]).mask(missing)
    return number.astype(float) * multiplier.fillna(1.0)


def preprocess_raw(filepath: str, save_hdf=False):
    """Load a raw price-history CSV and return a clean, date-indexed OHLCV frame.

    The CSV must provide the columns ``date``, ``price``, ``open``, ``high``,
    ``low`` and ``vol.`` (matched case-insensitively). Any other column, such
    as ``change %``, is not part of the result and is ignored.

    Parameters
    ----------
    filepath : str
        Path of the comma-separated input file.
    save_hdf : bool, default False
        When True, also write the result next to the input file, with the
        extension replaced by ``.hdf``, under the key ``'df'``.

    Returns
    -------
    pandas.DataFrame
        Float columns ``open``, ``high``, ``low``, ``close``, ``vol`` indexed
        by ``date`` in ascending order.
    """
    stock_data = pd.read_csv(filepath, sep=',')
    stock_data.columns = [name.lower() for name in stock_data.columns]
    stock_data = stock_data.rename(columns={'price': 'close', 'vol.': 'vol'})

    # pandas >= 2.0 infers a consistent format on its own; the former
    # infer_datetime_format flag is deprecated.
    stock_data['date'] = pd.to_datetime(stock_data['date'])
    for col in ('close', 'open', 'high', 'low'):
        stock_data[col] = _to_float(stock_data[col])
    stock_data['vol'] = _parse_volume(stock_data['vol'])

    stock_data = (
        stock_data
        .set_index('date')
        .sort_index()
        .reindex(columns=['open', 'high', 'low', 'close', 'vol'])
    )

    if save_hdf:
        # splitext only touches the final extension, so a '.csv' occurring
        # elsewhere in the path cannot truncate the output name.
        stock_data.to_hdf(f'{os.path.splitext(filepath)[0]}.hdf', key='df')

    return stock_data

In [86]:
# Parse the raw CSV; save_hdf=True also writes the result to data/BTC_history.hdf.
df = preprocess_raw('data/BTC_history.csv', save_hdf=True)
# Alternative: reload the previously saved HDF copy instead of re-parsing the CSV.
# df = pd.read_hdf('data/BTC_history.hdf')
df.tail()

Unnamed: 0_level_0,open,high,low,close,vol
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-03-13,38813.2,39272.3,37603.4,37792.4,1550000000.0
2022-03-14,37789.5,39914.3,37613.6,39671.1,2330000000.0
2022-03-15,39673.0,39854.7,38220.9,39285.7,2250000000.0
2022-03-16,39282.5,41701.6,38953.2,41118.7,4470000000.0
2022-03-17,41118.7,41406.0,40557.8,40691.9,3280000000.0


In [87]:
# Write the cleaned frame (date index plus open/high/low/close/vol) to a new CSV file.
df.to_csv('data/BTC_history_n.csv', sep=',')