In [15]:
import pandas as pd

In [16]:
def read_bid_ask_data(ask_fname : str, bid_fname : str, lowercase_columns = False, set_time_index = False):
    """Reads and combines the bid and ask csv files of duksascopy historical market data, into a single OHLCV dataframe."""
    df_ask = pd.read_csv(ask_fname, infer_datetime_format=True)
    df_bid = pd.read_csv(bid_fname, infer_datetime_format=True)

    df_avg = (df_bid[["Open", "High", "Low", "Close", "Volume"]]+ df_ask[["Open", "High", "Low", "Close", "Volume"]]) / 2.0
    df_avg = df_avg[df_avg["Volume"] > 0].reset_index()    
    df_avg["time"] = df_ask["Local time"]
    df_avg["time"] = df_avg["time"].str.replace(r".\d{3} GMT[+-]\d\d\d\d", '', regex = True) ## Strip ms and GMT TZ in time column
    if "index" in list(df_avg):
        print("index column found in dataframe, so dropping them")
        df_avg.drop(labels = "index", axis = 1, inplace = True)

    if lowercase_columns:
        df_avg.columns= df_avg.columns.str.lower()
        
    if set_time_index:
        df_avg["time"] = pd.to_datetime(df_avg["time"],format='%d.%m.%Y %H:%M:%S')
        df_avg = df_avg.set_index("time")      
    return df_avg

In [17]:
### DataFrame Slicing based on nr. of rows on 1m dataframe
def slice_df_by_1m_rows(df : pd.DataFrame, nr_days_to_slice : int):
    """Slice the historical dataframe from most recent to the nr. of days specified"""
    mins_per_day = 24 * 60
    nr_days_to_slice = 365 * mins_per_day
    df = df.iloc[-nr_days_to_slice:].reset_index(drop = True)
    return df

In [18]:
## Specify FileNames of Bid / Ask data downloaded from DukaScopy
bid_ask_files = {
    "GBPUSD" : {"Bid": "GBPUSD_Candlestick_1_M_BID_01.12.2019-01.12.2022.csv",
                "Ask": "GBPUSD_Candlestick_1_M_ASK_01.12.2019-01.12.2022.csv"},
    "EURUSD" : {"Bid": "EURUSD_Candlestick_1_M_BID_01.12.2019-01.12.2022.csv",
                "Ask": "EURUSD_Candlestick_1_M_ASK_01.12.2019-01.12.2022.csv"},
    "AUDUSD" : {"Bid": "AUDUSD_Candlestick_1_M_BID_01.12.2019-01.12.2022.csv",
                "Ask": "AUDUSD_Candlestick_1_M_ASK_01.12.2019-01.12.2022.csv"},
    "USDCAD" : {"Bid": "USDCAD_Candlestick_1_M_BID_01.12.2019-01.12.2022.csv",
                "Ask": "USDCAD_Candlestick_1_M_ASK_01.12.2019-01.12.2022.csv"},
    "USDJPY" : {"Bid": "USDJPY_Candlestick_1_M_BID_01.12.2019-01.12.2022.csv",
                "Ask": "USDJPY_Candlestick_1_M_ASK_01.12.2019-01.12.2022.csv"}                                                                            
}

In [20]:
folder_path = "/Users/dilip.rajkumar/Documents/VectorBT_Pro_Study/Dukascopy_Historical_Data/"
symbol = "EURUSD"
ask_csv_file = folder_path + bid_ask_files[symbol]["Ask"]
bid_csv_file = folder_path + bid_ask_files[symbol]["Bid"]
df = read_bid_ask_data(ask_csv_file, bid_csv_file, set_time_index = True)
df

index column found in dataframe, so dropping them


Unnamed: 0_level_0,Open,High,Low,Close,Volume
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-12-02 03:29:00,1.102380,1.102515,1.102350,1.102510,9.935
2019-12-02 03:30:00,1.102510,1.102515,1.102465,1.102475,90.055
2019-12-02 03:31:00,1.102490,1.102720,1.102460,1.102695,98.770
2019-12-02 03:32:00,1.102695,1.102695,1.102675,1.102680,20.965
2019-12-02 03:33:00,1.102685,1.102685,1.102455,1.102455,46.140
...,...,...,...,...,...
2022-11-22 07:12:00,1.050770,1.050965,1.050770,1.050955,582.305
2022-11-22 07:13:00,1.050950,1.051095,1.050945,1.051030,372.185
2022-11-22 07:14:00,1.051025,1.051220,1.050990,1.051025,446.085
2022-11-22 07:15:00,1.051030,1.051100,1.050885,1.050910,647.470


In [None]:
%%time
## Write everything into one single HDF5 file indexed by keys for the various symbols
folder_path = "/Users/john.doe/Documents/Dukascopy_Historical_Data/"
for symbol in bid_ask_files.keys():
    print('',symbol)
    ask_csv_file = folder_path + bid_ask_files[symbol]["Ask"]
    bid_csv_file = folder_path + bid_ask_files[symbol]["Bid"]
    print(ask_csv_file,'\n',bid_csv_file)
    df = read_bid_ask_data(ask_csv_file, bid_csv_file, set_time_index = True)
    df.to_hdf(f"/Users/john.doe/Documents/vbtpro_tutorials/data/MultiAsset_OHLCV_3Y_m1.h5", key=symbol)