## Dataset Preparation

In [1]:
# Import Dependencies and Define Functions
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def load_dataframes(comb_dict):
    """Read every CSV referenced in ``comb_dict`` into a nested dict of DataFrames.

    Parameters
    ----------
    comb_dict : dict
        Nested mapping ``source -> asset -> timeframe -> info`` in which each
        ``info`` stores the CSV location under the ``'file_path'`` key.

    Returns
    -------
    dict
        Mapping with the same ``source -> asset -> timeframe`` nesting whose
        leaves are DataFrames indexed by the parsed ``open_time`` column.

    Raises
    ------
    KeyError
        If an ``info`` entry lacks ``'file_path'`` or a CSV has no
        ``open_time`` column.
    """
    loaded = {}
    for source, assets in comb_dict.items():
        by_asset = loaded[source] = {}
        for asset, timeframes in assets.items():
            by_timeframe = by_asset[asset] = {}
            for timeframe, info in timeframes.items():
                raw = pd.read_csv(info['file_path'], parse_dates=True)
                # Convert open_time to datetimes and promote it to the index
                # (the column is removed from the frame by set_index).
                frame = raw.assign(
                    open_time=pd.to_datetime(raw['open_time'])
                ).set_index('open_time')
                by_timeframe[timeframe] = frame
                # One progress line per file that was read.
                print(f"Loaded {source} {asset} {timeframe} data: {frame.shape[0]} rows")

    return loaded

def merge_dataframes(dataframes):
    """Join every leaf DataFrame of ``dataframes`` side by side on the index.

    Parameters
    ----------
    dataframes : dict
        Nested mapping ``source -> asset -> timeframe -> DataFrame``.

    Returns
    -------
    pandas.DataFrame
        All frames concatenated along the column axis in iteration order.
        Rows are aligned on the union of the input indexes (``pd.concat``'s
        default outer join), so positions missing from a frame hold NaN.
        An empty DataFrame is returned when there are no frames at all.

    Notes
    -----
    Column labels are taken from the inputs unchanged.
    NOTE(review): if several frames share column names the result carries
    duplicate labels -- confirm the CSVs use distinct column names per
    asset/timeframe.
    """
    # Gather the leaves first and concatenate once; calling pd.concat inside
    # the loop would re-copy the growing result on every iteration.
    frames = [
        df
        for assets in dataframes.values()
        for timeframes in assets.values()
        for df in timeframes.values()
    ]

    # pd.concat raises on an empty list, so keep the "nothing to merge" result explicit.
    if not frames:
        return pd.DataFrame()

    return pd.concat(frames, axis=1)

In [2]:
# Fetch data from the Binance API and create the CSV files.
# Runs binance_scraper.py as a shell command for the coins BTC and ETH at the
# 1h and 1d intervals between --start_time and --end_time. The script prints
# the path of each CSV it saves into --save_folder (one per coin/interval pair).
# NOTE(review): --config presumably selects the indicators to compute -- confirm
# against binance_scraper.py.
!python binance_scraper.py --coin "BTC,ETH" --interval "1h,1d" --start_time "2020-01-01T20:00:00" --end_time "2025-01-01T20:00:00" --config "./config_indicators.yaml" --save_folder "./data/raw_csvs"

Saved data in "./data/raw_csvs/BTC_1h_from_2020-01-01T20-00-00_to_2025-01-01T20-00-00.csv"
Saved data in "./data/raw_csvs/BTC_1d_from_2020-01-01T20-00-00_to_2025-01-01T20-00-00.csv"
Saved data in "./data/raw_csvs/ETH_1h_from_2020-01-01T20-00-00_to_2025-01-01T20-00-00.csv"
Saved data in "./data/raw_csvs/ETH_1d_from_2020-01-01T20-00-00_to_2025-01-01T20-00-00.csv"


In [3]:
# Copy the CSV file paths printed by binance_scraper.py into comb_dict below.
# The paths shown here are examples and should be replaced with the actual
# paths from your own run.
# Expected nesting (as iterated by load_dataframes):
#   source -> asset -> timeframe -> {"file_path": <path to the CSV>}
comb_dict = {
    "binance": {
        "BTC": {
            "1h": {"file_path": "./data/raw_csvs/BTC_1h_from_2020-01-01T20-00-00_to_2025-01-01T20-00-00.csv"},
            "1d": {"file_path": "./data/raw_csvs/BTC_1d_from_2020-01-01T20-00-00_to_2025-01-01T20-00-00.csv"}
        },
        "ETH": {
            "1h": {"file_path": "./data/raw_csvs/ETH_1h_from_2020-01-01T20-00-00_to_2025-01-01T20-00-00.csv"},
            "1d": {"file_path": "./data/raw_csvs/ETH_1d_from_2020-01-01T20-00-00_to_2025-01-01T20-00-00.csv"}
        }
    }
}

In [4]:
# Read every CSV listed in comb_dict; the result mirrors its nesting
# (source -> asset -> timeframe) with a DataFrame at each leaf.
dataframes = load_dataframes(comb_dict) # Load dataframes from CSV files

Loaded binance BTC 1h data: 43817 rows
Loaded binance BTC 1d data: 1827 rows
Loaded binance ETH 1h data: 43817 rows
Loaded binance ETH 1d data: 1827 rows


In [5]:
# Put the daily frames on the hourly grid so they can be merged with the 1h data:
# upsample to one row per hour, forward-fill each daily row across the new hourly
# rows, then shift all values 24 rows (i.e. 24 hours) later.
# NOTE(review): the shift presumably keeps an hourly row from seeing values of a
# daily candle that has not closed yet -- confirm.
# Caution: the "1d" entries are overwritten in place, so re-running this cell
# without reloading applies another 24-row shift.
for asset in ("BTC", "ETH"):
    asset_frames = dataframes["binance"][asset]
    asset_frames["1d"] = asset_frames["1d"].resample("1h").ffill().shift(24)

In [6]:
# Merge all frames column-wise, then keep only the rows where every column has a value
merged_df = merge_dataframes(dataframes).dropna()

# Write the merged dataset to disk as CSV
save_path = "./data/dataset.csv"
merged_df.to_csv(save_path)