# Part 0 - Data Preparation
Before I can start, I need transaction data. I'm using BTC-USDT trades from Binance Perpetual Futures.
The data was downloaded from https://www.binance.com/en/landing/data .

The avg monthly data size *compressed* is almost 900MB. 
The avg monthly data size *uncompressed* csv is 4.7GB. 

CSV files are heavy and slow to load into pandas, so the first thing I do is convert them to Parquet, which I can reload much faster later and which needs less storage.

The avg monthly parquet data size is 1.3GB.


In [3]:
import os

import numpy as np
from zipfile import ZipFile
import pandas as pd
from tqdm import tqdm as TQDM
from tqdm.notebook import tqdm


def extract_rar(file_path: str, extract_to: str, pbar: TQDM) -> str | None:
    try:
        pbar.set_description(f"Extracting: {file_path}")
        with ZipFile(file_path, 'r') as zObject:
            zObject.extractall(extract_to)
        pbar.set_description(f"Extracted: {file_path}")
        return os.path.join(extract_to, zObject.namelist()[0])
    except Exception as e:
        print(f"Extraction failed: {file_path}, {str(e)}")
        return None


def load_csv_with_columns(file_path):
    """
    Read a trades CSV into a DataFrame, adding the column names when the file has no header row.

    The first line is probed as if it were a header. If it contains every expected column
    name the file is read normally; otherwise the file is read header-less and the expected
    names are assigned to the columns.
    """
    expected = ['id', 'price', 'qty', 'quote_qty', 'time', 'is_buyer_maker']
    probe = pd.read_csv(file_path, nrows=1)

    missing = [name for name in expected if name not in probe.columns]
    if missing:
        # No usable header row: every line is data, so name the columns manually.
        frame = pd.read_csv(file_path, header=None)
        frame.columns = expected
        return frame

    return pd.read_csv(file_path)


def process_csv_and_convert_to_parquet(csv_file_path: str, pbar: TQDM) -> str | None:
    """
    Load a trades CSV and save it next to the source as a Parquet file.

    :param csv_file_path: path of the CSV file to convert.
    :param pbar: progress bar whose description is updated at each step.
    :return: path of the written Parquet file (the input path with its extension replaced
        by ``.parquet``), or ``None`` when reading or writing fails (the error is printed,
        not raised).
    """
    try:
        pbar.set_description(f"Read: {csv_file_path}")
        df = load_csv_with_columns(csv_file_path)

        # Swap only the file extension: str.replace would also rewrite a '.csv' that appears
        # earlier in the path, and would leave a path without '.csv' unchanged, so the
        # Parquet output would overwrite the source file.
        parquet_file_path = os.path.splitext(csv_file_path)[0] + '.parquet'
        pbar.set_description(f"Save as {parquet_file_path}")
        df.to_parquet(parquet_file_path)
        pbar.set_description(f"Saved as {parquet_file_path}")

        return parquet_file_path
    except Exception as e:
        # Best effort: report the failure and let the caller continue with the next file.
        print(f"Failed to process {csv_file_path}: {str(e)}")
        return None


def iterate_and_process(folder_path: str, file_names: list[str] | None = None) -> None:
    """
    Extract the ZIP archives of a folder and convert each extracted CSV to Parquet.

    :param folder_path: folder holding the ``.zip`` archives; archives are extracted into it.
    :param file_names: optional explicit list of archives to process, given as names relative
        to ``folder_path`` or as absolute paths. When omitted, every file found directly in
        ``folder_path`` is considered, in sorted order.
    """
    # Check that the folder exists; a plain file would make the directory scan below fail.
    if not os.path.isdir(folder_path):
        print(f"The folder '{folder_path}' does not exist.")
        return

    if file_names is None:
        # Only the top level of the folder is scanned; sub-folders are not visited.
        top_level = next(os.walk(folder_path), None)
        file_names = sorted(top_level[2]) if top_level else []

    # Keep only the archives so the progress bar counts real work items.
    zip_paths = [os.path.join(folder_path, name) for name in file_names if name.endswith('.zip')]

    for zip_path in (pbar := tqdm(zip_paths)):
        extracted_csv_path = extract_rar(zip_path, folder_path, pbar)

        # Skip archives that failed to extract or whose first member is not a CSV.
        if extracted_csv_path and extracted_csv_path.endswith('.csv'):
            process_csv_and_convert_to_parquet(extracted_csv_path, pbar)


# Folder that holds the downloaded trade archives (machine-specific path).
transaction_path = r'/Volumes/Extreme Pro/transactions'
# Run the extraction and CSV-to-Parquet conversion for that folder.
iterate_and_process(transaction_path)


  0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
import humanize
import os
import numpy as np


def get_file_sizes(folder_path, file_type):
    """
    Print the mean, standard deviation and total size of the files of one type under a folder.

    The folder is searched recursively. All three figures are printed in binary units.

    :param folder_path: root folder to search.
    :param file_type: file extension to match, with or without the leading dot (e.g. ``'csv'``).
    :return: list with the size in bytes of every matching file; empty when the folder does
        not exist or contains no matching file.
    """
    file_sizes = []

    # Check if the folder path exists
    if not os.path.exists(folder_path):
        print(f"The folder '{folder_path}' does not exist.")
        return file_sizes

    # Match on the real extension so that e.g. 'zip' does not match a file named 'unzip'.
    suffix = '.' + file_type.lstrip('.')

    # Iterate over the files in the folder
    for root, _dirs, files in os.walk(folder_path):
        for file in files:
            if file.endswith(suffix):
                file_sizes.append(os.path.getsize(os.path.join(root, file)))

    # Without this guard the mean below divides by zero.
    if not file_sizes:
        print(f'No {file_type} files found in {folder_path}')
        return file_sizes

    mean_size = sum(file_sizes) / len(file_sizes)
    print(f'Mean {file_type} file size {humanize.naturalsize(mean_size, binary=True)}')
    print(f'Std {file_type} file size {humanize.naturalsize(np.std(file_sizes), binary=True)}')
    print(f'Total {file_type} file size {humanize.naturalsize(np.sum(file_sizes), binary=True)}')
    return file_sizes


# Specify the folder path
# NOTE(review): this Windows-style path differs from the '/Volumes/Extreme Pro/transactions'
# folder used by the conversion cell, and the recorded output below reports that it does
# not exist. Confirm which path is the current one.
transaction_path = r'E:\transactions'
# Report size statistics separately for the zip, csv and parquet files under that folder.
print('Zip file sizes:')
get_file_sizes(transaction_path, file_type='zip')
print('CSV file sizes:')
get_file_sizes(transaction_path, file_type='csv')
print('Parquet file sizes:')
get_file_sizes(transaction_path, file_type='parquet')

Zip file sizes:
The folder 'E:\transactions' does not exist.
CSV file sizes:
The folder 'E:\transactions' does not exist.
Parquet file sizes:
The folder 'E:\transactions' does not exist.


[]

In the next chapter, we work on bar generation. It's impossible to work with data of this size on a local PC (mine has 64GB of RAM).
So my bar generators are rolling, meaning that I iterate over the monthly transactions and create bars for each month (each generator accepts the last bar's transactions so that the last bar can be rolled over).