# Download NatSD Data

In [1]:
import ccxt
import pandas as pd
import time
import os

In [2]:
# Markets to download, written as "BASE/QUOTE" pairs.
symbols = [
    "XRP/USDT",
    "BTC/USDT",
    "ETH/USDT",
]

# ISO 8601 timestamps bounding the download window.
start_ts, end_ts = "2020-09-01T00:00:00Z", "2023-09-01T00:00:00Z"

In [3]:
# Directory that receives the downloaded CSV files.
download_dir = "./downloaded_data/"

# exist_ok=True turns this into a no-op when the directory already exists and
# avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(download_dir, exist_ok=True)

In [4]:
def download_data_from_binance(symbol, start_ts, end_ts):
    """Download 1-minute OHLCV candles for ``symbol`` from Binance into a CSV.

    The candles are appended to
    ``<download_dir>/<first three characters of symbol>_historical_data_1m.csv``
    (``download_dir`` is the module-level output directory) with the columns
    ``timestamp, open, high, low, close, volume, symbol``.

    If that CSV already exists, the download resumes just after the latest
    timestamp stored in it instead of starting over, so re-running the
    function does not duplicate or discard rows that were already fetched.

    Args:
        symbol: Market symbol passed to ccxt, e.g. ``"BTC/USDT"``.
        start_ts: ISO 8601 string for the first candle to download.
        end_ts: ISO 8601 string for the end of the range (exclusive).

    Returns:
        None. Results are written to disk and progress is printed.
    """
    # Initialize the Binance API
    binance = ccxt.binance()

    timeframe = "1m"  # 1 minute candles

    # Both bounds as millisecond epoch timestamps.
    since = binance.parse8601(start_ts)
    end = binance.parse8601(end_ts)

    ohlcv_columns = ["timestamp", "open", "high", "low", "close", "volume"]

    # NOTE: the file name keeps the original convention of using the first
    # three characters of the symbol, so symbols sharing a 3-character prefix
    # map to the same file.
    csv_path = os.path.join(download_dir, f"{symbol[0:3]}_historical_data_1m.csv")

    # Resume from the file this function actually writes to. A missing or
    # zero-byte file is (re)created with just the header row.
    try:
        existing_df = pd.read_csv(csv_path, usecols=["timestamp"])
    except (FileNotFoundError, pd.errors.EmptyDataError):
        pd.DataFrame(columns=ohlcv_columns + ["symbol"]).to_csv(csv_path, index=False)
    else:
        last_seen = pd.to_datetime(existing_df["timestamp"]).max()
        # A header-only file yields NaT; in that case start from start_ts.
        if not pd.isna(last_seen):
            # Timestamp.value is in nanoseconds; ccxt works in milliseconds.
            # +1 ms so the last stored candle is not fetched again.
            since = max(since, last_seen.value // 10**6 + 1)
        print(f"Resuming from timestamp: {since}")

    # Loop to paginate through the data
    while since < end:
        print(f"Since: {binance.iso8601(since)}")
        # Download a chunk of historical data
        ohlcv = binance.fetch_ohlcv(symbol, timeframe, since)

        # If no new data, stop
        if not ohlcv:
            break

        # The exchange returns a full page regardless of `end`, so drop any
        # candles at or beyond the requested end of the range.
        in_range = [candle for candle in ohlcv if candle[0] < end]

        if in_range:
            chunk_df = pd.DataFrame(in_range, columns=ohlcv_columns)

            # Convert the timestamp to a readable date format
            chunk_df["timestamp"] = pd.to_datetime(chunk_df["timestamp"], unit="ms")
            chunk_df["symbol"] = symbol

            # Append the chunk to the CSV file
            chunk_df.to_csv(csv_path, mode="a", header=False, index=False)

        # Some candles were cut off by `end`: the range is complete.
        if len(in_range) < len(ohlcv):
            break

        # Next page starts 1 ms after the last candle received
        since = ohlcv[-1][0] + 1

        # Sleep for a while to avoid hitting the API rate limit
        time.sleep(1)

In [5]:
# Fetch every configured market over the same time window, one after another.
for symbol in symbols:
    download_data_from_binance(symbol=symbol, start_ts=start_ts, end_ts=end_ts)

Since: 2020-09-01T00:00:00.000Z
Since: 2020-09-01T08:19:00.001Z
Since: 2020-09-01T16:39:00.001Z
Since: 2020-09-02T00:59:00.001Z
Since: 2020-09-02T09:19:00.001Z
Since: 2020-09-02T17:39:00.001Z
Since: 2020-09-03T01:59:00.001Z
Since: 2020-09-03T10:19:00.001Z
Since: 2020-09-03T18:39:00.001Z
Since: 2020-09-04T02:59:00.001Z
Since: 2020-09-04T11:19:00.001Z
Since: 2020-09-04T19:39:00.001Z
Since: 2020-09-05T03:59:00.001Z
Since: 2020-09-05T12:19:00.001Z
Since: 2020-09-05T20:39:00.001Z
Since: 2020-09-06T04:59:00.001Z
Since: 2020-09-06T13:19:00.001Z
Since: 2020-09-06T21:39:00.001Z
Since: 2020-09-07T05:59:00.001Z
Since: 2020-09-07T14:19:00.001Z
Since: 2020-09-07T22:39:00.001Z
Since: 2020-09-08T06:59:00.001Z
Since: 2020-09-08T15:19:00.001Z
Since: 2020-09-08T23:39:00.001Z
Since: 2020-09-09T07:59:00.001Z
Since: 2020-09-09T16:19:00.001Z
Since: 2020-09-10T00:39:00.001Z
Since: 2020-09-10T08:59:00.001Z
Since: 2020-09-10T17:19:00.001Z
Since: 2020-09-11T01:39:00.001Z
Since: 2020-09-11T09:59:00.001Z
Since: 2