## Data

In [5]:
import os
import requests
import time
import zipfile
import shutil
from tqdm import tqdm
import json
from kaggle.api.kaggle_api_extended import KaggleApi


In [6]:
# Root folder for everything this notebook downloads (relative path).
DATA_DIR = "data"
# Folder that download_files, extract_zip and download_kaggle_dataset write into.
FULL_DATA_DIR = os.path.join(DATA_DIR, "full-data")
# Local JSON file read by setup_kaggle_credentials; it must provide "username" and "key".
KAGGLE_JSON_PATH = "kaggle.json" 

### Downloading and cleaning initial data files

Data source: https://github.com/Zdong104/FNSPID_Financial_News_Dataset

#### News & Stock Market Data

In [None]:


def download_files(urls, chunk_size=1024 * 1024, timeout=(10, 60)):
    """Download each URL into ``FULL_DATA_DIR``, showing a progress bar per file.

    The local file name is the last path component of the URL. A file whose
    name ends in ``.zip`` is handed to ``extract_zip`` once it has been fully
    written. A response with a status other than 200 is reported with a
    message and skipped; the remaining URLs are still processed.

    Args:
        urls: Iterable of URL strings to fetch.
        chunk_size: Number of bytes requested per streamed chunk.
        timeout: Value passed to ``requests.get(timeout=...)``; a
            ``(connect, read)`` tuple in seconds, so a stalled server cannot
            block the cell forever.

    Raises:
        requests.RequestException: If the connection fails or times out.
        OSError: If the destination file cannot be written.
    """
    os.makedirs(FULL_DATA_DIR, exist_ok=True)

    for url in urls:
        filename = os.path.basename(url)
        filepath = os.path.join(FULL_DATA_DIR, filename)
        # Stream into a temporary name so an interrupted transfer never leaves
        # a truncated file sitting at the final path.
        part_path = filepath + ".part"
        start_time = time.time()

        # The context manager releases the connection on every path,
        # including the non-200 branch and any exception while streaming.
        with requests.get(url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                print(f"❌ Failed to download: {url}")
                continue

            total_size = int(response.headers.get("content-length", 0))

            try:
                with open(part_path, "wb") as f, tqdm(
                    # None makes tqdm show a plain counter when the size is unknown.
                    total=total_size or None,
                    unit="B",
                    unit_scale=True,
                    desc=f"Downloading {filename}",
                ) as progress_bar:
                    for chunk in response.iter_content(chunk_size):
                        f.write(chunk)
                        progress_bar.update(len(chunk))
            except BaseException:
                # Also covers KeyboardInterrupt from stopping the notebook cell.
                if os.path.exists(part_path):
                    os.remove(part_path)
                raise

            os.replace(part_path, filepath)

        elapsed_time = time.time() - start_time
        print(f"✅ Downloaded: {filepath} in {elapsed_time:.2f} seconds")

        if filename.endswith(".zip"):
            extract_zip(filepath)

def extract_zip(zip_path):
    """Unpack a ZIP archive into ``FULL_DATA_DIR`` and delete the archive.

    After extraction, ``clean_unwanted_files`` is run on ``FULL_DATA_DIR`` and
    the archive itself is removed. Nothing happens (and nothing is printed)
    when ``zip_path`` does not exist.

    Args:
        zip_path: Path of the ZIP archive to extract.
    """
    # Guard clause: a missing archive is silently ignored.
    if not os.path.exists(zip_path):
        return

    print(f"📦 Extracting {zip_path}...")
    with zipfile.ZipFile(zip_path) as archive:
        archive.extractall(path=FULL_DATA_DIR)
    print(f"✅ Extracted to {FULL_DATA_DIR}")

    # Drop archive metadata folders before discarding the ZIP itself.
    clean_unwanted_files(FULL_DATA_DIR)

    os.remove(zip_path)
    print(f"🗑️ Deleted ZIP file: {zip_path}")

def clean_unwanted_files(directory):
    """Delete the ``__MACOSX`` entry directly under ``directory``, if present.

    Args:
        directory: Folder to look in; only its immediate ``__MACOSX`` child
            is considered, and it is removed together with its contents.
    """
    junk_dir = os.path.join(directory, "__MACOSX")

    # Nothing to clean up when the folder was never created.
    if not os.path.exists(junk_dir):
        return

    shutil.rmtree(junk_dir)
    print(f"🗑️ Removed: {junk_dir}")

# FNSPID files hosted on Hugging Face:
#   - full_history.zip: stock price archive; download_files extracts it into
#     FULL_DATA_DIR and then deletes the ZIP.
#   - nasdaq_exteral_data.csv: stock news CSV, saved as-is. The spelling
#     "exteral" is the name this URL was successfully fetched under in the
#     recorded output below, so it must not be "corrected".
# NOTE: per the recorded output the CSV is about 23 GB, so this cell is slow to re-run.
urls = [
    "https://huggingface.co/datasets/Zihan1004/FNSPID/resolve/main/Stock_price/full_history.zip",
    "https://huggingface.co/datasets/Zihan1004/FNSPID/resolve/main/Stock_news/nasdaq_exteral_data.csv"
]

download_files(urls)


Downloading full_history.zip: 100%|██████████| 590M/590M [00:10<00:00, 55.2MB/s] 


✅ Downloaded: data/full-data/full_history.zip in 11.35 seconds
📦 Extracting data/full-data/full_history.zip...
✅ Extracted to data/full-data
🗑️ Removed: data/full-data/__MACOSX
🗑️ Deleted ZIP file: data/full-data/full_history.zip


Downloading nasdaq_exteral_data.csv: 100%|██████████| 23.2G/23.2G [07:38<00:00, 50.6MB/s] 

✅ Downloaded: data/full-data/nasdaq_exteral_data.csv in 459.38 seconds





#### Bitcoin Historical Data

Data source: https://www.kaggle.com/datasets/mczielinski/bitcoin-historical-data

In [None]:
def setup_kaggle_credentials(json_path):
    """Export Kaggle credentials from a local JSON file into ``os.environ``.

    Sets ``KAGGLE_USERNAME`` and ``KAGGLE_KEY`` from the file's ``"username"``
    and ``"key"`` entries and prints a confirmation message.

    Args:
        json_path: Path to the JSON credentials file.

    Raises:
        FileNotFoundError: If nothing exists at ``json_path``.
        KeyError: If the file lacks a ``"username"`` or ``"key"`` entry.
    """
    if os.path.exists(json_path):
        with open(json_path, "r") as handle:
            credentials = json.loads(handle.read())
    else:
        raise FileNotFoundError(f"❌ Kaggle credentials file not found at {json_path}")

    # The username is exported first, then the key.
    for env_var, field in (("KAGGLE_USERNAME", "username"), ("KAGGLE_KEY", "key")):
        os.environ[env_var] = credentials[field]

    print("Kaggle API credentials set up successfully!")

def download_kaggle_dataset(dataset):
    """Fetch a Kaggle dataset into ``FULL_DATA_DIR`` and unpack it there.

    Credentials are loaded from ``KAGGLE_JSON_PATH`` on every call, and
    ``clean_unwanted_files`` is run on ``FULL_DATA_DIR`` once the download has
    finished.

    Args:
        dataset: Dataset identifier passed to the Kaggle API, e.g.
            ``"mczielinski/bitcoin-historical-data"``.

    Raises:
        FileNotFoundError: If the credentials file is missing (raised by
            ``setup_kaggle_credentials``).
    """
    os.makedirs(FULL_DATA_DIR, exist_ok=True)

    # Export the credentials first so the client can authenticate with them.
    setup_kaggle_credentials(KAGGLE_JSON_PATH)
    kaggle_client = KaggleApi()
    kaggle_client.authenticate()

    print(f"Downloading {dataset} from Kaggle...")
    kaggle_client.dataset_download_files(dataset, path=FULL_DATA_DIR, unzip=True)
    print(f"✅ Downloaded and extracted: {dataset}")

    # Remove archive metadata folders that may have been unpacked.
    clean_unwanted_files(FULL_DATA_DIR)

# Fetch the Bitcoin dataset; it is downloaded and unzipped into FULL_DATA_DIR.
download_kaggle_dataset("mczielinski/bitcoin-historical-data")


Kaggle API credentials set up successfully!
Downloading mczielinski/bitcoin-historical-data from Kaggle...
Dataset URL: https://www.kaggle.com/datasets/mczielinski/bitcoin-historical-data
✅ Downloaded and extracted: mczielinski/bitcoin-historical-data


This dataset contains historical Bitcoin price data recorded at one-minute intervals. The main file included is:
- `btcusd_1-min_data.csv`: Contains one-minute price data for Bitcoin, including Open, High, Low, Close (OHLC) prices, volume, and timestamp information.

#### Ethereum Historical Data

Data source: https://www.kaggle.com/datasets/prasoonkottarathil/ethereum-historical-dataset

In [42]:
# Fetch the Ethereum dataset; it is downloaded and unzipped into FULL_DATA_DIR.
download_kaggle_dataset("prasoonkottarathil/ethereum-historical-dataset")


Kaggle API credentials set up successfully!
Downloading prasoonkottarathil/ethereum-historical-dataset from Kaggle...
Dataset URL: https://www.kaggle.com/datasets/prasoonkottarathil/ethereum-historical-dataset
✅ Downloaded and extracted: prasoonkottarathil/ethereum-historical-dataset


This dataset provides a collection of Ethereum price data at different time intervals, making it useful for market analysis and trading strategies. It includes three CSV files:

- `ETH_1H.csv`: Contains hourly Ethereum price data, including OHLC prices and volume.
- `ETH_1min.csv`: Contains one-minute Ethereum price data, similar to Bitcoin's dataset.
- `ETH_day.csv`: Contains daily Ethereum price data.