In [None]:
# Import necessary libraries for AWS S3 and data handling
import boto3
import os
from minio import Minio
from huggingface_hub import list_repo_files
import requests

In [None]:
# Configuration for the source dataset and the target bucket
DATASET_ID = "Zihan1004/FNSPID"
S3_BUCKET = "fnf-bucket"

# MinIO connection settings. Each value can be overridden through an
# environment variable of the same name so real credentials never have to be
# committed with the notebook; the fallbacks are the values previously
# hardcoded here.
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "minio:9000")
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "minioadmin")
MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "minioadmin")

In [None]:
# Create the S3 client used to talk to MinIO.
# The endpoint URL is built from MINIO_ENDPOINT so the configuration cell is
# the single place where the host/port is defined.
s3_client = boto3.client(
    's3',
    endpoint_url=f"http://{MINIO_ENDPOINT}",
    aws_access_key_id=MINIO_ACCESS_KEY,
    aws_secret_access_key=MINIO_SECRET_KEY,
    use_ssl=False
)

In [None]:
# Check if the S3 bucket exists, create it if not.
# Only a "not found" answer from head_bucket triggers creation; any other
# client error (e.g. access denied) is re-raised instead of being mistaken
# for a missing bucket.
try:
    s3_client.head_bucket(Bucket=S3_BUCKET)
    print(f"Bucket '{S3_BUCKET}' already exists")
except s3_client.exceptions.ClientError as err:
    error_code = err.response.get("Error", {}).get("Code")
    if error_code not in ("404", "NoSuchBucket", "NotFound"):
        raise
    s3_client.create_bucket(Bucket=S3_BUCKET)
    print(f"Created bucket '{S3_BUCKET}'")

Bucket 'fnf-bucket' already exists


In [None]:
# Ask the Hugging Face Hub for every file path in the dataset repository
print("\nFetching file list from Hugging Face...")
files = list_repo_files(repo_id=DATASET_ID, repo_type="dataset")


Fetching file list from Hugging Face...


In [None]:
# Display the list of repository file paths returned by list_repo_files
files

['README.md',
 'Stock_news/All_external.csv',
 'Stock_news/nasdaq_exteral_data.csv',
 'Stock_price/full_history.zip']

In [None]:
# Function to stream the news CSV from Hugging Face into S3
def get_news_data():
    """Stream the NASDAQ news CSV from Hugging Face into the S3 bucket.

    The HTTP response stream is handed directly to ``upload_fileobj`` and
    stored under ``bronze/stock_news/nasdaq_exteral_data.csv``.

    Raises:
        requests.HTTPError: if Hugging Face answers with an error status.

    Upload failures are not raised; they are reported with a printed message.
    """
    hf_url = f"https://huggingface.co/datasets/{DATASET_ID}/resolve/main/Stock_news/nasdaq_exteral_data.csv"
    s3_key = "bronze/stock_news/nasdaq_exteral_data.csv"
    # The context manager releases the HTTP connection even if the upload
    # fails. timeout is (connect, read) in seconds so a stalled server cannot
    # hang the kernel indefinitely.
    with requests.get(hf_url, stream=True, timeout=(10, 300)) as response:
        response.raise_for_status()
        # response.raw bypasses requests' content decoding; enable it so a
        # gzip/deflate-encoded transfer is stored as the real file bytes.
        response.raw.decode_content = True
        try:
            s3_client.upload_fileobj(
                response.raw,
                S3_BUCKET,
                s3_key
            )
            print("Upload successful!")
        except Exception as e:
            print(f"Error uploading to S3: {e}")

In [None]:
# Run the news ingestion: uploads the news CSV to the S3 bucket
get_news_data()

Upload successful!


In [None]:
# Function to stream the stock price archive from Hugging Face into S3
def get_stocks_data():
    """Stream the stock price ZIP archive from Hugging Face into the S3 bucket.

    The HTTP response stream is handed directly to ``upload_fileobj`` and
    stored under ``bronze/stock_price/full_history.zip``.

    Raises:
        requests.HTTPError: if Hugging Face answers with an error status.

    Upload failures are not raised; they are reported with a printed message.
    """
    hf_url = f"https://huggingface.co/datasets/{DATASET_ID}/resolve/main/Stock_price/full_history.zip"
    s3_key = "bronze/stock_price/full_history.zip"
    # The context manager releases the HTTP connection even if the upload
    # fails. timeout is (connect, read) in seconds so a stalled server cannot
    # hang the kernel indefinitely.
    with requests.get(hf_url, stream=True, timeout=(10, 300)) as response:
        response.raise_for_status()
        # response.raw bypasses requests' content decoding; enable it so a
        # gzip/deflate-encoded transfer is stored as the real file bytes.
        response.raw.decode_content = True
        try:
            s3_client.upload_fileobj(
                response.raw,
                S3_BUCKET,
                s3_key
            )
            print("Upload successful!")
        except Exception as e:
            print(f"Error uploading to S3: {e}")

In [None]:
# Run the stock price ingestion: uploads full_history.zip to the S3 bucket
get_stocks_data()

Upload successful!


In [None]:
# Import additional libraries for data processing
import pandas as pd
import requests
from io import StringIO

In [None]:
# Wikipedia page listing the S&P 500 companies, and the User-Agent header
# sent along with the request for it
url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
}

In [None]:
# Fetch the S&P 500 companies page from Wikipedia and parse its HTML tables.
# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() stops here on an HTTP error instead of handing an
# error page to read_html.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
tables = pd.read_html(StringIO(response.text))

In [None]:
# Take the first table parsed from the page. It is assumed to be the
# S&P 500 companies table (its 'Symbol' column is read afterwards) —
# verify this if the Wikipedia page layout changes.
sp500_table = tables[0]

In [None]:
# Collect the values of the table's 'Symbol' column into a plain Python list
stock_list = sp500_table['Symbol'].to_list()

In [None]:
# Function to extract CSV files from a ZIP archive in S3
from io import BytesIO
import zipfile
from tqdm import tqdm
import os

def extract_zip(stock_list):
    """Copy the CSVs of the requested stocks out of the ZIP stored in S3.

    Downloads ``bronze/stock_price/full_history.zip`` from the bucket and, for
    every ``.csv`` member whose file name (without extension, upper-cased) is
    one of the requested symbols, uploads that member to
    ``bronze/stock_price/<member path inside the archive>``.

    Args:
        stock_list: Iterable of ticker symbols to keep. Matching ignores case
            and surrounding whitespace; non-string entries are ignored.

    Returns:
        None.
    """
    # Normalise the requested symbols the same way file names are normalised
    # below (upper case) so that e.g. "aapl" still matches "AAPL.csv".
    stock_set = {
        symbol.strip().upper()
        for symbol in stock_list
        if isinstance(symbol, str)
    }
    if not stock_set:
        # Nothing can match, so skip downloading the archive altogether.
        return

    # Get the object and read from the Body
    response = s3_client.get_object(
        Bucket=S3_BUCKET,
        Key="bronze/stock_price/full_history.zip"
    )
    # ZipFile needs a seekable file object, so the archive is buffered in
    # memory; the streaming body is closed as soon as it has been read.
    body = response['Body']
    try:
        zip_bytes = BytesIO(body.read())
    finally:
        body.close()

    with zipfile.ZipFile(zip_bytes, "r") as z:
        # Select the members to upload up front so the progress bar reflects
        # the files actually uploaded rather than every entry in the archive.
        wanted = [
            name for name in z.namelist()
            if "__MACOSX" not in name
            and name.endswith(".csv")
            and os.path.splitext(os.path.basename(name))[0].upper() in stock_set
        ]

        for name in tqdm(wanted, desc="Extracting ZIP", unit="file"):
            file_data = z.read(name)

            s3_client.put_object(
                Bucket=S3_BUCKET,
                Key=f"bronze/stock_price/{name}",
                Body=file_data,
                ContentLength=len(file_data)
            )

In [None]:
# Extract the CSVs for the symbols in stock_list from the ZIP file in S3
extract_zip(stock_list)

Extracting ZIP: 100%|██████████| 7693/7693 [01:09<00:00, 111.23file/s]
