In [1]:
import yfinance as yf
import pandas as pd
import os
from datetime import datetime, timedelta
from pytz import timezone, UTC

In [2]:
# Yahoo Finance symbols per commodity, plus the date window to download
tickers = dict(
    coal="MTF=F",  # API2 Rotterdam Coal Futures
    gas="TTF=F",   # Title Transfer Facility (TTF) Natural Gas Futures
)
start_date = "2019-01-01"
current_date = datetime.today().date()
# NOTE(review): the name says "two days ago" but the offset is one day — confirm intent
two_days_ago = current_date - timedelta(days=1)
# ISO form (YYYY-MM-DD), the same layout as start_date
end_date = two_days_ago.isoformat()

# Function to fetch data for a single ticker
def fetch_data(ticker, start=None, end=None):
    """
    Download the closing-price history for a single ticker via yfinance.

    Args:
        ticker (str): The ticker symbol passed to ``yf.Ticker``.
        start (str, optional): Start of the requested range. When omitted,
            the notebook-level ``start_date`` is used.
        end (str, optional): End of the requested range. When omitted, the
            notebook-level ``end_date`` is used.
            NOTE(review): yfinance documents ``end`` as exclusive — confirm.

    Returns:
        pd.Series: The 'Close' column of the history returned by yfinance.
    """
    # Fall back to the notebook-wide window so existing one-argument calls
    # behave exactly as before.
    if start is None:
        start = start_date
    if end is None:
        end = end_date

    history = yf.Ticker(ticker).history(start=start, end=end)
    return history['Close']

# Download the closing-price series for every configured commodity
data = {commodity: fetch_data(ticker) for commodity, ticker in tickers.items()}

# Re-express each series' timestamps in UTC; the index stays timezone-aware
for commodity, prices in data.items():
    prices.index = pd.to_datetime(prices.index).tz_convert(UTC)

# Function to write one commodity's series to per-year parquet files
def save_as_parquet(series, commodity):
    """
    Write a time series to disk as one parquet file per calendar year.

    All files land directly in ``raw/<commodity>/`` (created if missing) and
    are named ``<commodity>_<year>.parquet``; there are no per-year folders.
    Each file holds two columns: 'Date' (taken from the series index) and one
    column named after the commodity (the series values).

    Args:
        series (pd.Series): The time series to save; its index must support
            the ``.dt.year`` accessor once placed in a column.
        commodity (str): The commodity name, used for the folder, the file
            names and the value column.

    Returns:
        None
    """
    out_dir = os.path.join("raw", commodity)
    os.makedirs(out_dir, exist_ok=True)

    # Flatten the series into a two-column frame: timestamps first, then values
    frame = pd.DataFrame({'Date': series.index, commodity: series.values})

    # Bucket the rows by the calendar year of their timestamp
    yearly_chunks = frame.groupby(frame['Date'].dt.year)

    for yr, chunk in yearly_chunks:
        target = os.path.join(out_dir, f"{commodity}_{yr}.parquet")

        # The row index is dropped; only the two columns are written
        chunk.to_parquet(target, index=False)
        print(f"Saved {target}")

# Write every commodity's series to its per-year parquet files
for commodity in data:
    save_as_parquet(data[commodity], commodity)

print(f"Data processing and saving completed. Data fetched up to {end_date}")

# Preview the head of each series as a quick sanity check
for commodity, prices in data.items():
    heading = f"\nFirst few rows of {commodity.capitalize()} data:"
    print(heading)
    print(prices.head())

Saved raw/coal/coal_2019.parquet
Saved raw/coal/coal_2020.parquet
Saved raw/coal/coal_2021.parquet
Saved raw/coal/coal_2022.parquet
Saved raw/coal/coal_2023.parquet
Saved raw/coal/coal_2024.parquet
Saved raw/gas/gas_2019.parquet
Saved raw/gas/gas_2020.parquet
Saved raw/gas/gas_2021.parquet
Saved raw/gas/gas_2022.parquet
Saved raw/gas/gas_2023.parquet
Saved raw/gas/gas_2024.parquet
Data processing and saving completed. Data fetched up to 2024-08-06

First few rows of Coal data:
Date
2019-01-02 05:00:00+00:00    87.449997
2019-01-03 05:00:00+00:00    83.699997
2019-01-04 05:00:00+00:00    83.349998
2019-01-07 05:00:00+00:00    81.599998
2019-01-08 05:00:00+00:00    82.150002
Name: Close, dtype: float64

First few rows of Gas data:
Date
2019-01-02 05:00:00+00:00    22.475000
2019-01-03 05:00:00+00:00    22.254999
2019-01-04 05:00:00+00:00    22.930000
2019-01-07 05:00:00+00:00    21.530001
2019-01-08 05:00:00+00:00    22.090000
Name: Close, dtype: float64
