In [10]:
import yfinance as yf
import pandas as pd
import os
from datetime import datetime, timezone

In [12]:
# Yahoo Finance symbols to download, keyed by the commodity label that the
# rest of the notebook uses for folder, file and column names.
tickers = dict(coal="MTF=F", gas="TTF=F")

# Date window handed to the price-history request.
# NOTE(review): yfinance documents `end` as exclusive, so 2024-07-31 itself
# is presumably not part of the download - confirm this is intended.
start_date, end_date = "2019-01-01", "2024-07-31"

def fetch_data(ticker, start=None, end=None):
    """Return the 'Close' column of a ticker's price history from yfinance.

    Parameters
    ----------
    ticker : str
        Symbol handed to ``yf.Ticker``.
    start, end : optional
        Bounds forwarded to ``Ticker.history``. When left as ``None`` the
        notebook-level ``start_date`` / ``end_date`` are used, so the
        one-argument call ``fetch_data(ticker)`` behaves as before.

    Returns
    -------
    The ``'Close'`` entry of whatever ``Ticker.history`` returns.
    """
    # Resolve the defaults at call time so that edits to the notebook-level
    # date window are picked up without redefining this function.
    if start is None:
        start = start_date
    if end is None:
        end = end_date

    # Named `history` (not `data`) to avoid shadowing the notebook's `data` dict.
    history = yf.Ticker(ticker).history(start=start, end=end)
    return history['Close']

# Download each commodity's closing prices and re-express the timestamps in
# UTC (the index stays timezone-aware) before storing the series by label.
data = {}
for commodity, ticker in tickers.items():
    series = fetch_data(ticker)
    series.index = pd.to_datetime(series.index).tz_convert(timezone.utc)
    data[commodity] = series

def save_as_parquet(series, commodity, root="raw"):
    """Write a price series to parquet files, one file per calendar year.

    Files are laid out as ``<root>/<commodity>/<year>/<commodity>_<year>.parquet``.
    Each file holds two columns: ``'Date'`` (the series index) and a column
    named after ``commodity`` (the series values); the DataFrame index is
    not written.

    Parameters
    ----------
    series : pandas.Series
        Values indexed by datetimes. The year is read from the index as-is
        (via ``.dt.year``), so it follows whatever timezone the index carries.
    commodity : str
        Label used for the sub-folder, the file name and the value column.
    root : str, optional
        Top-level output directory. Defaults to ``"raw"``, the location that
        was previously hard-coded.

    Returns
    -------
    None
        The function prints one ``Saved <path>`` line per file written.
    """
    base_folder = os.path.join(root, commodity)
    # Created up front so the commodity folder exists even if there are no rows.
    os.makedirs(base_folder, exist_ok=True)

    # Flatten the series into an explicit two-column frame: Date first, then values.
    df = pd.DataFrame({'Date': series.index, commodity: series.values})

    # One output file per year found in the Date column.
    for year, year_data in df.groupby(df['Date'].dt.year):
        year_folder = os.path.join(base_folder, str(year))
        os.makedirs(year_folder, exist_ok=True)

        file_path = os.path.join(year_folder, f"{commodity}_{year}.parquet")
        year_data.to_parquet(file_path, index=False)
        print(f"Saved {file_path}")

# Write every commodity's prices to disk (one parquet file per year)
for commodity, prices in data.items():
    save_as_parquet(prices, commodity)

print("Data processing and saving completed.")

# Show a short preview of each series that was just saved
for commodity, prices in data.items():
    heading = f"\nFirst few rows of {commodity.capitalize()} data:"
    print(heading, prices.head(), sep="\n")

Saved raw/coal/2019/coal_2019.parquet
Saved raw/coal/2020/coal_2020.parquet
Saved raw/coal/2021/coal_2021.parquet
Saved raw/coal/2022/coal_2022.parquet
Saved raw/coal/2023/coal_2023.parquet
Saved raw/coal/2024/coal_2024.parquet
Saved raw/gas/2019/gas_2019.parquet
Saved raw/gas/2020/gas_2020.parquet
Saved raw/gas/2021/gas_2021.parquet
Saved raw/gas/2022/gas_2022.parquet
Saved raw/gas/2023/gas_2023.parquet
Saved raw/gas/2024/gas_2024.parquet
Data processing and saving completed.

First few rows of Coal data:
Date
2019-01-02 05:00:00+00:00    87.449997
2019-01-03 05:00:00+00:00    83.699997
2019-01-04 05:00:00+00:00    83.349998
2019-01-07 05:00:00+00:00    81.599998
2019-01-08 05:00:00+00:00    82.150002
Name: Close, dtype: float64

Last few rows of Coal data:
Date
2019-01-02 05:00:00+00:00    87.449997
2019-01-03 05:00:00+00:00    83.699997
2019-01-04 05:00:00+00:00    83.349998
2019-01-07 05:00:00+00:00    81.599998
2019-01-08 05:00:00+00:00    82.150002
Name: Close, dtype: float64

Fi