### Config

In [1]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from chronos_mlx import ChronosPipeline
# Lift pandas' column display limit so wide DataFrames are shown with every column.
pd.set_option('display.max_columns', None)

In [2]:
# Directory holding the processed M3 data. The default is the original author's
# local folder; set the M3_DATA_DIR environment variable to run the notebook on
# another machine without editing this cell.
M3_DATA_DIR = Path(os.environ.get(
    'M3_DATA_DIR',
    '/Users/tomaltenborg/Documents/Master/Master thesis/Notebooks/M3 Data',
))

# Processed quarterly M3 series; later cells read the columns
# 'Series', 'Date', 'Value', 'N' and 'NF' from this frame.
quart = pd.read_parquet(M3_DATA_DIR / 'M3_quarter_processed.parquet')

## Loop and Produce Forecasts for All Quarterly Series

In [3]:
# Load the pretrained Chronos model once; every series below reuses this pipeline.
pipeline = ChronosPipeline.from_pretrained(
    "amazon/chronos-t5-base",
    dtype="bfloat16",
)

unique_series = quart['Series'].unique()

# series_id -> {'Date', 'low', 'median', 'high'} for every series that was forecast
forecasts = {}

# IDs of series whose prediction raised, so skipped series can be inspected afterwards
# instead of being lost in the printed log.
failed_series = []

# Number of successful forecasts since the last progress message
counter = 0

# NOTE(review): forecasts are sample-based and no random seed is set in this notebook,
# so repeated runs are presumably not bit-for-bit reproducible — confirm.
for series_id in unique_series:

    series_data = quart[quart['Series'] == series_id]

    # Ensure the data is sorted by date
    series_data = series_data.sort_values('Date')

    # N and NF are stored redundantly on every row of the series; read them from the
    # first row and coerce to plain ints because they are used for positional slicing
    # and as the `periods` count below.
    # Assumes N is the total number of rows of the series and NF the number of trailing
    # hold-out observations — TODO confirm against the preprocessing notebook.
    N = int(series_data['N'].iloc[0])
    NF = int(series_data['NF'].iloc[0])

    # Get the date of the last observation used for training
    last_training_date = series_data['Date'].iloc[N-NF-1]

    # Prepare the input data for the model
    training_data = series_data['Value'].iloc[:N-NF].values  # Exclude the last NF values

    # Calculate forecast dates assuming quarterly data.
    # freq='Q' yields quarter-end timestamps, so these dates can differ from the ones
    # stored in `quart` while still identifying the same quarter.
    # NOTE(review): newer pandas releases prefer the alias 'QE' for quarter end.
    forecast_dates = pd.date_range(start=last_training_date, periods=NF+1, freq='Q')[1:]  # Skip the first since it's the last training date

    # Predict the next NF values
    try:
        model_forecasts = pipeline.predict(
            context=training_data,
            prediction_length=NF,
            # NOTE(review): the number of sample paths is tied to the training length
            # here; confirm this is intended rather than a fixed sample count.
            num_samples=N-NF
        )
    except Exception as e:
        # Best effort: report the failure, remember the series and carry on.
        print(f"Failed to predict for series {series_id} with error: {e}")
        failed_series.append(series_id)
        continue

    # Progress message after every 100 successful forecasts
    # (the previous `> 100` check only fired after 101).
    counter += 1
    if counter >= 100:
        print('100 series forecasted')
        counter = 0

    # Quantiles for uncertainty bounds: 10%, 50% and 90% taken along the first axis
    # of model_forecasts[0].
    low, median, high = np.quantile(model_forecasts[0], [0.1, 0.5, 0.9], axis=0)
    forecasts[series_id] = {
        'Date': forecast_dates,
        'low': low,
        'median': median,
        'high': high
    }

# Convert forecasts dictionary to a DataFrame for easier manipulation.
# Each series becomes one small frame (one row per forecast date) and all frames are
# concatenated in a single call: growing a DataFrame with pd.concat inside the loop
# re-copies every accumulated row on each iteration.
forecast_frames = [
    pd.DataFrame({
        'Date': data['Date'],
        'Series_ID': series_id,
        'Low': data['low'],
        'Median': data['median'],
        'High': data['high']
    })
    for series_id, data in forecasts.items()
]

if forecast_frames:
    forecast_df = pd.concat(forecast_frames, ignore_index=True)
else:
    # pd.concat raises ValueError on an empty list; keep an empty frame with the
    # expected columns so later cells still run when nothing was forecast.
    forecast_df = pd.DataFrame(columns=['Date', 'Series_ID', 'Low', 'Median', 'High'])

# forecast_df.set_index(['Date', 'Series_ID'], inplace=True)
print('Forecasting completed for all series.')

100 series forecasted
100 series forecasted
100 series forecasted
100 series forecasted
100 series forecasted
100 series forecasted
100 series forecasted
Forecasting completed for all series.


In [4]:
# Show the assembled forecasts: one row per (series, forecast date) with Low/Median/High columns.
forecast_df

Unnamed: 0,Date,Series_ID,Low,Median,High
0,1993-03-31,646,5395.203768,5562.237940,5645.755026
1,1993-06-30,646,5311.686681,5545.534523,5745.975529
2,1993-09-30,646,5211.466178,5545.534523,5796.085781
3,1993-12-31,646,5194.762761,5495.424271,5846.196033
4,1994-03-31,646,5144.652509,5478.720854,5996.526788
...,...,...,...,...,...
6043,1974-12-31,1401,3552.554252,4674.413490,5344.706745
6044,1975-03-31,1401,3383.217009,4498.020528,5376.457478
6045,1975-06-30,1401,2910.483871,4533.299120,5753.938416
6046,1975-09-30,1401,3203.296188,4815.527859,5965.609971


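Slight issue: the forecast dates differ slightly from those in the processed parquet file. They fall in the same quarter, but `pd.date_range(..., freq='Q')` generates quarter-end dates, whereas the parquet file uses dates at the start of the month. This should hardly matter as long as forecasts are matched to actuals by quarter rather than by exact date.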

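Still need to decide how to use the low, median and high forecasts (the 10%, 50% and 90% quantiles of the sampled forecasts). Using just the median as the point forecast seems natural. For now, save all three.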

In [5]:
# Persist the complete forecast table (Low, Median and High) as a gzip-compressed
# parquet file written with pyarrow; the row index is not stored.
forecast_df.to_parquet(
    'M3_Chronos_quarter_forecast.parquet',
    engine='pyarrow',
    compression='gzip',
    index=False,
)