In [42]:
# !pip install -r /content/drive/MyDrive/thesis_nova_2025/requirements.txt

In [43]:
# from google.colab import drive
# drive.mount('/content/drive')

In [44]:
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error

import pandas as pd
import numpy as np

import psycopg2
from psycopg2 import sql, OperationalError
from sqlalchemy import create_engine


pd.set_option('display.max_columns', None)

In [45]:
SHIFT = 10

# Database connection

In [46]:
DB_HOST = ''
DB_PORT = ''
DB_NAME = ''
USERNAME = ''
PASSWORD = ''
DB_URL = f'postgresql://{USERNAME}:{PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}'


def create_connection():
    try:
        conn = psycopg2.connect(
            host=DB_HOST,
            port=DB_PORT,
            database=DB_NAME,
            user=USERNAME,
            password=PASSWORD,
            sslmode="disable"
        )
        print("[OK] -- Connection successful")
        return conn
    except OperationalError as e:
        print(f"[FAILED] -- Connection failed: {e}")
        return None

def close_connection(conn):
    if conn:
        conn.close()
        print("[OK] -- Connection closed")

def get_engine():
    try:
        engine = create_engine(DB_URL)
        return engine

    except Exception as e:
        return None

In [47]:
df = pd.read_sql("select * from spy_ohlcv_proc where date > '1994-01-01'", get_engine())
df = df.drop(columns = ['ticker'])
print(f'Data size: {df.shape[0]} x {df.shape[1]}')
df.head()

Data size: 7894 x 40


Unnamed: 0,date,open,high,low,close,volume,dividends,stock_splits,capital_gains,sma_7,ema_7,sma_14,ema_14,sma_20,ema_20,sma_50,ema_50,sma_100,ema_100,sma_200,ema_200,ema_12,ema_26,macd,macd_signal,rsi,bb_mid,bb_std,bb_upper,bb_lower,obv,mom_10,mom_20,mom_50,atr_14,atr_30,signal_sma,signal_macd,signal_rsi,signal_sma_cross
0,1994-01-03 08:00:00,26.582126,26.617783,26.475156,26.510813,960900,0.0,0.0,0.0,26.714566,26.645905,26.605839,26.616542,26.577265,26.579774,26.424288,26.431779,26.234339,26.208814,25.664632,25.939678,26.628583,26.546541,0.082042,0.088916,48.132827,26.577265,0.151651,26.880568,26.273963,3501900,-0.05349,0.038597,0.109415,0.140069,0.138555,1,0,0,1
1,1994-01-04 08:00:00,26.54648,26.617794,26.510824,26.617794,164300,0.0,0.0,0.0,26.706928,26.638877,26.623827,26.616709,26.580117,26.583395,26.431095,26.439074,26.24676,26.216985,25.672522,25.947129,26.626923,26.551819,0.075104,0.086154,60.074843,26.580117,0.151861,26.883838,26.276395,3666200,0.017841,0.057032,0.34033,0.132533,0.13858,1,0,0,1
2,1994-01-05 08:00:00,26.653432,26.689089,26.546462,26.671261,710900,0.0,0.0,0.0,26.68655,26.646973,26.648163,26.623983,26.586527,26.591763,26.437554,26.44818,26.259365,26.22606,25.680417,25.955077,26.633744,26.560666,0.073078,0.083539,63.436834,26.586527,0.152916,26.892359,26.280694,4377100,0.160448,0.128202,0.322975,0.136396,0.135071,1,0,0,1
3,1994-01-06 08:00:00,26.706918,26.724746,26.635604,26.671261,201000,0.0,0.0,0.0,26.658532,26.653045,26.666175,26.630286,26.592937,26.599334,26.444368,26.456929,26.270561,26.234952,25.688312,25.96293,26.639516,26.568859,0.070657,0.080962,60.691567,26.592937,0.153683,26.900303,26.28557,4377100,-0.017841,0.128202,0.340708,0.135174,0.13273,1,0,0,1
4,1994-01-07 08:00:00,26.724761,26.84956,26.653448,26.831732,775500,0.0,0.0,0.0,26.658532,26.697717,26.685277,26.657146,26.611798,26.621467,26.454037,26.471628,26.282483,26.24687,25.696223,25.972447,26.669088,26.588331,0.080757,0.080921,61.193875,26.611798,0.158861,26.92952,26.294076,5152600,0.160471,0.37722,0.483446,0.137537,0.135136,1,0,0,1


In [48]:
df_L5Y = df[df['date']>='2020-01-01'].reset_index(drop=True)
df_L10Y = df[df['date']>='2015-01-01'].reset_index(drop=True)
df_L15Y = df[df['date']>='2010-01-01'].reset_index(drop=True)
df_L20Y = df[df['date']>='2005-01-01'].reset_index(drop=True)


stats = pd.DataFrame({
    'L5Y': [df_L5Y.shape[0]],
    'L10Y': [df_L10Y.shape[0]],
    'L15Y': [df_L15Y.shape[0]],
    'L20Y': [df_L20Y.shape[0]]
}).T

stats.columns = ['Size']
stats.index.name = 'Horizon'
stats

Unnamed: 0_level_0,Size
Horizon,Unnamed: 1_level_1
L5Y,1348
L10Y,2606
L15Y,3864
L20Y,5123


# Features and target

In [49]:
target = f'close'

features = [_ for _ in df.columns.tolist() if _ not in ['date', 'open', 'high', 'low', target]]

prediction_length = SHIFT

print(f'Features: {len(features)}')
print(f'Target: {target}')
print(f'Horizon: {prediction_length}')

Features: 35
Target: close
Horizon: 10


# Chronos Bolt testing

In [50]:
EVAL_METRIC = "MAPE"
TIME_LIMIT = 600 # seconds

frames = {
    'L5Y': df_L5Y,
    'L10Y': df_L10Y,
    'L15Y': df_L15Y,
    'L20Y': df_L20Y
}

hyperparameters = {
    "AutoARIMAModel": {},  # Keep as baseline
    "Chronos": [
        # Zero-shot models (your best performers)
        {
            "model_path": "bolt_base",
            "ag_args": {"name_suffix": "ZeroShot"}
        },
        {
            "model_path": "bolt_small",
            "ag_args": {"name_suffix": "ZeroShot"}
        },

        # Very conservative fine-tuning for financial data
        {
            "model_path": "bolt_base",
            "fine_tune": True,
            "fine_tune_lr": 1e-6,           # Much lower LR for financial stability
            "fine_tune_steps": 200,         # Fewer steps to prevent overfitting
            "ag_args": {"name_suffix": "FT_lr1e-6_steps200"}
        },
        {
            "model_path": "bolt_base",
            "fine_tune": True,
            "fine_tune_lr": 5e-6,           # Slightly higher but still conservative
            "fine_tune_steps": 300,
            "ag_args": {"name_suffix": "FT_lr5e-6_steps300"}
        },
        {
            "model_path": "bolt_base",
            "fine_tune": True,
            "fine_tune_lr": 1e-7,           # Extremely conservative
            "fine_tune_steps": 500,         # More steps but tiny learning rate
            "ag_args": {"name_suffix": "FT_lr1e-7_steps500"}
        },

        # Try bolt_small fine-tuning (smaller model, less overfitting risk)
        {
            "model_path": "bolt_small",
            "fine_tune": True,
            "fine_tune_lr": 1e-6,
            "fine_tune_steps": 400,
            "ag_args": {"name_suffix": "FT_lr1e-6_steps400"}
        }
    ]
}



In [51]:
def prepare_ts_frame(df, ticker):
  data = df.copy()
  data['item_id'] = ticker

  ts_data = TimeSeriesDataFrame.from_data_frame(
      df=data,
      id_column='item_id',  # If you have multiple time series, otherwise use a constant
      timestamp_column="date",
  )

  ts_data = ts_data.convert_frequency(freq="B")
  return ts_data



def train_test_split(df):
  train_data, test_data = ts_data.train_test_split(prediction_length)
  return train_data, test_data



def get_scores_matrix(frame, predictor, train_data, test_data):
  models = predictor.leaderboard(test_data)['model'].values
  scores = {'WAPE': [], 'MAPE': [], 'MAE': []}
  y_hats = {}
  for m in models:
    predictions = predictor.predict(
        train_data,
        predictor.make_future_data_frame(train_data),
        model = m
        )

    mape = mean_absolute_percentage_error(frame.tail(SHIFT)['close'], predictions['mean'])
    wape = np.average(np.abs(frame.tail(SHIFT)['close'].values - predictions['mean'].values) / frame.tail(SHIFT)['close'].values)
    mae = mean_absolute_error(frame.tail(SHIFT)['close'], predictions['mean'])

    scores['MAPE'].append(mape)
    scores['WAPE'].append(wape)
    scores['MAE'].append(mae)
    y_hats[m] = predictions['mean'].values

  metrics = pd.DataFrame(scores, index=models)
  return metrics, y_hats

In [None]:
all_scores = []
all_predictions = {}
for key in frames.keys():
  print(f'Testing for Period: {key}')

  ts_data = prepare_ts_frame(frames[key], ticker='SPY')

  train_data, test_data = ts_data.train_test_split(prediction_length)

  predictor = TimeSeriesPredictor(
      prediction_length=prediction_length,
      target=target,
      # known_covariates_names=features,
      freq= "B" ,
      eval_metric = EVAL_METRIC
  ).fit(
      train_data,
      hyperparameters=hyperparameters,
      enable_ensemble=False,
      time_limit=TIME_LIMIT,
      verbosity=1
  )

  scores, predictions = get_scores_matrix(frames[key], predictor, train_data, test_data)
  scores['period'] = key
  display(scores)
  all_scores.append(scores)
  all_predictions[key] = predictions


Testing for Period: L5Y


Unnamed: 0,WAPE,MAPE,MAE,period
AutoARIMA,0.020763,0.020763,11.925995,L5Y
ChronosZeroShot[bolt_small],0.02407,0.02407,13.816266,L5Y
ChronosFT_lr1e-7_steps500[bolt_base],0.024214,0.024214,13.887256,L5Y
ChronosFT_lr1e-6_steps200[bolt_base],0.024943,0.024943,14.297986,L5Y
ChronosFT_lr1e-6_steps400[bolt_small],0.026371,0.026371,15.112329,L5Y
ChronosZeroShot[bolt_base],0.026763,0.026763,15.329431,L5Y
ChronosFT_lr5e-6_steps300[bolt_base],0.027496,0.027496,15.734125,L5Y


Testing for Period: L10Y


Unnamed: 0,WAPE,MAPE,MAE,period
ChronosFT_lr1e-6_steps200[bolt_base],0.014926,0.014926,8.606885,L10Y
ChronosFT_lr1e-7_steps500[bolt_base],0.018153,0.018153,10.450104,L10Y
AutoARIMA,0.019252,0.019252,11.064436,L10Y
ChronosFT_lr5e-6_steps300[bolt_base],0.02119,0.02119,12.158154,L10Y
ChronosFT_lr1e-6_steps400[bolt_small],0.021893,0.021893,12.569562,L10Y
ChronosZeroShot[bolt_small],0.023035,0.023035,13.226172,L10Y
ChronosZeroShot[bolt_base],0.026408,0.026408,15.131781,L10Y


Testing for Period: L15Y


In [None]:
metrics = pd.concat(all_scores).reset_index()

metrics.to_csv('/content/drive/MyDrive/thesis_nova_2025/metrics_ch_bolt_0610_10d.csv')

metrics