In [10]:
%load_ext autoreload
%autoreload 2

# 01 — BTC Return Forecasting: Quickstart Walk-Forward

This notebook creates synthetic demo data if the BTC price CSV is missing, runs the scaffolded **walk-forward** pipeline, and visualizes the results.

In [14]:
import os, sys, yaml, pandas as pd, numpy as np
from pathlib import Path

# The directory the kernel was started in is treated as the project root;
# every other path in this notebook is built relative to it.
PROJECT_ROOT: Path = Path.cwd()
print(f'PROJECT_ROOT: {PROJECT_ROOT}')


PROJECT_ROOT: /Users/qinli/Documents/GitHub/BTC_price_prediction


In [16]:
# Put the project root at the front of sys.path so the `src` package resolves.
# The guard keeps the cell idempotent: re-running it no longer stacks another
# copy of the same entry on sys.path, while the root still ends up first.
_project_root_str = str(PROJECT_ROOT)
if not sys.path or sys.path[0] != _project_root_str:
    sys.path.insert(0, _project_root_str)
from src.pipeline import run_pipeline
from src.utils import ensure_dir


## Load/prepare data

In [23]:
# Project-local data module, aliased as dataLib for the optional check below.
import src.data as dataLib

# Commented-out manual check: load the CSV directly through dataLib.load_main and preview it.
#df2 = dataLib.load_main('data/btc_daily.csv', timestamp_col = 'timestamp', price_col = 'price')
#df2.head()

In [50]:
# Load the experiment configuration (plain YAML, parsed with the safe loader).
cfg_path = PROJECT_ROOT / 'configs' / 'example_daily.yaml'
with open(cfg_path, 'r', encoding='utf-8') as f:
    cfg = yaml.safe_load(f)

# The price CSV location comes from the config and is resolved against the project root.
csv_path = PROJECT_ROOT / cfg['data']['main_csv']
csv_path.parent.mkdir(parents=True, exist_ok=True)

if not csv_path.exists():
    print('Creating synthetic BTC demo data at:', csv_path)
    # Use the column names from the config so the generated file matches what
    # the rest of the config describes; fall back to the previous hard-coded names.
    ts_col = cfg['data'].get('timestamp_col', 'timestamp')
    px_col = cfg['data'].get('price_col', 'price')
    # Seed a local generator from the config so the synthetic series is
    # reproducible across kernel restarts (the global RNG was unseeded before).
    rng = np.random.default_rng((cfg.get('run') or {}).get('seed', 42))
    dates = pd.date_range('2022-01-01', periods=900, freq='D', tz='UTC')
    # Gaussian random walk around 40000 with a per-step scale of 200.
    price = 40000 + np.cumsum(rng.standard_normal(len(dates)) * 200)
    # `dates` is already in UTC, so it can be formatted directly.
    pd.DataFrame({ts_col: dates.strftime('%Y-%m-%d'), px_col: price}).to_csv(csv_path, index=False)
else:
    print('Found BTC CSV at:', csv_path)

# Preview the first rows of whichever file is now on disk.
pd.read_csv(csv_path).head()

Found BTC CSV at: /Users/qinli/Documents/GitHub/BTC_price_prediction/data/btc_daily.csv


Unnamed: 0,timestamp,price
0,2012-01-02,5.0
1,2012-01-03,5.29
2,2012-01-04,5.57
3,2012-01-05,6.42
4,2012-01-06,6.4


## Tweak experiment budget for faster demo

In [53]:
# Override the walk-forward window settings in one go: the train window is
# kept at no less than 300, while val, test and step are all fixed at 100.
cfg['windows'].update({
    'train': max(300, cfg['windows']['train']),
    'val': 100,
    'test': 100,
    'step': 100,
})
# Set the tuning `n_candidates` value to 5 for the demo.
cfg['tuning']['n_candidates'] = 5


In [55]:
# Display the configuration dict as it stands after the overrides in the previous cell.
cfg

{'run': {'run_name': 'daily_demo', 'seed': 42, 'device': 'auto'},
 'data': {'main_csv': 'data/btc_daily.csv',
  'timestamp_col': 'timestamp',
  'price_col': 'price',
  'freq': 'D',
  'log_return': True,
  'target_horizon': 1,
  'tz': 'UTC'},
 'features': {'lags': [1, 2, 3, 5, 10, 20],
  'sma_windows': [5, 10, 20, 50],
  'ema_windows': [5, 10, 20],
  'rsi_windows': [14],
  'macd': {'fast:12': None, 'slow:26': None, 'signal:9': None},
  'vol_windows': [5, 10, 20],
  'dropna_after_build': True},
 'exogenous': {'enabled': False, 'assets': []},
 'windows': {'train': 500, 'val': 100, 'test': 100, 'step': 100},
 'models': {'classical': [{'name': 'ridge',
    'params': {'alpha': [0.1, 1.0, 10.0, 100.0]}},
   {'name': 'random_forest',
    'params': {'n_estimators': [200, 400, 800],
     'max_depth': [5, 10, 20, None],
     'min_samples_leaf': [1, 2, 4]}},
   {'name': 'xgboost',
    'params': {'n_estimators': [300, 600, 900],
     'max_depth': [3, 5, 8],
     'learning_rate': [0.01, 0.05, 0.1],


## Run pipeline

In [64]:
# NOTE(review): both names are already imported in an earlier cell, and
# `ensure_dir` is not used in any visible cell — confirm whether it is still needed.
from src.pipeline import run_pipeline
from src.utils import ensure_dir

# Run the pipeline with the in-memory config; the call is unpacked into the
# results location and the leaderboard table.
# NOTE(review): the saved output of this cell ends in a ValueError raised while
# the lstm model is running, in which case neither name gets assigned — the
# cause is presumably inside src.pipeline; verify there.
out_dir, leaderboard = run_pipeline(cfg)
print('Results folder:', out_dir)
# Show the first ten leaderboard rows.
leaderboard.head(10)

current model is ridge
{'r2': -2.0679900064468058, 'mse': 0.0045625931124115005, 'mae': 0.03246287925893428, 'directional': 0.48225}
current model is random_forest


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

{'r2': -0.42419892640640855, 'mse': 0.00211801218343975, 'mae': 0.02911404996135549, 'directional': 0.4755}
current model is xgboost
{'r2': -2.2082378154539644, 'mse': 0.004771164094084273, 'mae': 0.036414096335298844, 'directional': 0.47625}
current model is lstm


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


ValueError: Found input variables with inconsistent numbers of samples: [68, 36]

In [None]:
# Test individual models



In [15]:
# Display the whole leaderboard table returned by the pipeline run.
leaderboard

Unnamed: 0,model
0,ridge
1,random_forest
2,xgboost
3,lstm
4,transformer


## Visualize predictions for top model

In [17]:
import pandas as pd
import matplotlib.pyplot as plt

# Requires `leaderboard` and `out_dir` from the "Run pipeline" cell; run that
# cell first, otherwise this one fails with a NameError.
# The top model is the leaderboard row with the highest 'r2' value.
best = leaderboard.sort_values('r2', ascending=False).iloc[0]['model']
print('Top model:', best)

# Read that model's predictions file from the results folder and drop
# incomplete rows before plotting.
pred_csv = os.path.join(out_dir, f'predictions_{best}.csv')
pred = pd.read_csv(pred_csv, parse_dates=['timestamp']).dropna()

# Explicit figure/axes instead of the implicit pyplot state machine.
fig, ax = plt.subplots()
ax.plot(pred['timestamp'], pred['y_true'], label='true')
ax.plot(pred['timestamp'], pred['y_pred'], label='pred')
ax.set_title(f'Pred vs True — {best}')
ax.set_xlabel('Time')
# Label the y-axis so the figure stands on its own, as the expanding R² plot does.
# NOTE(review): the exact target definition lives in the pipeline — adjust the label if needed.
ax.set_ylabel('Target value')
ax.legend()
fig.tight_layout()
plt.show()

NameError: name 'leaderboard' is not defined

## Expanding R² curve

In [None]:
from src.evaluation import expanding_r2
import matplotlib.pyplot as plt

# Feed the true and predicted columns of the predictions frame, as plain
# arrays, to the project's expanding_r2 helper.
r2c = expanding_r2(pred['y_true'].values, pred['y_pred'].values)

# Draw the resulting curve on an explicit figure/axes pair.
fig, ax = plt.subplots()
ax.plot(r2c)
ax.set(
    title=f'Expanding R² — {best}',
    xlabel='Test observations',
    ylabel='R² (expanding)',
)
fig.tight_layout()
plt.show()