In [None]:
# # Toggle ChatML for ART calls (Qwen)
# import os

# # Set to True to enable ChatML formatting in the ART call path, False to disable
# ART_USE_CHATML = True
# os.environ["ART_USE_CHATML"] = "1" if ART_USE_CHATML else "0"
# print("ART_USE_CHATML =", os.environ["ART_USE_CHATML"]) 


# Finance Autocomplete RL Training

This notebook trains a small language model to perform financial data autocomplete using reinforcement learning with tool calls.

The model learns to:
1. Use financial tools (search, calculate) to retrieve and compute data
2. Complete text with accurate financial information
3. Determine when no completion is needed

Rewards combine LLM-as-judge correctness with tool-use bonuses (used search, required lookup coverage, and per-dimension ticker/metric/period correctness).

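Training uses the ART (Agent Reinforcement Trainer) framework from OpenPipe, which optimizes the policy with GRPO (Group Relative Policy Optimization) over groups of rollouts for the same case.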

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

import psutil

ram_gb = psutil.virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

In [None]:
%%capture
# Install required packages (all output is swallowed by the %%capture cell magic,
# so installation errors are not shown in the cell output).
# openpipe-art is pinned to an exact version and gql is constrained below 4;
# the packages on the second line are installed without version pins.
!uv pip install openpipe-art==0.3.11.post3 "gql<4" --prerelease allow --no-cache-dir
!uv pip install openpipe aiosqlite httpx tqdm python-dotenv

In [None]:
# Clone the repository to get the Python modules.
# stderr is discarded and `|| true` forces a zero exit status, so a failed clone
# (for example because the target directory already exists) is silent here.
!git clone https://github.com/tmychow/financial-autocomplete.git /content/financial-autocomplete 2>/dev/null || true
import sys
# Put the checkout first on the import path.
# NOTE(review): presumably this is where the `database`, `synthetic`, `environment`,
# `agent`, `rewards` and `rollout` modules imported later come from — confirm.
sys.path.insert(0, '/content/financial-autocomplete')

In [None]:
import os
from dotenv import load_dotenv

# Load .env file if it exists
load_dotenv()

# Optional (recommended): OpenAI API key for LLM-as-judge evaluation.
# When it is missing this cell only prints a warning and continues; USE_JUDGE in
# the configuration cell is derived from whether this value is non-empty.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")  # @param {type:"string"}
if OPENAI_API_KEY:
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
else:
    print("Warning: No OpenAI API key provided. Judge evaluation will fall back to simple string matching.")

# Required: Tiingo API for financial data (the cell raises if it is missing)
TIINGO_API_KEY = os.getenv("TIINGO_API_KEY", "")  # @param {type:"string"}
if TIINGO_API_KEY:
    os.environ["TIINGO_API_KEY"] = TIINGO_API_KEY
else:
    raise ValueError("TIINGO_API_KEY is required. Get a free key at https://api.tiingo.com")

# Optional: Weights & Biases for metrics tracking
# (only exported to the environment when a value is provided)
WANDB_API_KEY = os.getenv("WANDB_API_KEY", "")  # @param {type:"string"}
if WANDB_API_KEY:
    os.environ["WANDB_API_KEY"] = WANDB_API_KEY

In [None]:
# Training configuration
# (the trailing `# @param {...}` comments are Colab form annotations; keep them on
# the same line as the assignment they describe)
NUM_STEPS = 80  # @param {type:"integer"}
NUM_CASES_PER_STEP = 20  # @param {type:"integer"}
NUM_ROLLOUTS_PER_CASE = 5  # @param {type:"integer"}
NUM_VALIDATION_CASES = 50  # @param {type:"integer"}
# Validation runs on steps where `step % VALIDATION_FREQUENCY == 0` (see the training loop).
VALIDATION_FREQUENCY = 5  # @param {type:"integer"}

# Model configuration
# MODEL_NAME is the name the trainable model is registered under; BASE_MODEL_NAME is
# passed as its base model; VAL_BENCHMARK_MODEL is the model validation compares against.
MODEL_NAME = "finance_autocomplete_model_v14"  # @param {type:"string"}
BASE_MODEL_NAME = "Qwen/Qwen2.5-3B-Instruct"  # @param {type:"string"}
VAL_BENCHMARK_MODEL = "gpt-4.1-mini"  # @param {type:"string"}

# Define baseline models to compare
# Each entry is a (model_id, model_name) pair, unpacked in that order by the benchmark cell.
baseline_models = [
    ("gpt-4.1-mini", "gpt-4.1-mini"),
]

# Training hyperparameters
# Both values are forwarded to art.TrainConfig in the training loop.
# NOTE(review): BETA is presumably the KL-penalty coefficient — confirm against the ART docs.
LEARNING_RATE = 5e-5  # @param {type:"number"}
BETA = 0.0  # @param {type:"number"}

# Reward configuration
USE_JUDGE = OPENAI_API_KEY != ""  # Use LLM judge if API key provided
JUDGE_MODEL = os.getenv("JUDGE_MODEL", "gpt-4.1")  # @param {type:"string"}
# Exported so code that reads JUDGE_MODEL from the environment sees the same value.
os.environ["JUDGE_MODEL"] = JUDGE_MODEL

# Logging configuration
# (`os` is already imported by the API-key cell; the repeated import is harmless)
import os
import time
LOG_DIR = "./logs"
os.makedirs(LOG_DIR, exist_ok=True)
# One timestamp per execution of this cell, so each run gets its own pair of log files.
RUN_TS = time.strftime("%Y%m%d_%H%M%S")
TRAIN_LOG_PATH = os.path.join(LOG_DIR, f"train_{RUN_TS}.jsonl")
VAL_LOG_PATH = os.path.join(LOG_DIR, f"validation_{RUN_TS}.jsonl")
print(f"Logs will be written to:\n  {TRAIN_LOG_PATH}\n  {VAL_LOG_PATH}")

# Echo the effective configuration for the run record.
print(f"Configuration:")
print(f"  Training steps: {NUM_STEPS}")
print(f"  Cases per step: {NUM_CASES_PER_STEP}")
print(f"  Rollouts per case: {NUM_ROLLOUTS_PER_CASE}")
print(f"  Total trajectories per step: {NUM_CASES_PER_STEP * NUM_ROLLOUTS_PER_CASE}")
print(f"  Using LLM judge: {USE_JUDGE}")
print(f"  Judge model: {JUDGE_MODEL}")
print(f"  Using Tiingo data: {bool(TIINGO_API_KEY)}")

In [None]:
import os
# Export the per-run log paths through the environment.
# NOTE(review): presumably read by the project modules as a fallback when no explicit
# log_path argument is passed — confirm against the rollout module.
os.environ["TRAIN_LOG_PATH"] = TRAIN_LOG_PATH
os.environ["VAL_LOG_PATH"] = VAL_LOG_PATH


In [None]:
# Import all modules
import asyncio
import time
from typing import List, Dict, Any
from tqdm.asyncio import tqdm_asyncio

import art

# Import our finance modules
from database import setup_database, get_tickers_with_data
from synthetic import generate_cases, generate_training_data
from environment import FinancialEnvironment
from agent import AutocompleteAgent
from rewards import calculate_reward
from rollout import (
    run_single_rollout,
    conduct_rollouts,
    run_validation
)

# Reward weighting defaults: a value already present in the environment wins,
# otherwise the default below is exported.
os.environ.setdefault("W_USED_SEARCH", "0.1")
os.environ.setdefault("W_TICKER", "0.0667")
os.environ.setdefault("W_METRIC", "0.0667")
os.environ.setdefault("W_PERIOD", "0.0666")

## Initialize Database

In [None]:
# Setup the financial database with Tiingo data
await setup_database()  # Uses TIINGO_API_KEY from environment

# Verify data: list the tickers the database reports having data for, as a quick
# sanity check before generating cases.
tickers = await get_tickers_with_data()
print(f"\nDatabase contains data for {len(tickers)} tickers: {tickers}")

## Generate Test Data

In [None]:
# Generate some sample test cases to verify everything works
sample_cases = await generate_cases(5)
print("Sample test cases:")
for i, case in enumerate(sample_cases, 1):
    print(f"\n{i}. Input: {case['input']}")
    print(f"   Expected: {case['ground_truth']}")

## Create Models

In [None]:
# Create the trainable model
from art.local import LocalBackend

model = art.TrainableModel(
    name=MODEL_NAME,
    project="finance-autocomplete-rl",
    base_model=BASE_MODEL_NAME,
)

# Create benchmark model for validation
benchmark_model = art.Model(
    name=VAL_BENCHMARK_MODEL,
    project="finance-autocomplete-rl",
    inference_model_name=VAL_BENCHMARK_MODEL,
    inference_api_key=os.getenv("OPENAI_API_KEY"),
    inference_base_url="https://api.openai.com/v1",
)

# Setup backend
backend = LocalBackend(path="./.art")

# Register models
await model.register(backend)
await benchmark_model.register(backend)

print(f"Models registered:")
print(f"  Training model: {MODEL_NAME}")
print(f"  Benchmark model: {VAL_BENCHMARK_MODEL}")

## Test Single Rollout

Before training, let's test a single rollout to ensure everything works.

In [None]:
from textwrap import fill

def _iter_turns(traj, include_system=False):
    # Support either a list or an object with .messages_and_choices
    items = getattr(traj, "messages_and_choices", traj)

    for it in items:
        # Plain dict like {"role": "...", "content": "..."}
        if isinstance(it, dict) and "role" in it and "content" in it:
            role, content = it["role"], it.get("content", "")
        # OpenAI-style Choice with .message.role / .message.content
        elif hasattr(it, "message") and hasattr(it.message, "content"):
            role = getattr(it.message, "role", "assistant")
            content = it.message.content or ""
        # Fallback
        else:
            role, content = "other", str(it)

        if (role == "system" and not include_system):
            continue
        yield role, content

def print_conversation(traj, width=100, truncate=0, include_system=False):
    """Pretty-print the turns of a trajectory, one numbered block per turn.

    Args:
        traj: Trajectory accepted by ``_iter_turns`` (a list of turns or an
            object with ``.messages_and_choices``).
        width: Maximum line width used when wrapping each line of content.
        truncate: If non-zero, content longer than this many characters is cut
            and suffixed with a truncation marker.
        include_system: Forwarded to ``_iter_turns``; when True, system turns
            are printed as well. Defaults to False.
    """
    for i, (role, content) in enumerate(_iter_turns(traj, include_system), 1):
        # Content can be None (e.g. tool-call-only messages) or non-string;
        # coerce before calling string methods so printing never crashes.
        text = ("" if content is None else str(content)).strip()
        if truncate and len(text) > truncate:
            text = text[:truncate].rstrip() + " …[truncated]"
        print(f"{i:02d} | {str(role).upper()}:")
        # Keep existing newlines, wrap each line for readability
        for line in text.splitlines() or [""]:
            print(fill(line, width=width, subsequent_indent="    "))
        print()

In [None]:
# Test a single rollout
test_case = sample_cases[0]
print(f"Testing rollout with: {test_case['input']}")
print(f"Expected: {test_case['ground_truth']}")

result = await run_single_rollout(
    model=model,
    test_case=test_case,
    rollout_id=0,
    step=0,
    use_judge=USE_JUDGE
)

if result["success"]:
    print(f"\nPrediction: {result['completion']}")
    print(f"Reward: {result['reward_info']['total_reward']:.3f}")
    print(f"  - Correctness: {result['reward_info']['correctness_score']:.3f}")
    print(f"Tool calls: {result['episode_info']['tool_calls_count']}")
    traj = result['trajectory']
    print_conversation(traj)
else:
    print(f"Error: {result.get('error')}")

## Training Loop

In [None]:
# Main training loop
for step in range(await model.get_step(), NUM_STEPS):
    print(f"\n{'='*60}")
    print(f"Step {step}/{NUM_STEPS}")
    print(f"{'='*60}")

    # Determine curriculum stage from step (override with CURRICULUM_STAGE if set)
    curriculum_stage = 0 if step < 20 else (1 if step < 40 else (2 if step < 60 else 3))
    print(f"Curriculum stage: {curriculum_stage}")

    # Generate training cases for this step using curriculum
    train_cases = await generate_cases(NUM_CASES_PER_STEP, curriculum_stage=curriculum_stage)

    # Conduct rollouts
    trajectory_groups = await conduct_rollouts(
        model=model,
        test_cases=train_cases,
        num_rollouts_per_case=NUM_ROLLOUTS_PER_CASE,
        step=step,
        use_judge=USE_JUDGE,
        judge_model=JUDGE_MODEL,
        log_path=TRAIN_LOG_PATH,
    )
    
    # Calculate training metrics
    if trajectory_groups:
        total_trajectories = sum(len(tg.trajectories) for tg in trajectory_groups)
        avg_reward = sum(t.reward for tg in trajectory_groups for t in tg.trajectories) / total_trajectories
        avg_correct = sum(t.metrics.get("is_correct", 0) for tg in trajectory_groups for t in tg.trajectories) / total_trajectories
        
        print(f"Training metrics:")
        print(f"  Trajectories: {total_trajectories}")
        print(f"  Avg reward: {avg_reward:.3f}")
        print(f"  Accuracy: {avg_correct:.1%}")
    
    # Run validation periodically
    if step % VALIDATION_FREQUENCY == 0 and USE_JUDGE:
        print(f"\nRunning validation...")
        val_trajectories = await run_validation(
            my_model=model,
            benchmark_model=benchmark_model,
            num_validation_cases=NUM_VALIDATION_CASES,
            step=step,
            use_judge=USE_JUDGE,
            judge_model=JUDGE_MODEL,
            log_path=VAL_LOG_PATH,
        )
        
        if val_trajectories:
            win_rate = sum(t.reward for t in val_trajectories) / len(val_trajectories)
            print(f"Validation win rate vs {VAL_BENCHMARK_MODEL}: {win_rate:.1%}")
            
            # Log validation trajectories
            await model.log(val_trajectories)
    
    # Train the model
    if trajectory_groups:
        await model.train(
            trajectory_groups=trajectory_groups,
            config=art.TrainConfig(
                learning_rate=LEARNING_RATE,
                beta=BETA
            )
        )
        
        # Clean up old checkpoints
        await model.delete_checkpoints()
        
        print(f"✓ Step {step} completed")
    else:
        print(f"⚠ No trajectories generated for step {step}")

print(f"\n{'='*60}")
print(f"Training completed!")
print(f"{'='*60}")

## Benchmark Against Other Models

In [None]:
# Compare final model against various baselines
if USE_JUDGE:
    print("Benchmarking against baseline models...")
    
    results = {}
    
    for model_id, model_name in baseline_models:
        baseline_model = art.Model(
            name=model_id,
            project="finance-autocomplete-rl",
            inference_model_name=model_name,
            inference_api_key=os.getenv("OPENAI_API_KEY"),
            inference_base_url="https://api.openai.com/v1",
        )
        
        await baseline_model.register(backend)
        
        # Run evaluation
        val_trajectories = await run_validation(
            my_model=baseline_model,
            benchmark_model=benchmark_model,
            num_validation_cases=NUM_VALIDATION_CASES,
            step=0,
            use_judge=USE_JUDGE,
            judge_model=JUDGE_MODEL,
        )
        
        if val_trajectories:
            win_rate = sum(t.reward for t in val_trajectories) / len(val_trajectories)
            avg_reward = sum(t.metrics.get("my_reward", 0) for t in val_trajectories) / len(val_trajectories)
            results[model_id] = {"win_rate": win_rate, "avg_reward": avg_reward}
            
            # Log for comparison
            await baseline_model.log(val_trajectories)
    
    # Display results
    print("\nBenchmark Results:")
    print(f"{'Model':<20} {'Win Rate':<15} {'Avg Reward':<15}")
    print("-" * 50)
    for model_id, metrics in results.items():
        print(f"{model_id:<20} {metrics['win_rate']:.1%}{'':.<8} {metrics['avg_reward']:.3f}")

## Visualization

In [None]:
# Load and visualize training progress
from art.utils.benchmarking.load_trajectories import load_trajectories
from art.utils.benchmarking.charts import training_progress_chart
from art.utils.benchmarking.types import BenchmarkModelKey

# Load trajectories logged for the trained model from the local ART directory
# (same project name and ./.art path used when the models were created).
df = await load_trajectories(
    project_name="finance-autocomplete-rl",
    models=[MODEL_NAME],
    art_path="./.art",
)

# Plot win rate over time (skipped when nothing was loaded or the judge was disabled)
if not df.empty and USE_JUDGE:
    # NOTE(review): the third argument "val" presumably selects the validation split — confirm.
    models_to_plot = [
        BenchmarkModelKey(MODEL_NAME, MODEL_NAME, "val"),
    ]
    
    # Add baseline models if they were evaluated.
    # `results` only exists if the benchmark cell ran with USE_JUDGE enabled; at notebook
    # top level locals() is the global namespace, so this checks for that variable.
    if 'results' in locals():
        for model_id, _ in baseline_models:
            if model_id in results:
                models_to_plot.append(BenchmarkModelKey(model_id, model_id.upper(), "val"))
    
    chart = training_progress_chart(
        df,
        "win_rate",
        models=models_to_plot,
        title="Win Rate vs Benchmark Over Time",
        y_label="Win Rate",
    )
    
    # Persist the figure next to the notebook.
    chart.savefig("finance_autocomplete_training.png")
    print("Training chart saved to finance_autocomplete_training.png")

## Test Final Model

In [None]:
# Test the final trained model on some examples
print("Testing final model on sample inputs...\n")

test_inputs = [
    "Apple's revenue in 2023 was ",
    "The gross margin for Microsoft in 2023Q4 was ",
    "Google's market cap in 2023Q4 was ",
    "The debt to equity ratio for Apple in 2023FY was ",
    "The CFO mentioned that ",
]

agent = AutocompleteAgent(model=model)

for input_text in test_inputs:
    print(f"Input: {input_text}")
    
    completion, tool_calls, info = await agent.get_completion(input_text)
    
    print(f"Completion: {completion}")
    print(f"Tool calls: {info['tool_calls_count']}")
    
    if tool_calls and info['tool_calls_count'] <= 10:
        print("Tool sequence:")
        for tc in tool_calls[:10]:  # Show first 10 tools
            args_str = ", ".join(f"{k}={v}" for k, v in tc.get('arguments', {}).items())
            print(f"  - {tc['tool']}({args_str})")
    
    print()

In [None]:
# Save training configuration and results
import json

# Snapshot of the run configuration, written next to the notebook for reproducibility.
training_info = {
    "model_name": MODEL_NAME,
    "base_model": BASE_MODEL_NAME,
    "num_steps": NUM_STEPS,
    "cases_per_step": NUM_CASES_PER_STEP,
    "rollouts_per_case": NUM_ROLLOUTS_PER_CASE,
    "learning_rate": LEARNING_RATE,
    "beta": BETA,
    "use_judge": USE_JUDGE,
    "judge_model": JUDGE_MODEL,
}

# Explicit encoding so the file is written identically regardless of platform locale.
with open("training_info.json", "w", encoding="utf-8") as f:
    json.dump(training_info, f, indent=2)

print("Training information saved to training_info.json")
# The ART local backend stores models under <art_path>/<project>/models/<name>;
# the previous message omitted the project and "models" path segments.
print(f"\nModel checkpoint available at: ./.art/finance-autocomplete-rl/models/{MODEL_NAME}")