# 🤖 Frontier Model Evaluation for Electronics Price Prediction

## 📊 Comparing the Accuracy of Cutting-Edge Language Models

In this notebook, we evaluate the performance of advanced Frontier models on the task of predicting electronics prices based on structured prompt inputs.  
The models tested include:

- 🔹 GPT-4o-mini  
- 🔹 GPT-4o  
- 🔹 Claude 3.7 Sonnet  
- 🔹 Google Gemini 1.5 Flash  
- 🔹 Google Gemini 2.0 Flash  
- 🔹 DeepSeek V3

Each model receives a consistent system prompt and is tested on the same input format to ensure fair comparison.  
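Performance is assessed using metrics such as average absolute error, RMSLE (root mean squared logarithmic error), and hit rate (the share of predictions within $40 or 20% of the true price).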

---


### ⚙️ Runtime Info

This notebook runs on a **simple CPU runtime**, as it only queries external APIs and doesn't require GPU acceleration.

---

### ⚠️ Important Notes

It's important to appreciate that **we are not training the Frontier models**.  
We're only providing them with the **test dataset** to observe how well they perform.  
They do **not** receive the 400,000 training examples we used for the traditional ML models.

**That said...**

Given the massive scale of their training data, it's entirely possible these models have already seen some (or even all) of the products from both the training and test sets.  
This could lead to **test contamination**, giving them an **unfair advantage**.  
We should keep this possibility in mind when interpreting the results.

As noted above, this notebook runs on a simple CPU runtime.


In [None]:
!pip install -q datasets anthropic

In [None]:
# imports

import os
import re
import math
import json
import random
from huggingface_hub import login
import matplotlib.pyplot as plt
import numpy as np
import pickle
from collections import Counter
from openai import OpenAI
from anthropic import Anthropic
from google.colab import userdata
from datasets import load_dataset, Dataset, DatasetDict

In [None]:
# ANSI escape sequences used to color the per-item lines printed to stdout
RESET = "\033[0m"
RED = "\033[91m"
GREEN = "\033[92m"
YELLOW = "\033[93m"
# Maps a result color name to its escape sequence; "orange" results are printed in yellow
COLOR_MAP = dict(red=RED, orange=YELLOW, green=GREEN)

# Hugging Face dataset location
HF_USER = "vassilis19"  # your HF name here! Or use mine if you just want to reproduce my results.
REVISION = "701eba81570388cfd60924c6fe144b27491a9ec0"  # dataset revision to load
DATASET_NAME = "/".join((HF_USER, "pricer-electronics-data"))

In [None]:
# 🔐 Load secret API keys from Colab's secret store (they must be added via the 🔑 "Secrets" panel).
# Each value is looked up by the secret name passed to userdata.get().
hf_token = userdata.get('HF_TOKEN')                    # Hugging Face token, passed to login() below
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')        # used to build the OpenAI client
ANTHROPIC_API_KEY = userdata.get('ANTHROPIC_API_KEY')  # used to build the Anthropic client
google_api_key = userdata.get('GOOGLE_API_KEY')        # used to build the Gemini client
deepseek_api_key = userdata.get('DEEPSEEK_API_KEY')    # used to build the DeepSeek client

# Authenticate this session with the Hugging Face Hub so the dataset can be downloaded
login(hf_token, add_to_git_credential=True)

In [None]:
# 🔗 Create one API client per provider.
# OpenAI and Anthropic use their own SDK clients. Gemini and DeepSeek are reached through the
# OpenAI SDK by pointing base_url at each provider's OpenAI-compatible endpoint.
openai = OpenAI(api_key= OPENAI_API_KEY)
claude = Anthropic(api_key = ANTHROPIC_API_KEY)
gemini_via_openai_client = OpenAI(                                   # Google Gemini (via OpenAI-compatible client)
    api_key=google_api_key,
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/"
)
deepseek_via_openai_client = OpenAI(                                 # DeepSeek (via OpenAI-compatible client)
    api_key=deepseek_api_key,
    base_url="https://api.deepseek.com"
)

In [None]:
# 📥 Load the dataset from the Hugging Face Hub (pinned to a specific revision for reproducibility)
dataset = load_dataset(DATASET_NAME, revision = REVISION)
train = dataset['train']  # training split; not used by the evaluation cells shown below
test = dataset['test']    # test split; this is what every Tester.test(...) call below evaluates on

📊 Tester class for evaluating price prediction models. Accepts a prediction function and a dataset, runs evaluation over a subset of items, calculates error metrics (absolute error, RMSLE), and visualizes predictions vs actual prices:


In [None]:
class Tester:
    """Evaluate a price-prediction function against a labelled dataset.

    The predictor is called with each datapoint's ``"text"`` and its numeric
    result is compared with the datapoint's ``"price"``. Every result is
    printed as one colored line, and ``report`` summarizes the run as average
    absolute error, RMSLE and hit rate (share of "green" results) in the title
    of a scatter plot of predictions against ground truth.
    """

    def __init__(self, predictor, data, title=None, size=250):
        """Store the configuration and start with empty result lists.

        Args:
            predictor: Callable taking a datapoint's text and returning a number.
            data: Indexable collection of mappings with "text" and "price" keys.
            title: Chart title prefix; defaults to a prettified predictor name.
            size: Maximum number of leading datapoints to evaluate.
        """
        self.predictor = predictor
        self.data = data
        # Callables such as functools.partial objects have no __name__
        fallback_name = getattr(predictor, "__name__", type(predictor).__name__)
        self.title = title or fallback_name.replace("_", " ").title()
        # Never plan to evaluate more datapoints than the dataset contains,
        # otherwise run() would index past the end of the data
        try:
            self.size = min(size, len(data))
        except TypeError:
            # data does not support len(); trust the requested size
            self.size = size
        self._reset()

    def _reset(self):
        """Clear all per-datapoint results so a run starts from a clean slate."""
        self.guesses = []  # Model predictions
        self.truths = []   # Ground truth prices
        self.errors = []   # Absolute errors
        self.sles = []     # Squared log errors
        self.colors = []   # Color names used for printing and plotting

    def color_for(self, error, truth):
        """Classify a result as "green", "orange" or "red".

        A result is "green" when the absolute error is below 40 or below 20%
        of the true price, "orange" when below 80 or below 40%, else "red".
        When ``truth`` is 0 the relative error is undefined, so only the
        absolute thresholds apply.
        """
        relative_error = error / truth if truth else float("inf")
        if error < 40 or relative_error < 0.2:
            return "green"
        if error < 80 or relative_error < 0.4:
            return "orange"
        return "red"

    def run_datapoint(self, i):
        """Predict datapoint ``i``, record its metrics and print a colored summary line."""
        datapoint = self.data[i]
        text = datapoint["text"]
        guess = self.predictor(text)  # Run the model
        truth = datapoint["price"]    # True price
        error = abs(guess - truth)    # Absolute error
        # log(x + 1) is undefined for x <= -1, and a predictor may return a
        # negative number; clamp at zero so one bad guess cannot abort the run
        log_error = math.log(max(truth, 0) + 1) - math.log(max(guess, 0) + 1)
        sle = log_error ** 2  # Squared log error
        color = self.color_for(error, truth)
        # Short snippet for display: the second blank-line-separated section of
        # the text, or the first one when the text has no such separator
        sections = text.split("\n\n")
        snippet = sections[1] if len(sections) > 1 else sections[0]
        title = snippet[:20] + "..."
        # Record values for reporting
        self.guesses.append(guess)
        self.truths.append(truth)
        self.errors.append(error)
        self.sles.append(sle)
        self.colors.append(color)
        # Print detailed result for the datapoint
        print(f"{COLOR_MAP[color]}{i+1}: Guess: ${guess:,.2f} Truth: ${truth:,.2f} Error: ${error:,.2f} SLE: {sle:,.2f} Item: {title}{RESET}")

    def chart(self, title):
        """Show a scatter plot of predictions against ground truth with a y = x reference line."""
        if not self.truths:
            # Nothing has been evaluated, so there are no axis limits to derive
            return
        plt.figure(figsize=(12, 8))
        max_val = max(max(self.truths), max(self.guesses))
        plt.plot([0, max_val], [0, max_val], color='deepskyblue', lw=2, alpha=0.6)  # Diagonal line
        plt.scatter(self.truths, self.guesses, s=3, c=self.colors)  # Plot points
        plt.xlabel('Ground Truth')
        plt.ylabel('Model Estimate')
        plt.xlim(0, max_val)
        plt.ylim(0, max_val)
        plt.title(title)
        plt.show()

    def report(self):
        """Compute average error, RMSLE and hit rate, and show them as the chart title."""
        # Average over what was actually evaluated rather than the requested
        # size, so the metrics stay correct after a partial run
        count = len(self.errors)
        if count == 0:
            print(f"{self.title}: no datapoints were evaluated, nothing to report")
            return
        average_error = sum(self.errors) / count
        rmsle = math.sqrt(sum(self.sles) / count)
        hits = self.colors.count("green")
        title = f"{self.title} Error=${average_error:,.2f} RMSLE={rmsle:,.2f} Hits={hits/count*100:.1f}%"
        self.chart(title)

    def run(self):
        """Evaluate the first ``self.size`` datapoints, then report the results."""
        self.error = 0  # not read anywhere in this class; kept for backward compatibility
        # Start from empty result lists so calling run() twice does not mix results
        self._reset()
        for i in range(self.size):
            self.run_datapoint(i)
        self.report()

    @classmethod
    def test(cls, function, data, title=None, size=250):
        """Build a Tester for ``function`` and ``data`` and run it immediately.

        ``title`` and ``size`` are optional and are forwarded to the constructor.
        """
        cls(function, data, title=title, size=size).run()


In [None]:
%matplotlib inline

## First, GPT-4o-mini

It's called mini, but it packs a punch.

In [None]:
# First, let's work on a good prompt for a Frontier model.
# Notice that I'm removing the phrase " to the nearest dollar" from the prompt:
# when we train our own models, we'll need to make the problem as easy as possible,
# but a Frontier model needs no such simplification.


def messages_for(item):
    """Build the chat messages used to ask a model for an item's price.

    Args:
        item: Mapping whose "text" entry holds the full prompt for one product.

    Returns:
        A list of three role/content dicts: the system instruction, the cleaned
        user prompt, and an assistant turn that already starts the answer.
    """
    # Strip the rounding hint and the trailing answer stub from the raw prompt
    prompt = item["text"]
    for fragment in (" to the nearest dollar", "\n\nPrice is $"):
        prompt = prompt.replace(fragment, "")

    turns = (
        ("system", "You estimate prices of items. Reply only with the price, no explanation."),
        ("user", prompt),
        # The answer stub is sent as the start of the assistant's reply —
        # presumably to steer the model towards answering with a bare number
        ("assistant", "Price is $"),
    )
    return [{"role": role, "content": content} for role, content in turns]



In [None]:
# 🧪 Try generating a message prompt for the first test example
# (the returned list of messages is shown as the cell output)
messages_for(test[0])


In [None]:
# A utility function to extract the price from a string

def get_price(s):
    """Extract the first number found in ``s`` as a float.

    Dollar signs and thousands separators are removed before searching.
    A leading sign is only honored on numbers that contain a decimal point;
    for plain integers the sign is skipped and only the digits are used.

    Args:
        s: Text that may contain a price, e.g. a model reply.

    Returns:
        The first number as a float, or 0 when the text contains no digits.
    """
    cleaned = s.replace('$', '').replace(',', '')
    found = re.search(r"[-+]?\d*\.\d+|\d+", cleaned)
    if found is None:
        return 0
    return float(found.group())

In [None]:
# Quick sanity check: the cell output should be 99.99
get_price("The price is roughly $99.99 because blah blah")

In [None]:
def gpt_4o_mini(text):
    """Estimate a price for ``text`` with OpenAI's ``gpt-4o-mini`` model.

    Args:
        text: The full prompt text of one datapoint (its "text" field).

    Returns:
        The number parsed from the model's reply by ``get_price``, or 0.0 when
        the reply is missing or empty.
    """
    response = openai.chat.completions.create(
        model="gpt-4o-mini",
        # messages_for() expects a dict-like item, so wrap the raw string again
        messages=messages_for({"text": text}),
        seed=42,
        max_tokens=5,  # cap the reply at 5 tokens
    )
    reply = response.choices[0].message.content
    # The content can be None; score that as 0.0, as the Gemini helpers do,
    # instead of crashing on None.strip() in the middle of an evaluation run
    return get_price(reply.strip()) if reply else 0.0

In [None]:
# Evaluate GPT-4o-mini on the first 250 items of the test split (Tester's default size).
# Prints one colored line per item, then shows the prediction-vs-truth scatter plot.
Tester.test(gpt_4o_mini, test)

## GPT-4o

In [None]:
def gpt_4o_frontier(text):
    """Estimate a price for ``text`` with OpenAI's ``gpt-4o`` model.

    Args:
        text: The full prompt text of one datapoint (its "text" field).

    Returns:
        The number parsed from the model's reply by ``get_price``, or 0.0 when
        the reply is missing or empty.
    """
    response = openai.chat.completions.create(
        model="gpt-4o",
        # messages_for() expects a dict-like item, so wrap the raw string again
        messages=messages_for({"text": text}),
        seed=42,
        max_tokens=5,  # cap the reply at 5 tokens
    )
    reply = response.choices[0].message.content
    # The content can be None; score that as 0.0, as the Gemini helpers do,
    # instead of crashing on None.strip() in the middle of an evaluation run
    return get_price(reply.strip()) if reply else 0.0

In [None]:
# Evaluate GPT-4o on the first 250 items of the test split (Tester's default size).
# Prints one colored line per item, then shows the prediction-vs-truth scatter plot.
Tester.test(gpt_4o_frontier, test)

## Claude 3.7 Sonnet

In [None]:
def claude_3_point_7_sonnet(text):
    """Estimate a price for ``text`` with Anthropic's Claude 3.7 Sonnet model.

    Args:
        text: The full prompt text of one datapoint (its "text" field).

    Returns:
        The number parsed from the model's reply by ``get_price``, or 0.0 when
        the reply contains no content.
    """
    # messages_for() expects a dict-like item, so wrap the raw string again
    messages = messages_for({"text": text})

    # This call passes the system prompt through the separate ``system``
    # argument, so split it off from the user/assistant messages
    system_message = messages[0]['content']
    chat_messages = messages[1:]

    response = claude.messages.create(
        model="claude-3-7-sonnet-20250219",
        max_tokens=5,  # cap the reply at 5 tokens
        system=system_message,
        messages=chat_messages
    )

    # Guard against an empty content list: score it as 0.0, as the Gemini
    # helpers do for empty replies, instead of raising IndexError mid-run
    blocks = response.content
    reply = blocks[0].text if blocks else ""
    return get_price(reply.strip()) if reply else 0.0


In [None]:
# Evaluate Claude 3.7 Sonnet on the first 250 items of the test split (Tester's default size).
# It cost me about 15 cents to run this (pricing may vary by region).
# You can skip this cell and look at my results instead.

Tester.test(claude_3_point_7_sonnet, test)

## Gemini 1.5 Flash

In [None]:
import time

def gemini_frontier(text):
    """Estimate a price for ``text`` with Google's ``gemini-1.5-flash`` model.

    Sleeps for 4 seconds before every request to stay under the rate limit.

    Args:
        text: The full prompt text of one datapoint (its "text" field).

    Returns:
        The number parsed from the model's reply by ``get_price``, or 0.0 when
        the reply is empty or cannot be read from the response.
    """
    time.sleep(4)  # throttle: wait before each request to stay under rate limit
    completions = gemini_via_openai_client.chat.completions
    response = completions.create(
        model="gemini-1.5-flash",
        # messages_for() expects a dict-like item, so wrap the raw string again
        messages=messages_for({"text": text}),
        max_tokens=5,
    )
    # Only reading and parsing the reply is guarded; an error raised by the
    # request itself still propagates to the caller
    try:
        content = response.choices[0].message.content
        if not content:
            return 0.0
        return get_price(content.strip())
    except Exception as exc:
        print("⚠️ Gemini Error:", exc)
        return 0.0

In [None]:
# Evaluate Gemini 1.5 Flash on the first 250 items of the test split (Tester's default size).
# With the 4-second pause before each request, this takes at least ~17 minutes.
Tester.test(gemini_frontier, test)

## Gemini 2.0 Flash

In [None]:
def gemini_2(text):
    """Estimate a price for ``text`` with Google's ``gemini-2.0-flash-exp`` model.

    Sleeps for 6 seconds before every request to stay under the rate limit.

    Args:
        text: The full prompt text of one datapoint (its "text" field).

    Returns:
        The number parsed from the model's reply by ``get_price``, or 0.0 when
        the reply is empty or cannot be read from the response.
    """
    time.sleep(6)  # throttle: wait before each request to stay under rate limit
    completions = gemini_via_openai_client.chat.completions
    response = completions.create(
        model="gemini-2.0-flash-exp",
        # messages_for() expects a dict-like item, so wrap the raw string again
        messages=messages_for({"text": text}),
        max_tokens=5,
    )
    # Only reading and parsing the reply is guarded; an error raised by the
    # request itself still propagates to the caller
    try:
        content = response.choices[0].message.content
        if not content:
            return 0.0
        return get_price(content.strip())
    except Exception as exc:
        print("⚠️ Gemini Error:", exc)
        return 0.0

In [None]:
# Evaluate Gemini 2.0 Flash on the first 250 items of the test split (Tester's default size).
# With the 6-second pause before each request, this takes at least 25 minutes.
Tester.test(gemini_2, test)

## DeepSeek V3

In [None]:
def deepseek_frontier(text):
    """Estimate a price for ``text`` with DeepSeek's ``deepseek-chat`` model.

    Args:
        text: The full prompt text of one datapoint (its "text" field).

    Returns:
        The number parsed from the model's reply by ``get_price``, or 0.0 when
        the reply is missing or empty.
    """
    response = deepseek_via_openai_client.chat.completions.create(
        model="deepseek-chat",
        # messages_for() expects a dict-like item, so wrap the raw string again
        messages=messages_for({"text": text}),
        seed=42,
        max_tokens=5,  # cap the reply at 5 tokens
    )
    reply = response.choices[0].message.content
    # The content can be None; score that as 0.0, as the Gemini helpers do,
    # instead of crashing on None.strip() in the middle of an evaluation run
    return get_price(reply.strip()) if reply else 0.0

In [None]:
# Evaluate DeepSeek V3 on the first 250 items of the test split (Tester's default size).
# Prints one colored line per item, then shows the prediction-vs-truth scatter plot.
Tester.test(deepseek_frontier, test)