# 🧪 Baseline Model Comparison for Electronics Price Prediction

In this notebook, we compare several baseline models for predicting product prices from prompt-style text inputs.  
The goal is to establish strong non-LLM baselines before evaluating or fine-tuning large language models.

### 🧪 Models Included
- 🎲 **Random Price Predictor**
- 📊 **Constant (Average Price) Predictor**
- 📦 **Bag of Words + Linear Regression**
- 🔠 **Word2Vec + Linear Regression**
- 📈 **Word2Vec + Support Vector Regressor (SVR)**
- 🌲 **Word2Vec + Random Forest Regressor**


Each model is evaluated using:
- Absolute error
- RMSLE (log-based error)
- Hit rate (how often predictions fall within an acceptable error range)

Results are visualized using scatter plots comparing model predictions vs ground truth prices from the test set.

> This provides a clear performance baseline before bringing in LLM-based approaches like LLaMA.


💡 **Tip:** In Colab, selecting a GPU like the **L4** gives you access to a **powerful CPU** as well (e.g., 12 cores).
Even if you don’t need the GPU, the extra CPU power can significantly speed up our experiments.


In [None]:
# Show how many CPU cores this runtime reports (displayed as the cell's output)
import os
os.cpu_count()


In [None]:
!pip install -q datasets

In [None]:
# imports

import math
import json
import random
from huggingface_hub import login
import matplotlib.pyplot as plt
import numpy as np
import pickle
from collections import Counter
from google.colab import userdata
from datasets import load_dataset, Dataset, DatasetDict

In [None]:
# More imports for our traditional machine learning

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

In [None]:
# NLP related imports

from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

In [None]:
# Finally, more imports for more advanced machine learning

from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Constants - ANSI escape sequences used for printing to stdout in color

GREEN, YELLOW, RED = "\033[92m", "\033[93m", "\033[91m"
RESET = "\033[0m"  # switches the terminal back to its default color

# Severity label -> escape sequence (note that "orange" is printed with the yellow code)
COLOR_MAP = dict(red=RED, orange=YELLOW, green=GREEN)

# Your HF name here! Or keep this one if you just want to reproduce the original results.
HF_USER = "vassilis19"

# Dataset repository on the Hub and the exact commit to load
DATASET_NAME = f"{HF_USER}/pricer-electronics-data"
REVISION = "701eba81570388cfd60924c6fe144b27491a9ec0"

In [None]:
# Log in to HuggingFace

# Fetch the value stored under 'HF_TOKEN' via google.colab's userdata helper
hf_token = userdata.get('HF_TOKEN')
# Authenticate with the Hub; add_to_git_credential=True additionally asks
# huggingface_hub to register the token with git's credential store
login(hf_token, add_to_git_credential=True)

In [None]:
%matplotlib inline

📥 Load the dataset from the Hugging Face Hub (specific commit revision for reproducibility)


In [None]:
# Pin the download to REVISION so every run works with the same snapshot of the data
dataset = load_dataset(DATASET_NAME, revision=REVISION)

# Keep handles on the two splits used throughout the notebook
train, test = dataset['train'], dataset['test']

In [None]:
# Remind ourselves what a single test datapoint looks like (shown as the cell's output)

test[0]

In [None]:
# Print the price of the first training datapoint

print(train[0]["price"])

## Unveiling a mighty script that we will use a lot!

A rather pleasing Test Harness that will evaluate any model against the first 250 items (by default) of the Test set

And show us the results in a visually satisfying way.

You write a function of this form (it receives the prompt text of a single item and returns a price estimate):

```
def my_prediction_function(item):
    # my code here
    return my_estimate
```

And then you call:

`Tester.test(my_prediction_function, test_dataset)`

To evaluate your model.

In [None]:
class Tester:
    """Test harness that scores a price predictor against a dataset.

    The predictor is called with each datapoint's ``"text"`` value and must
    return a numeric price estimate, which is compared with the datapoint's
    ``"price"``. Every evaluated item is printed in a color reflecting how far
    off the guess was, and a scatter plot titled with the average absolute
    error, the RMSLE and the hit rate (share of "green" items) is shown at
    the end.
    """

    # Initialize the tester with a predictor function, dataset, optional title, and sample size
    def __init__(self, predictor, data, title=None, size=250):
        self.predictor = predictor
        self.data = data
        # Some callables (e.g. functools.partial objects) have no __name__;
        # fall back to the type name instead of raising AttributeError.
        default_name = getattr(predictor, "__name__", type(predictor).__name__)
        self.title = title or default_name.replace("_", " ").title()
        # Number of datapoints to test; never more than the dataset holds,
        # otherwise run() would index past the end of the data.
        try:
            self.size = min(size, len(data))
        except TypeError:  # data has no len(); trust the requested size
            self.size = size
        self.guesses = []  # Model predictions
        self.truths = []   # Ground truth prices
        self.errors = []   # Absolute errors
        self.sles = []     # Squared log errors
        self.colors = []   # Color codes for visualization

    # Determine color based on error severity for visualization
    def color_for(self, error, truth):
        """Return "green", "orange" or "red" for an absolute ``error`` against ``truth``.

        Green: error below 40 or below 20% of the truth. Orange: error below
        80 or below 40% of the truth. Red: everything else.
        """
        # A zero truth makes the relative error undefined; treat it as
        # infinite so that only the absolute thresholds can match.
        relative_error = error / truth if truth else math.inf
        if error < 40 or relative_error < 0.2:
            return "green"
        if error < 80 or relative_error < 0.4:
            return "orange"
        return "red"

    # Run prediction and error calculation for a single datapoint
    def run_datapoint(self, i):
        """Score datapoint ``i``, record the results and print a one-line summary."""
        datapoint = self.data[i]
        guess = self.predictor(datapoint["text"])  # Run the model
        truth = datapoint["price"]  # True price
        error = abs(guess - truth)  # Absolute error
        # Log error on price + 1; math.log raises ValueError if guess <= -1
        log_error = math.log(truth + 1) - math.log(guess + 1)
        sle = log_error ** 2  # Squared log error
        color = self.color_for(error, truth)  # Color for this point
        # Short title snippet for display: the text after the first blank line,
        # or the start of the text when there is no blank-line separator.
        sections = datapoint["text"].split("\n\n")
        snippet_source = sections[1] if len(sections) > 1 else sections[0]
        title = snippet_source[:20] + "..."
        # Record values for reporting
        self.guesses.append(guess)
        self.truths.append(truth)
        self.errors.append(error)
        self.sles.append(sle)
        self.colors.append(color)
        # Print detailed result for the datapoint
        print(f"{COLOR_MAP[color]}{i+1}: Guess: ${guess:,.2f} Truth: ${truth:,.2f} Error: ${error:,.2f} SLE: {sle:,.2f} Item: {title}{RESET}")

    # Create a scatter plot of predictions vs ground truth
    def chart(self, title):
        """Plot guesses against truths, colored by severity, with a y = x reference line."""
        plt.figure(figsize=(12, 8))
        max_val = max(max(self.truths), max(self.guesses))
        plt.plot([0, max_val], [0, max_val], color='deepskyblue', lw=2, alpha=0.6)  # Diagonal line
        plt.scatter(self.truths, self.guesses, s=3, c=self.colors)  # Plot points
        plt.xlabel('Ground Truth')
        plt.ylabel('Model Estimate')
        plt.xlim(0, max_val)
        plt.ylim(0, max_val)
        plt.title(title)
        plt.show()

    # Report metrics: average error, RMSLE, and hit rate
    def report(self):
        """Summarize the recorded results in the chart title and show the chart."""
        evaluated = len(self.errors)
        if evaluated == 0:
            # Nothing was scored (empty dataset or size=0): avoid dividing by zero
            print("No datapoints were evaluated; nothing to report.")
            return
        average_error = sum(self.errors) / evaluated
        rmsle = math.sqrt(sum(self.sles) / evaluated)
        hits = self.colors.count("green")
        title = f"{self.title} Error=${average_error:,.2f} RMSLE={rmsle:,.2f} Hits={hits/evaluated*100:.1f}%"
        self.chart(title)

    # Run the full evaluation loop
    def run(self):
        """Score the first ``self.size`` datapoints, then report."""
        for i in range(self.size):
            self.run_datapoint(i)
        self.report()

    # Convenience method to run a test directly
    @classmethod
    def test(cls, function, data):
        """Build a tester for ``function`` over ``data`` with default settings and run it."""
        cls(function, data).run()


# Now for something basic

What's the very simplest model you could imagine?

Let's start with a random number generator!

In [None]:
def random_pricer(item, test=None):
    """Return a random integer price from 1 to 999, ignoring the input.

    Tester calls every predictor with a single argument (the prompt text), so
    ``test`` has to be optional. It is unused and kept only so that existing
    two-argument calls keep working.
    """
    return random.randrange(1, 1000)

In [None]:
# Set the random seed so the random guesses are the same on every run
random.seed(42)

# Run our Tester harness on the random baseline
Tester.test(random_pricer, test)

In [None]:
# That was fun!
# We can do better - here's another rather trivial model: always guess the mean training price

training_prices = [row["price"] for row in train]
training_average = sum(training_prices) / len(training_prices)


def constant_pricer(item):
    """Return the average training price, whatever the input is."""
    return training_average

In [None]:
# Run our constant (average training price) predictor through the Tester harness
Tester.test(constant_pricer, test)

In [None]:
# For the next few models, we prepare our documents and prices
# The documents are the `text` prompts of the TRAINING items; the targets are kept
# separately in `prices`.
# NOTE(review): this presumes `text` does not contain the price, otherwise the
# features would reveal the answer - confirm against the dataset.

prices = np.array([float(item["price"]) for item in train])
documents = [item["text"] for item in train]


In [None]:
# Use the CountVectorizer for a Bag of Words model

np.random.seed(42)
# Cap the vocabulary at 1000 features and use the built-in English stop-word list
vectorizer = CountVectorizer(max_features=1000, stop_words='english')
# Learn the vocabulary from the training documents and turn them into a count matrix
X = vectorizer.fit_transform(documents)
# Fit a linear regression from the word counts to the training prices
regressor = LinearRegression()
regressor.fit(X, prices)

In [None]:
def bow_lr_pricer(text):
    """Estimate a price for the prompt string ``text`` with the Bag of Words regression.

    The prediction is floored at 0 so a negative price is never returned.
    """
    features = vectorizer.transform([text])  # single-document batch
    prediction = regressor.predict(features)[0]
    return max(prediction, 0)


In [None]:
# Test the Bag of Words + Linear Regression model with the Tester harness

Tester.test(bow_lr_pricer, test)

In [None]:
# The amazing word2vec model, implemented in gensim NLP library

# NOTE(review): this seeds NumPy's global RNG only. gensim's Word2Vec takes its own
# `seed` argument, and training with several workers is presumably not fully
# reproducible - confirm against the gensim documentation.
np.random.seed(42)

# Preprocess the documents (each document becomes a list of tokens)
processed_docs = [simple_preprocess(doc) for doc in documents]

# Train Word2Vec model: 400-dimensional vectors, window of 5, min_count=1, 8 workers
w2v_model = Word2Vec(sentences=processed_docs, vector_size=400, window=5, min_count=1, workers=8)

In [None]:
# This step of averaging vectors across the document is a weakness in our approach

def document_vector(doc):
    """Return the mean Word2Vec vector of the tokens in ``doc``.

    Tokens come from ``simple_preprocess``; tokens that are not in the model's
    vocabulary are skipped. If no token has a vector, a zero vector of length
    ``w2v_model.vector_size`` is returned instead.
    """
    tokens = simple_preprocess(doc)
    vocabulary = w2v_model.wv
    vectors = [vocabulary[token] for token in tokens if token in vocabulary]
    if not vectors:
        return np.zeros(w2v_model.vector_size)
    return np.mean(vectors, axis=0)

# Create feature matrix: one averaged vector per training document
X_w2v = np.array([document_vector(text) for text in documents])

In [None]:
# Run Linear Regression on word2vec
# (features are the averaged document vectors in X_w2v, targets are the training prices)

word2vec_lr_regressor = LinearRegression()
word2vec_lr_regressor.fit(X_w2v, prices)

In [None]:
def word2vec_lr_pricer(text):
    """Estimate a price for the prompt string ``text`` with the Word2Vec linear regression.

    The prediction is floored at 0 so a negative price is never returned.
    """
    features = document_vector(text)
    estimate = word2vec_lr_regressor.predict([features])[0]  # single-row batch
    return max(0, estimate)


In [None]:
# Evaluate the Word2Vec + Linear Regression model with the Tester harness
Tester.test(word2vec_lr_pricer, test)

In [None]:
# Support Vector Machines

# LinearSVR is created with default settings (no explicit random_state), so the
# global NumPy seed set here is presumably what keeps the fit repeatable - TODO confirm
np.random.seed(42)
svr_regressor = LinearSVR()

# Fit on the averaged Word2Vec document vectors and the training prices
svr_regressor.fit(X_w2v, prices)

In [None]:
def svr_pricer(text):
    """Estimate a price for the prompt string ``text`` with the fitted LinearSVR.

    The prediction is converted to a plain float and floored at 0. The global
    NumPy RNG is deliberately left untouched: reseeding it on every call
    mutated shared state without affecting the returned prediction.
    """
    doc_vector = document_vector(text)
    return max(float(svr_regressor.predict([doc_vector])[0]), 0)


In [None]:
# Evaluate the Word2Vec + LinearSVR model with the Tester harness
Tester.test(svr_pricer, test)

In [None]:
# And the powerful Random Forest regression
# (100 trees, fixed random_state, n_jobs=8; trained on the averaged Word2Vec vectors)

rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=8)
rf_model.fit(X_w2v, prices)

In [None]:
def random_forest_pricer(text):
    """Estimate a price for the prompt string ``text`` with the random forest.

    The prediction is floored at 0 so a negative price is never returned.
    """
    features = document_vector(text)
    estimate = rf_model.predict([features])[0]  # single-row batch
    return max(0, estimate)


In [None]:
# Evaluate the Word2Vec + Random Forest model with the Tester harness
Tester.test(random_forest_pricer, test)