Current Model Notebook

In [None]:
# Create the dataset

import numpy as np
import pandas as pd
from datetime import datetime
import random as rand
from sklearn.feature_selection import SelectPercentile, f_regression
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler
import torch
import json
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import joblib
import warnings
from transformers import logging
import joblib
from math import log

# Silence FutureWarning noise and lower the transformers log level to errors
warnings.filterwarnings("ignore", category=FutureWarning)
logging.set_verbosity_error()

# Load the raw listings
df = pd.read_csv("training_cars_data.csv")

# "Car Model" holds "<4-digit year> <model name>"; split it into Year / Model
if 'Car Model' in df.columns:
    df[['Year', 'Model']] = df['Car Model'].str.extract(r'(\d{4})\s(.*)')
    df = df.drop(columns=['Car Model'])

# Shorten verbose column names. rename() skips keys that are not present,
# so this is a no-op for frames that already use the short names.
df = df.rename(columns={'Price (USD)': 'Price', 'Dealer Name': 'Dealer'})

# Collapse every condition that mentions "certified" into plain 'Certified'
df['Condition'] = [
    'Certified' if 'certified' in str(_value).lower() else _value
    for _value in df['Condition']
]

# Bucket scraping artefacts and missing conditions under 'Other'
df['Condition'] = df['Condition'].replace(
    ['Prequalify now', 'stock_type', 'New & Used', np.nan],
    'Other',
)

# Columns that arrive as text and must become numbers
columns_to_clean = ['Monthly Payment', 'Mileage', 'Price', 'Year']

# Strip every non-digit character, then parse; unparseable cells become NaN
for _col in columns_to_clean:
    df[_col] = df[_col].str.replace(r'\D', '', regex=True)
for _col in columns_to_clean:
    df[_col] = pd.to_numeric(df[_col], errors='coerce')

# A missing mileage is treated as zero miles
df = df.fillna({'Mileage': 0})

# Synthetic feature generation
def random_accident(row):
    """Draw a synthetic accident flag; ``row`` is ignored.

    Returns 1 (accident) with weight 30 or 0 (no accident) with weight 70.
    """
    outcomes = (1, 0)
    odds = (30, 70)
    (flag,) = rand.choices(outcomes, weights=odds, k=1)
    return flag

def random_owners(row):
    """Draw a synthetic previous-owner count in 0..4; ``row`` is ignored.

    The counts 0, 1, 2, 3, 4 are drawn with relative weights
    40, 40, 30, 20, 10 respectively.
    """
    weighted_counts = {0: 40, 1: 40, 2: 30, 3: 20, 4: 10}
    (owners,) = rand.choices(
        list(weighted_counts),
        weights=list(weighted_counts.values()),
        k=1,
    )
    return owners

def random_usage(row):
    """Draw a synthetic usage flag; ``row`` is ignored.

    Returns 1 (personal use) with weight 80 or 0 (not personal) with weight 20.
    """
    outcomes = (1, 0)
    odds = (80, 20)
    (flag,) = rand.choices(outcomes, weights=odds, k=1)
    return flag

# Attach the synthetic columns: one independent random draw per row, in the
# order Accidents -> Owners -> Usage
df = df.assign(
    Accidents=lambda frame: frame.apply(random_accident, axis=1),
    Owners=lambda frame: frame.apply(random_owners, axis=1),
    Usage=lambda frame: frame.apply(random_usage, axis=1),
)

# Derived features. The +1 in each denominator avoids dividing by zero for
# current-year cars and zero-mileage cars.
current_year = datetime.now().year
df = df.assign(
    Age=lambda frame: current_year - frame['Year'],
    MilesPerYear=lambda frame: frame['Mileage'] / (frame['Age'] + 1),
    PricePerMile=lambda frame: frame['Price'] / (frame['Mileage'] + 1),
)

# One-hot encoding of the categorical columns (all levels kept)
categorical_columns = ['Condition', 'Model', 'Dealer']
df = pd.get_dummies(df, columns=categorical_columns, drop_first=False)

# def simple_scoring(row):
#     if pd.isna(row['Mileage']) or pd.isna(row['Price']) or pd.isna(row['Year']):
#         return np.nan
    
#     age = current_year - row['Year']
#     price_score = row['Price'] / 2000  
#     mileage_score = row['Mileage'] / 2500    
#     age_score = age * 5                     
    
#     weighted_sum = (0.5 * price_score) + (0.4 * mileage_score) + (0.1 * age_score)
    
#     if row['Accidents'] == 1:
#         weighted_sum += 15  
    
#     if row['Owners'] == 2:
#         weighted_sum += 5
#     elif row['Owners'] > 2:
#         weighted_sum += 10
    
#     if row['Usage'] == 0:  # Commercial usage penalty
#         weighted_sum += 10
    
#     return weighted_sum

 
# df['Score'] = df.apply(simple_scoring, axis=1)

# Normalisation baselines used by enhanced_scoring
MEDIAN_PRICE = 25000  # baseline the log-scaled price term is measured against
MAX_MILEAGE = 250000  # mileage is clamped to this before scaling
MAX_AGE = 30          # age in years is clamped to this before scaling


def enhanced_scoring(row):
    """Compute the heuristic score for one listing row.

    The score starts as a weighted blend of price (55%), mileage (35%) and
    age (10%) terms, then condition / accident / owner / usage / yearly
    mileage adjustments are applied: bonuses are subtracted and penalties
    are added.

    Args:
        row: A mapping-like row (e.g. a pandas Series) exposing ``.get`` and
            item access. Reads 'Mileage', 'Year', 'Price', 'Monthly Payment',
            the 'Condition_*' one-hot flags, 'Accidents', 'Owners', 'Usage'.

    Returns:
        The score as a float, or NaN when mileage or year is missing, or
        when neither a price nor a monthly payment is available.
    """
    mileage = row.get('Mileage')
    if pd.isna(mileage) or pd.isna(row.get('Year')):
        return np.nan

    # A model year in the future would give a negative age; floor it at zero
    age = current_year - row['Year']
    age = 0 if age < 0 else age

    # Prefer the listed price; otherwise estimate it as 60 monthly payments
    price = row.get('Price')
    if pd.isna(price):
        monthly_payment = row.get('Monthly Payment')
        if pd.isna(monthly_payment):
            return np.nan  # no price information at all
        price_for_score = monthly_payment * 60
    else:
        price_for_score = price

    clamped_mileage = min(row['Mileage'], MAX_MILEAGE)
    clamped_age = min(age, MAX_AGE)
    age_ratio = clamped_age / MAX_AGE

    # Log scale for price, power scale for mileage and age
    price_score = 100 * np.log1p(price_for_score / MEDIAN_PRICE)
    mileage_score = 100 * ((clamped_mileage / MAX_MILEAGE) ** 0.8)
    age_score = 100 * (age_ratio ** 0.7)

    total = (0.55 * price_score) + (0.35 * mileage_score) + (0.10 * age_score)

    # Condition: bonus for new / certified, small penalty when unknown
    if row.get('Condition_New', 0) == 1:
        total -= 20
    elif row.get('Condition_Certified', 0) == 1:
        total -= 15
    elif not (row.get('Condition_Used', 0) == 1):
        total += 5

    # Accidents: harsher penalty for more than one
    accidents = row.get('Accidents', 0)
    if accidents == 1:
        total += 10
    elif accidents > 1:
        total += 15

    # Owners: flat 8 for the second owner, +3 for each one beyond that
    owners = row.get('Owners', 1)
    if owners >= 2:
        total += 8 + 3 * (owners - 2)

    # Usage: 0 marks non-personal use; the penalty grows with age
    if row.get('Usage', 1) == 0:
        total += 15 * (1 + age_ratio)

    # Yearly mileage, counting a brand-new car as one year of use
    years_used = age if age > 0 else 1
    miles_per_year = row['Mileage'] / years_used
    if miles_per_year < 7500:
        total -= 5   # low yearly mileage is good
    elif miles_per_year > 15000:
        total += 10  # high yearly mileage is bad

    return total

# Score every listing with the heuristic
df['Score'] = df.apply(enhanced_scoring, axis=1)

# Keep only rows that have every core numeric field and a score
df = df.dropna(subset=['Monthly Payment', 'Mileage', 'Price', 'Year', 'Score'])

X = df.drop(columns=['Score'])
y = df['Score']

# Split FIRST, so that feature selection and scaling below are fit on the
# training rows only. Fitting them on the full dataset leaks test-set
# statistics into preprocessing and makes the test metrics optimistic.
# Same test_size / random_state as before, so the row partition is unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y.values.reshape(-1, 1), test_size=0.2, random_state=42)

# Feature selection: keep the top 50% of features ranked by f_regression
selector = SelectPercentile(f_regression, percentile=50)
X_train_selected = selector.fit_transform(X_train_raw, y_train.ravel())
X_test_selected = selector.transform(X_test_raw)

# Names of the columns that survived selection, in model input order
selected_features = [X.columns[i] for i in selector.get_support(indices=True)]
print(f"Selected {len(selected_features)} features:")
print(selected_features)

# Persist the feature list; inference must build its input in this order
model_features = selected_features
with open('model_features.json', 'w') as f:
    json.dump(model_features, f)

# Standardize features using training-set statistics only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_selected)
X_test = scaler.transform(X_test_selected)

joblib.dump(scaler, 'scaler.pkl')

# Full-dataset views, kept only so any code that still references the old
# X_selected / X_scaled names keeps working
X_selected = selector.transform(X)
X_scaled = scaler.transform(X_selected)

# Save the (scaled) training matrix
X_train_df = pd.DataFrame(X_train, columns=model_features)
X_train_df.to_pickle("X_train.pkl")

# Gradient-boosted baseline to compare the neural network against.
# Early stopping monitors the (X_test, y_test) split passed as eval_set.
print("\nXGBoost Benchmark:")
xgb = XGBRegressor(
    random_state=42,
    n_estimators=1000,
    learning_rate=0.01,
    early_stopping_rounds=50,
)
xgb.fit(X_train, y_train, verbose=False, eval_set=[(X_test, y_test)])

xgb_pred = xgb.predict(X_test)

# Same three regression metrics that are reported for the neural network
xgb_mse, xgb_r2, xgb_mae = (
    metric(y_test, xgb_pred)
    for metric in (mean_squared_error, r2_score, mean_absolute_error)
)

print(f"XGBoost MSE: {xgb_mse:.2f}")
print(f"XGBoost R²: {xgb_r2:.4f}")
print(f"XGBoost MAE: {xgb_mae:.2f}")

# Scale the target: the scaler is fit on the training targets and reused
# unchanged for the test targets
y_scaler = RobustScaler()
y_train_scaled, y_test_scaled = (
    y_scaler.fit_transform(y_train),
    y_scaler.transform(y_test),
)

# Persist the target scaler so predictions can be mapped back to scores
joblib.dump(y_scaler, 'model_scaler.pkl')

# float32 tensors for features and (column-vector) targets
X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor = (
    torch.tensor(values, dtype=torch.float32)
    for values in (
        X_train,
        y_train_scaled.reshape(-1, 1),
        X_test,
        y_test_scaled.reshape(-1, 1),
    )
)

# Neural network setup: use the GPU when one is available
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("\nUsing device:", device)

# Datasets and loaders: only the training loader shuffles
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=64)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=64)

# Model Definition
class ImprovedMLP(nn.Module):
    """Two-hidden-layer MLP regressor: input -> 128 -> 64 -> 1.

    Each hidden stage is Linear -> ReLU -> BatchNorm1d -> Dropout, with
    dropout rates 0.3 and 0.2. The stack is stored as ``self.net`` so the
    state-dict keys are ``net.<index>.*``.
    """

    def __init__(self, input_size):
        """Build the network for feature vectors of length ``input_size``."""
        super().__init__()
        layers = []
        in_features = input_size
        # (hidden width, dropout probability) for each hidden stage
        for width, drop_p in ((128, 0.3), (64, 0.2)):
            layers.append(nn.Linear(in_features, width))
            layers.append(nn.ReLU())
            layers.append(nn.BatchNorm1d(width))
            layers.append(nn.Dropout(drop_p))
            in_features = width
        layers.append(nn.Linear(in_features, 1))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return one prediction per input row, shape (batch, 1)."""
        return self.net(x)

# Training with Early Stopping
#
# Trains ImprovedMLP on train_loader with SmoothL1 loss and Adam, checkpoints
# the weights with the lowest validation loss to 'best_car_model.pth', and
# stops once `patience` consecutive epochs bring no improvement.
# NOTE(review): validation runs on test_loader, i.e. the same split used for
# the final evaluation, so checkpoint selection has seen the test data —
# consider carving out a separate validation split.
# NOTE(review): patience (100) is half of the 200-epoch budget, so early
# stopping can only trigger if the best epoch falls within the first 100.
model = ImprovedMLP(X_train.shape[1]).to(device)
criterion = nn.SmoothL1Loss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

best_loss = float('inf')  # lowest validation loss seen so far
patience = 100            # epochs without improvement tolerated before stopping
no_improve = 0            # consecutive epochs without improvement

for epoch in range(200):
    model.train()
    running_loss = 0.0
    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        # Cap the global gradient norm at 1.0 before the optimizer step
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        # Weight the batch loss by batch size so the epoch figure below is a
        # per-sample average even when the last batch is smaller
        running_loss += loss.item() * batch_X.size(0)
    
    epoch_loss = running_loss / len(train_loader.dataset)
    
    # Validation (eval mode, no gradients), averaged per sample like above
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch_X, batch_y in test_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            outputs = model(batch_X)
            val_loss += criterion(outputs, batch_y).item() * batch_X.size(0)
    
    val_loss /= len(test_loader.dataset)
    
    if val_loss < best_loss:
        # New best: reset the patience counter and checkpoint the weights
        best_loss = val_loss
        no_improve = 0
        torch.save(model.state_dict(), 'best_car_model.pth')

    else:
        no_improve += 1
        if no_improve >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break
    
    # Progress report every 10th epoch
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}: Train Loss: {epoch_loss:.4f}, Val Loss: {val_loss:.4f}")

# Load the best checkpoint written during training.
# map_location lets a checkpoint saved on one device load on the current one;
# weights_only=True restricts unpickling to tensor data, matching how
# CarScorePredictor loads the same file.
model.load_state_dict(
    torch.load('best_car_model.pth', map_location=device, weights_only=True)
)
y_scaler = joblib.load('model_scaler.pkl')

# Evaluation: collect predictions and targets over the whole test loader
model.eval()
all_preds, all_targets = [], []

with torch.no_grad():
    for batch_X, batch_y in test_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        outputs = model(batch_X)
        all_preds.append(outputs.cpu())
        all_targets.append(batch_y.cpu())

predictions = torch.cat(all_preds).numpy()
targets = torch.cat(all_targets).numpy()

# Undo the target scaling so metrics are expressed in score units
preds_rescaled = y_scaler.inverse_transform(predictions)
y_true_rescaled = y_scaler.inverse_transform(targets)

print("All predictions (rescaled):")
print(preds_rescaled.flatten())


mse = mean_squared_error(y_true_rescaled, preds_rescaled)
r2 = r2_score(y_true_rescaled, preds_rescaled)
mae = mean_absolute_error(y_true_rescaled, preds_rescaled)

# "Accuracy" here is the share of predictions within +/- tolerance score
# points of the true score
tolerance = 1
within_tolerance = np.abs(y_true_rescaled - preds_rescaled) <= tolerance
accuracy = np.mean(within_tolerance) * 100

print("\nFinal Evaluation:")
print(f"Test MSE: {mse:.2f}")
print(f"Test R²: {r2:.4f}")
print(f"Test MAE: {mae:.2f}")
print(f"Accuracy (within ±{tolerance}): {accuracy:.2f}%")

  from .autonotebook import tqdm as notebook_tqdm


Selected 1718 features:
['Mileage', 'Price', 'Monthly Payment', 'Year', 'Accidents', 'Owners', 'Usage', 'Age', 'MilesPerYear', 'PricePerMile', 'Condition_Certified', 'Condition_New', 'Condition_Other', 'Condition_Used', 'Model_Acura ILX 2.0', 'Model_Acura ILX Premium', 'Model_Acura ILX Technology & A-Spec Packages', 'Model_Acura Integra', 'Model_Acura Integra A-SPEC', 'Model_Acura Integra A-SPEC w/ Technology', 'Model_Acura Integra A-Spec', 'Model_Acura Integra A-Spec Package', 'Model_Acura Integra A-Spec Tech Package', 'Model_Acura Integra A-Spec Technology', 'Model_Acura Integra Base', 'Model_Acura Integra W/A-SPEC TECHNOLOGY PACKAGE', 'Model_Acura MDX 3.5L w/Technology Package', 'Model_Acura MDX A-SPEC', 'Model_Acura MDX A-Spec', 'Model_Acura MDX A-Spec Package', 'Model_Acura MDX Advance', 'Model_Acura MDX SH-AWD Technology', 'Model_Acura NSX Base', 'Model_Acura NSX T', 'Model_Acura RDX A-Spec', 'Model_Acura RDX A-Spec Advance', 'Model_Acura RDX A-Spec Package', 'Model_Acura RDX Bas

In [None]:
# Create the LLM input processor
from transformers import pipeline
import re
import json
import random

class CarScorePredictor:
    """Score a free-text car listing with the trained ImprovedMLP.

    Pipeline: ``extract_car_info`` parses the text into a dict of fields,
    ``prepare_features`` turns that dict into a scaled model input in
    ``model_features`` order, and ``predict_score`` runs the network and
    maps its output back to score units.
    """

    # "no accidents", "zero reported accidents", "accident-free", ...
    _NEGATED_ACCIDENT_RE = re.compile(
        r"\b(?:no|zero|0|without(?:\s+any)?)\s+"
        r"(?:(?:reported|known|prior|previous)\s+)?accidents?(?:\s+history)?\b"
        r"|\baccident[-\s]free\b",
        re.IGNORECASE,
    )
    # "2 owners", "4+ owners", "1 previous owner"
    _OWNER_DIGITS_RE = re.compile(
        r"\b(\d+)\+?\s*(?:previous\s+)?owners?", re.IGNORECASE)
    # "one owner", "single owner", "one previous owner", "four+ owners"
    _OWNER_WORDS_RE = re.compile(
        r"\b(one|single|two|three|four)\+?[\s-]*(?:previous\s+)?owners?",
        re.IGNORECASE,
    )
    _OWNER_WORD_VALUES = {'one': 1, 'single': 1, 'two': 2, 'three': 3, 'four': 4}
    # Explicit "Dealer: <name>" label; the name ends at the first ". ", ", "
    # or "; " (or at end of line), so names containing those are truncated.
    _DEALER_LABEL_RE = re.compile(
        r"\bdealer:\s*(.+?)(?=[.,;]\s|[.,;]?\s*$)",
        re.IGNORECASE | re.MULTILINE,
    )

    def __init__(self):
        """Load the scalers, feature list, model weights and NLP pipelines.

        Reads 'scaler.pkl', 'model_scaler.pkl', 'model_features.json',
        'all_make_model_keys.json', 'X_train.pkl' and 'best_car_model.pth'
        from the working directory. Relies on the module-level ``device`` and
        ``ImprovedMLP`` defined by the training cell.
        """
        # NOTE: joblib / pickle files can execute code on load; only load
        # artefacts produced by the training cell.
        self.input_scaler = joblib.load('scaler.pkl')
        self.output_scaler = joblib.load('model_scaler.pkl')

        # Load model features (defines the model's input column order)
        with open('model_features.json', 'r') as f:
            self.model_features = json.load(f)

        with open('all_make_model_keys.json', 'r') as f:
            self.model_slugs = json.load(f)

        # "toyota_camry" / "toyota-camry" -> "Toyota Camry"
        self.model_types = [slug.replace('_', ' ').replace('-', ' ').title() for slug in self.model_slugs]

        # Column means of the saved training matrix. That matrix is already
        # scaled, so these are NOT raw-unit values; prepare_features prefers
        # the input scaler's raw means and only falls back to these.
        self.X_train_df = pd.read_pickle("X_train.pkl")
        self.default_values = self.X_train_df.mean()

        # Prefixes of the one-hot encoded categorical fields
        self.one_hot_prefixes = ['Condition_', 'Model_', 'Dealer_']

        # Initialize the neural network; map_location lets a checkpoint
        # saved on another device load on the current one
        self.model = ImprovedMLP(input_size=len(self.model_features))
        self.model.load_state_dict(
            torch.load('best_car_model.pth', map_location=device, weights_only=True)
        )
        self.model.to(device)
        self.model.eval()

        # Token-classification pipeline used for entity extraction
        self.ner_pipeline = pipeline(
            "token-classification",
            model="dslim/bert-base-NER",
            aggregation_strategy="simple"
        )

        # Question-answering pipeline (not used by the methods of this class)
        self.qa_pipeline = pipeline(
            "question-answering",
            model="distilbert-base-uncased-distilled-squad"
        )

    def extract_car_info(self, text):
        """Extract structured car information from natural language.

        Args:
            text: The listing description.

        Returns:
            A dict with keys 'Year', 'Model', 'Mileage', 'Price', 'Condition',
            'Dealer', 'Monthly Payment' (``None`` when not found) and
            'Accidents', 'Owners', 'Usage' (defaulting to 0, 1 and 1).
        """
        entities = self.ner_pipeline(text)

        car_info = {
            'Year': None,
            'Model': None,
            'Mileage': None,
            'Price': None,
            'Condition': None,
            'Dealer': None,
            'Monthly Payment': None,
            'Accidents': 0,  # Default to no accidents
            'Owners': 1,     # Default to 1 owner
            'Usage': 1       # Default to personal use (1)
        }

        # Simple regex-based extractions for numerical fields
        mileage_match = re.search(r'\b([0-9,]+)\s*(mi|miles)\b\.?', text, re.IGNORECASE)
        price_match = re.search(r'\$([0-9,]+)\b(?![a-zA-Z/]| per)', text)
        payment_match = re.search(r'\$([0-9,]+)\s*(?:/mo|per month)', text, re.IGNORECASE)
        year_match = re.search(r'\b(20\d{2}|19\d{2})\b', text)

        if mileage_match:
            car_info['Mileage'] = int(mileage_match.group(1).replace(',', ''))
        if price_match:
            car_info['Price'] = int(price_match.group(1).replace(',', ''))
        if payment_match:
            car_info['Monthly Payment'] = int(payment_match.group(1).replace(',', ''))
        if year_match:
            car_info['Year'] = int(year_match.group(0))

        # Accidents: drop negated mentions ("no accidents") first, so only a
        # remaining, un-negated mention flags the car
        without_negations = self._NEGATED_ACCIDENT_RE.sub(' ', text)
        if re.search(r'accident', without_negations, re.IGNORECASE):
            car_info['Accidents'] = 1

        # Owner count: digits ("4+ owners") take priority over number words
        owners_match = self._OWNER_DIGITS_RE.search(text)
        if owners_match:
            car_info['Owners'] = int(owners_match.group(1))
        else:
            owners_word = self._OWNER_WORDS_RE.search(text)
            if owners_word:
                car_info['Owners'] = self._OWNER_WORD_VALUES[owners_word.group(1).lower()]

        # Usage type (binary encoding: 0 = not personal use)
        if re.search(r'commercial|business|fleet', text, re.IGNORECASE):
            car_info['Usage'] = 0

        # An explicit "Dealer: <name>" label is the most reliable source and
        # must not be overridden by NER (which tags car makes as ORG too)
        labelled_dealer = self._DEALER_LABEL_RE.search(text)
        if labelled_dealer:
            car_info['Dealer'] = labelled_dealer.group(1).strip()

        # Pull entities from NER
        for ent in entities:
            label = ent['entity_group']
            word = ent['word']

            if label == 'ORG':
                if not labelled_dealer:
                    car_info['Dealer'] = word
            elif label == 'MISC' or label == 'PRODUCT':
                car_info['Model'] = word
            elif label == 'DATE' and not car_info['Year']:
                year_in_entity = re.search(r'\d{4}', word)
                if year_in_entity:
                    car_info['Year'] = int(year_in_entity.group(0))

        # A known make/model name found in the text overrides the NER guess
        for candidate in self.model_types:
            if re.search(rf'\b{re.escape(candidate)}\b', text, re.IGNORECASE):
                car_info['Model'] = candidate
                break

        # Sales condition; any "certified ..." wording maps to 'Certified',
        # the single category used for it in the training data
        condition_match = re.search(r'\b(New|Used|Certified(?: Pre-Owned)?)\b', text, re.IGNORECASE)
        if condition_match:
            condition = condition_match.group(1)
            if 'certified' in condition.lower():
                car_info['Condition'] = 'Certified'
            else:
                car_info['Condition'] = condition.title()

        # Last resort for the dealer: free-form "at <Name>" style phrases
        if not car_info['Dealer']:
            dealer_match = re.search(
                r'\b(?:at dealer|dealer:|at)\s+([A-Z][\w&.,\- ]{2,100})',
                text,
                re.IGNORECASE
            )
            if dealer_match:
                # Clean extra whitespace and strip trailing punctuation
                dealer_name = dealer_match.group(1).strip().rstrip('.,')
                dealer_name = re.sub(r'\bdealer\b', '', dealer_name, flags=re.IGNORECASE).strip()
                car_info['Dealer'] = dealer_name

        return car_info

    def _model_device(self):
        """Return the device the model's parameters live on (CPU if none)."""
        try:
            return next(self.model.parameters()).device
        except StopIteration:
            return torch.device('cpu')

    def prepare_features(self, car_info):
        """Convert extracted car info into the scaled model input tensor.

        Builds one raw-unit row in ``model_features`` order, then applies the
        input scaler. Features the model does not use are skipped.

        Args:
            car_info: Dict as returned by ``extract_car_info``.

        Returns:
            A float32 tensor of shape (1, n_features) on the model's device.
        """
        from datetime import datetime

        columns = list(self.model_features)
        position = {name: idx for idx, name in enumerate(columns)}
        row = np.zeros(len(columns))

        # Raw-unit training means (StandardScaler exposes them as `mean_`)
        raw_means = getattr(self.input_scaler, 'mean_', None)

        def default_for(col):
            """Return the imputation value for a missing field, or None."""
            idx = position.get(col)
            if idx is not None and raw_means is not None:
                return raw_means[idx]
            if col in self.default_values:
                return self.default_values[col]
            return None

        # Direct numeric fields; missing ones fall back to the training mean
        known = {}
        for col in ('Year', 'Mileage', 'Price', 'Monthly Payment',
                    'Accidents', 'Owners', 'Usage'):
            val = car_info.get(col)
            if val is None:
                val = default_for(col)
            if val is None:
                continue
            known[col] = float(val)
            if col in position:
                row[position[col]] = known[col]

        # Derived features, computed with the same formulas as in training
        year = known.get('Year')
        mileage = known.get('Mileage')
        price = known.get('Price')
        if year is not None:
            age = datetime.now().year - year
            if 'Age' in position:
                row[position['Age']] = age
            if mileage is not None and 'MilesPerYear' in position and age + 1 != 0:
                row[position['MilesPerYear']] = mileage / (age + 1)
        if (price is not None and mileage is not None
                and 'PricePerMile' in position and mileage + 1 != 0):
            row[position['PricePerMile']] = price / (mileage + 1)

        # One-hot fields: 'Condition_' -> car_info['Condition'], and so on.
        # Exact feature names win; otherwise match case-insensitively.
        lowered = None
        for prefix in self.one_hot_prefixes:
            value = car_info.get(prefix.rstrip('_'))
            if not value:
                continue
            encoded = f"{prefix}{value}"
            idx = position.get(encoded)
            if idx is None:
                if lowered is None:
                    lowered = {}
                    for name, name_idx in position.items():
                        lowered.setdefault(str(name).lower(), name_idx)
                idx = lowered.get(encoded.lower())
            if idx is not None:
                row[idx] = 1.0

        # Apply input scaling
        scaled_features = self.input_scaler.transform(row.reshape(1, -1))

        # Convert to tensor on the same device as the model
        return torch.tensor(scaled_features, dtype=torch.float32).to(self._model_device())

    def predict_score(self, text_input):
        """Run the full text -> score pipeline and return the score.

        Prints the extracted fields, then returns the model prediction
        mapped back to score units by the output scaler.
        """
        car_info = self.extract_car_info(text_input)
        print("Extracted car info:", car_info)

        features_tensor = self.prepare_features(car_info)

        with torch.no_grad():
            prediction = self.model(features_tensor)
            score = self.output_scaler.inverse_transform(prediction.cpu().numpy().reshape(-1, 1))

        return score[0][0]

# Example usage: generate random listing descriptions, score each one and
# report the lowest-scoring query
if __name__ == "__main__":
    predictor = CarScorePredictor()

    def _categories(prefix):
        """Return category names of the one-hot features starting with `prefix`.

        Only the leading prefix is stripped, so a name that happens to
        contain the prefix text again further in is left intact.
        """
        return [name[len(prefix):] for name in model_features if name.startswith(prefix)]

    # Define options for different fields
    years = [str(y) for y in range(1998, 2023)]
    sample_models = _categories("Model_")
    mileages = [f"{x:,}" for x in range(0, 250001, 5000)]
    prices = [f"${x:,}" for x in range(4000, 120001, 2000)]
    conditions = _categories("Condition_")
    dealers = _categories("Dealer_")
    payments = [f"${x:,}" for x in range(120, 3001, 50)]

    # Options for additional features
    accident_options = ["no accidents", "accident reported", "clean history", "1 accident", "accident history"]
    owner_options = ["1 owner", "2 owners", "3 owners", "4+ owners", "single owner", "one previous owner"]
    usage_options = ["private use", "commercial use", "fleet use", "personal use", "business use"]

    # Create templates with varying levels of information
    basic_templates = [
        "{year} {model}, {mileage} miles, {price}, {condition} condition",
        "{model} from {year} with {mileage} miles, {price}",
        "Dealer: {dealer}. {year} {model}, {mileage} miles, {price}",
        "Looking at a {year} {model}, {condition} condition, {price}"
    ]

    intermediate_templates = [
        "{year} {model}, {mileage} miles, {price}, {condition} condition, {owner}",
        "{model} ({year}), {mileage} miles, {price}, {usage}, {accident}",
        "Found a {year} {model}, {condition} condition, {price}, {owner}, {accident}",
        "{year} {model}, {mileage} miles, {price}, {usage}, dealer: {dealer}"
    ]

    full_info_templates = [
        "{year} {model}, {mileage} miles, {price}, {condition} condition, {owner}, {accident}, {usage}",
        "Dealer: {dealer}. {year} {model}, {mileage} miles, {price}, {condition}, {owner}, {usage}, {accident}",
        "Complete info: {year} {model}, {mileage} miles, {price}, {condition}, {owner}, {accident}, {usage}, dealer: {dealer}"
    ]

    # Combine all templates
    all_templates = basic_templates + intermediate_templates + full_info_templates

    # Generate 100 diverse test queries with varying information completeness
    test_queries = []
    for _ in range(100):
        template = random.choice(all_templates)

        # Fields every template may use
        fields = dict(
            year=random.choice(years),
            model=random.choice(sample_models),
            mileage=random.choice(mileages),
            price=random.choice(prices),
            condition=random.choice(conditions),
            dealer=random.choice(dealers),
        )
        # Basic templates never mention the optional history fields
        if template not in basic_templates:
            fields.update(
                accident=random.choice(accident_options),
                owner=random.choice(owner_options),
                usage=random.choice(usage_options),
            )
        test_queries.append(template.format(**fields))

    # Add some manual examples that cover edge cases
    test_queries.extend([
        "2021 Toyota Camry with 35,000 miles for $25,000 in used condition from AutoNation, no accidents, 1 owner",
        "2019 Honda Accord, 75,000 miles, $18,500, used condition, personal use",
        "Tesla Model 3 2022 with 12,000 miles priced at $42,000, clean history",
        "Ford F-150 2017, 120,000 miles, $22,999, used condition, commercial use, accident reported",
        "2018 Nissan Rogue, $16,500, 85,000 miles, used condition, 3 owners",
        "2018 Subaru Baja, $500 per month, 2,000 miles, used condition, 1 owner",
        "2022 Toyota Camry with 12,000 miles for $9,000 in used condition, personal use, 1 owner, clean history"
    ])

    # Lowest score seen so far and the query that produced it. Infinity is
    # used as the start value so any real score replaces it.
    min_score = float('inf')
    min_query = ""

    # Score every generated and manual query
    for i, query in enumerate(test_queries, start=1):
        try:
            score = predictor.predict_score(query)

            if score < min_score:
                min_score = score
                min_query = query

            print(f"Query {i}: {query}")
            print(f"Predicted Score: {score:.2f}\n")
        except Exception as e:
            # Best-effort demo loop: report the failure and keep going
            print(f"Error processing query: {query}")
            print(f"Error: {str(e)}\n")

    if min_query:
        print(f"Min Query: {min_query}")
        print(f"Min Score: {min_score:.2f}\n")
    else:
        print("No query produced a score.\n")

Extracted car info: {'Year': 2015, 'Model': None, 'Mileage': 150000, 'Price': 48000, 'Condition': 'New', 'Dealer': 'Honda', 'Monthly Payment': None, 'Accidents': 0, 'Owners': 1, 'Usage': 0}
Query 1: Dealer: Star Motor Sales. 2015 Honda HR-V LX, 150,000 miles, $48,000, New, 1 owner, business use, clean history
Predicted Score: 52.90

Extracted car info: {'Year': 2011, 'Model': 'Audi Q7', 'Mileage': 150000, 'Price': 10000, 'Condition': 'New', 'Dealer': None, 'Monthly Payment': None, 'Accidents': 0, 'Owners': 1, 'Usage': 1}
Query 2: 2011 Audi Q7 3.0T Premium, 150,000 miles, $10,000, New condition
Predicted Score: 42.22

Extracted car info: {'Year': 2022, 'Model': 'Lexus Ls 500', 'Mileage': 30000, 'Price': 18000, 'Condition': 'Certified', 'Dealer': None, 'Monthly Payment': None, 'Accidents': 0, 'Owners': 1, 'Usage': 1}
Query 3: 2022 Lexus LS 500 Base, 30,000 miles, $18,000, Certified condition, one previous owner
Predicted Score: 39.70

Extracted car info: {'Year': 2006, 'Model': 'Audi Rs 