In [None]:
import pandas as pd
import numpy as np

#from utils.dataset import capture_screenshots_async
from utils.dataset import capture_screenshots_async

In [None]:
# Load the raw price-normalization sheet; the path is relative to the working directory.
data = pd.read_excel("./data/Price_Normalization_Dataset.xlsx")

In [None]:
dataset = await capture_screenshots_async(data)

cols_to_fix = ["Explictness","Step1_desired_unit", "Step1_desired_value"]

dataset[cols_to_fix] = dataset[cols_to_fix].where(
    pd.notna(dataset[cols_to_fix]),
    None
)


In [None]:
# Preview the first rows only instead of rendering the whole frame.
dataset.head()

In [None]:
# Persist the dataset for the inference cells below. The index is not written:
# otherwise it comes back as a spurious "Unnamed: 0" column when the file is
# re-read with pd.read_excel.
dataset.to_excel("./dataset/dataset.xlsx", index=False)

### Pydantic model and LLM inference

In [1]:
#pydantic models

#output

from pydantic import BaseModel
from typing import Optional, Literal, List


# One piece of evidence attached to a Step 1 answer.
class Evidence(BaseModel):
    # Where on the page the evidence was found; restricted to these four labels.
    # NOTE(review): "calculator box" uses a space while the other labels use
    # underscores — confirm this is intentional before relying on the value.
    source: Literal["price_label", "specification_table", "more_information","calculator box"]
    # The evidence text itself.
    text: str


# A numeric amount paired with its unit.
class Quantity(BaseModel):
    # Numeric amount.
    value: float
    # Unit name as free text; no set of allowed units is enforced here.
    unit: str


# Structured result of Step 1; used as the response schema for the chat model
# via `with_structured_output`.
class Step1Output(BaseModel):
    # Quantity the listed price applies to. None is accepted, but the field has
    # no default, so it must always be provided.
    priced_quantity: Optional[Quantity]
    # One of "direct", "indirect" or "none"; the system prompt states when
    # "indirect" and "none" apply.
    explicitness: Literal["direct", "indirect", "none"]
    # Confidence reported by the model; no value range is enforced here.
    confidence: float
    # Evidence items supporting the answer.
    evidence: List[Evidence]
    # Free-text notes.
    notes: str


In [None]:
# System prompt for Step 1: identify the quantity the listed price applies to.
# The millimeter-to-meter rule is spelled out as the single exception to the
# "no conversion / no calculation" rules so the two lists no longer contradict
# each other.
SYSTEM_PROMPT = """You are a pricing quantity inference engine.

Your task is to identify the quantity that the listed price applies to,
using ONLY explicit seller-visible information from the webpage.

Allowed evidence:
- Price labels or text directly adjacent to the price
- Specification tables
- Explicit product labels stating quantity or unit

Do NOT:
- use comments
- infer from typical product sizes
- assume standard lengths
- perform unit conversion (the only exception is millimeters to meters, see below)
- perform any other calculations
- infer or assume a unit when no unit is visible
- treat cart quantity selectors, add-to-cart counters, or default quantity values (e.g. "Quantity: 1") as a pricing unit.

Do:
- Unit must be in full form
- Convert millimeter to meters
- If unit is not visible directly adjacent to main price label set explicitness to "indirect"
- If no standard unit (length, weight, area, volume, pack size) is explicitly stated,
set explicitness to "none".

Return ONLY a JSON object that matches the provided schema.
Billing accuracy is required."""


# Text part of the user message; the screenshot is attached as a separate image block.
USER_PROMPT = 'screenshot : '

In [3]:
import os
from dotenv import load_dotenv

# Load variables from a .env file into the process environment.
# NOTE(review): presumably this is where the OpenAI API key comes from — confirm.
load_dotenv()

True

In [24]:
import base64
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_openai import ChatOpenAI
from openai import OpenAIError

# Initialize the chat model and wrap it so responses follow the Step1Output schema.
model = ChatOpenAI(model="gpt-4o-mini")
structure_model = model.with_structured_output(Step1Output)


# Sample screenshot for the smoke test below. Kept relative to the working
# directory, like the other data paths in this notebook, so the notebook does
# not depend on one user's home directory.
image_path = "./screenshots/case_0.png"


async def infer_from_webpage(image_path : str) -> Step1Output:
    """Run the Step 1 structured inference on a single screenshot.

    The image is read from disk, base64-encoded and sent to ``structure_model``
    together with ``SYSTEM_PROMPT`` and ``USER_PROMPT``.

    Args:
        image_path: Path of the screenshot file to analyse.

    Returns:
        The response of ``structure_model.ainvoke``.

    Raises:
        RuntimeError: On any failure. ``OpenAIError`` is wrapped with the
            message prefix "OpenAI API error:", every other exception
            (including a missing file) with "Inference failed:". The original
            exception is chained as ``__cause__``.
    """
    # Local import keeps this cell self-contained.
    import mimetypes

    try:
        # Read and encode first, so the file handle is closed before the
        # (potentially slow) network call is awaited.
        with open(image_path, "rb") as image_file:
            encoded_image = base64.b64encode(image_file.read()).decode("utf-8")

        # Derive the MIME type from the file extension; fall back to PNG when
        # the extension is unknown or does not denote an image.
        guessed_type, _ = mimetypes.guess_type(image_path)
        if guessed_type and guessed_type.startswith("image/"):
            mime_type = guessed_type
        else:
            mime_type = "image/png"

        human_msg = HumanMessage([
                {"type": "text", "text": USER_PROMPT},
                {
                    "type": "image",
                    "base64": encoded_image,
                    "mime_type": mime_type,
                },
            ])

        system_msg = SystemMessage(SYSTEM_PROMPT)

        response = await structure_model.ainvoke([system_msg,human_msg])

        return response

    except OpenAIError as e:
        raise RuntimeError(f"OpenAI API error: {e}") from e

    except Exception as e:
        raise RuntimeError(f"Inference failed: {e}") from e


In [None]:
# Smoke test: run infer_from_webpage on the sample screenshot and display the result.
response = await infer_from_webpage(image_path)
response

In [20]:
import asyncio
import pandas as pd
async def run_step1_on_dataset(
    df: pd.DataFrame,
    semaphore_limit: int = 5,
) -> pd.DataFrame:
    """Run ``infer_from_webpage`` on every row of ``df`` concurrently.

    Args:
        df: Frame with an "img_path" column holding the screenshot path of
            each row. The input frame is not modified.
        semaphore_limit: Maximum number of inferences in flight at once.

    Returns:
        A copy of ``df`` with an "Output" column. Each cell holds either the
        JSON dump of the inference result or a string starting with "ERROR: "
        when the inference for that row failed.

    Raises:
        ValueError: If ``semaphore_limit`` is smaller than 1.
    """
    # A limit of 0 would make every task wait on the semaphore forever.
    if semaphore_limit < 1:
        raise ValueError("semaphore_limit must be at least 1")

    df = df.copy()

    semaphore = asyncio.Semaphore(semaphore_limit)

    async def run_one(row) -> str:
        async with semaphore:
            try:
                output: Step1Output = await infer_from_webpage(
                    image_path=row["img_path"],
                )
                return output.model_dump_json()

            except Exception as e:
                # Store error as string (important for dataset audit)
                return f"ERROR: {str(e)}"

    # gather() returns results in task order, i.e. in row order.
    outputs = await asyncio.gather(
        *(run_one(row) for _, row in df.iterrows())
    )

    # Assign by position rather than by index label: with duplicate labels a
    # label-based write would overwrite every row sharing that label.
    # dtype=object keeps the column dtype stable, also for an empty frame.
    df["Output"] = pd.Series(outputs, index=df.index, dtype=object)

    return df

In [25]:
# test sample
#sample = dataset.iloc[[1],:]

dataset = pd.read_excel("./dataset/dataset.xlsx")

result_df = await run_step1_on_dataset(dataset[-2:])

In [26]:
# Save the trial-run results; the index is written too, so each row stays traceable to its dataset row.
result_df.to_excel('./result/result5.xlsx')

# Prompt improvement
1. Convert mm to m.
2. Include the calculator box / custom calculator table as an evidence source.
3. Make the 'none' case concrete: in some of these cases the output assumes a unit, which it should not.

## Evaluation

In [None]:
# later required

from langchain_core.prompts import PromptTemplate

# Text template for a user message that carries the extracted price and the
# product description ahead of the screenshot. It is built from adjacent string
# literals so the indentation of this source is not embedded in the prompt.
STEP1_TEXT_PROMPT = PromptTemplate(
    input_variables=["extracted_price", "product_description"],
    template=(
        "Extracted price: {extracted_price}\n"
        "Product description: {product_description}\n"
        "screenshot : "
    ),
)


In [None]:
# Render the template with dummy values to inspect the resulting prompt text.
STEP1_TEXT_PROMPT.format(extracted_price=5,product_description="hello")