In [1]:
import os
from dotenv import load_dotenv
from typing import List, Optional
from openai import OpenAI
from datasets import load_dataset
from datetime import date, time, datetime, timedelta
from dataclasses import dataclass

In [2]:
import json
from transformers import AutoModelForCausalLM

In [2]:
!pip install openai

Defaulting to user installation because normal site-packages is not writeable
Collecting openai
  Obtaining dependency information for openai from https://files.pythonhosted.org/packages/24/cf/1fbe3eaf1d90796722c3dcd268e103a6a0918f25ec51138c7d27aec7f001/openai-1.55.1-py3-none-any.whl.metadata
  Downloading openai-1.55.1-py3-none-any.whl.metadata (24 kB)
Collecting distro<2,>=1.7.0 (from openai)
  Obtaining dependency information for distro<2,>=1.7.0 from https://files.pythonhosted.org/packages/12/b3/231ffd4ab1fc9d679809f356cebee130ac7daa00d6d6f3206dd4fd137e9e/distro-1.9.0-py3-none-any.whl.metadata
  Downloading distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Obtaining dependency information for httpx<1,>=0.23.0 from https://files.pythonhosted.org/packages/56/95/9377bcb415797e44274b51d46e3249eba641711cf3348050f76ee7b15ffc/httpx-0.27.2-py3-none-any.whl.metadata
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting jiter<1,>=0

In [3]:
from datasets import load_dataset

ds = load_dataset("naver-clova-ix/cord-v1")

In [7]:
print(ds)

DatasetDict({
    train: Dataset({
        features: ['image', 'ground_truth'],
        num_rows: 800
    })
    validation: Dataset({
        features: ['image', 'ground_truth'],
        num_rows: 100
    })
    test: Dataset({
        features: ['image', 'ground_truth'],
        num_rows: 100
    })
})


In [18]:
print(ds['train'][0]['ground_truth'])

{"gt_parse": {"menu": [{"nm": "Nasi Campur Bali", "cnt": "x 1", "price": "75,000"}, {"nm": "Bbk Bengil Nasi", "cnt": "x 1", "price": "125,000"}, {"nm": "MilkShake Starwb", "cnt": "x 1", "price": "37,000"}, {"nm": "Ice Lemon Tea", "cnt": "x 1", "price": "24,000"}, {"nm": "Nasi Ayam Dewata", "cnt": "x 1", "price": "70,000"}, {"nm": "Free Ice Tea", "cnt": "x 3", "price": "0"}, {"nm": "Organic Green Sa", "cnt": "x 1", "price": "65,000"}, {"nm": "Ice Tea", "cnt": "x 1", "price": "18,000"}, {"nm": "Ice Orange", "cnt": "x 1", "price": "29,000"}, {"nm": "Ayam Suir Bali", "cnt": "x 1", "price": "85,000"}, {"nm": "Tahu Goreng", "cnt": "x 2", "price": "36,000"}, {"nm": "Tempe Goreng", "cnt": "x 2", "price": "36,000"}, {"nm": "Tahu Telor Asin", "cnt": "x 1", "price": "40,000."}, {"nm": "Nasi Goreng Samb", "cnt": "x 1", "price": "70,000"}, {"nm": "Bbk Panggang Sam", "cnt": "x 3", "price": "366,000"}, {"nm": "Ayam Sambal Hija", "cnt": "x 1", "price": "92,000"}, {"nm": "Hot Tea", "cnt": "x 2", "price

In [11]:
print(ds['train']['ground_truth'][0])

{


In [4]:
load_dotenv('vars.env', override=True)
BASE_URL = os.getenv("CS4973_BASE_URL")
API_KEY = os.getenv("CS4973_API_KEY")

client = OpenAI(base_url=BASE_URL, api_key=API_KEY)

In [21]:
from huggingface_hub import login


# Option 2: Directly pass your token (replace 'YOUR_TOKEN' with your actual token)
login(token='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')

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /u/nshiroglazova/.cache/huggingface/token
Login successful


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Move model to GPU
model.to(device)

In [27]:
import json

def extract_relevant_info(ground_truth):
    """
    Extract relevant information from receipt's ground truth.
    """
    # Parse ground_truth if it's a string
    if isinstance(ground_truth, str):
        ground_truth = json.loads(ground_truth)
    
    # Extract relevant fields
    gt_parse = ground_truth.get("gt_parse", {})
    menu_items = gt_parse.get("menu", [])
    subtotal = gt_parse.get("sub_total", {})
    total = gt_parse.get("total", {})

    # Extract items, quantities, and prices
    items_info = []
    for item in menu_items:
        name = item.get("nm", "Unknown Item")
        quantity = item.get("cnt", "Unknown Quantity")
        price = item.get("price", "Unknown Price")
        items_info.append(f"Item: {name}, Quantity: {quantity}, Price: {price}")

    # Extract subtotal and total
    subtotal_price = subtotal.get("subtotal_price", "Unknown")
    tax_price = subtotal.get("tax_price", "Unknown")
    service_price = subtotal.get("service_price", "Unknown")
    total_price = total.get("total_price", "Unknown")

    # Format the information into a single string
    info = "\n".join(items_info)
    info += f"\nSubtotal: {subtotal_price}, Tax: {tax_price}, Service Charge: {service_price}, Total: {total_price}"
    return info


In [15]:
def generate_single_qa_pair(receipt_info, client, previous_questions, max_content_length=1500):
    """
    Generate a single unique question-answer pair for receipt information.
    """
    # Trim the content if it exceeds the maximum length
    trimmed_content = receipt_info[:max_content_length]

    # Construct the prompt including previous questions to avoid duplication
    previous_questions_text = "\n".join(f"- {q}" for q in previous_questions)
    prompt = (
        f"You are an assistant specializing in analyzing receipts and generating insightful questions and accurate answers about them. "
        f"Here is information extracted from a receipt:\n{trimmed_content}\n\n"
        "### Guidelines for QA Generation\n"
        "- Create a question that is relevant to the provided receipt information.\n"
        "- Questions can reflect common customer concerns, such as pricing, quantity discrepancies, tax calculations, or general inquiries about the receipt.\n"
        "- Include a mix of question types across multiple queries (e.g., direct factual questions, reasoning-based questions, clarifications, or summary questions).\n"
        "- Ensure the answer is accurate and aligns with the receipt data provided.\n"
        "- Avoid duplication of previous questions when multiple questions are generated for the same receipt.\n\n"
        "### Previous Questions (do not duplicate these):\n"
        f"{previous_questions_text}\n\n"
        "### Generate exactly **one** question-answer pair in the following format:\n"
        "**Q:** [Generated question based on the receipt information]\n"
        "**A:** [Accurate answer derived from or inferred based on the receipt information]\n"
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Generate response on GPU
    outputs = model.generate(
        **inputs,
        max_length=512,
        num_return_sequences=1,
        temperature=0.7
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True).strip()


In [16]:
dataset = load_dataset("naver-clova-ix/cord-v1")


qa_pairs = []
for example in dataset['train']:
    ground_truth = example['ground_truth']
    image_name = example['image']  # Assuming the image name is stored here
    print("Ran 1")
    # Extract receipt information
    receipt_info = extract_relevant_info(ground_truth)
    print("Ran 2")
    if not receipt_info.strip():  # Skip empty or invalid content
        continue

    previous_questions = []  # Keep track of the last 3 questions generated

    # Generate 2 question-answer pairs for each receipt
    for _ in range(2):
        print("Ran 3")
        qa_text = generate_single_qa_pair(receipt_info, client, previous_questions)

        if 'A:' in qa_text:
            question, answer = qa_text.split('A:', 1)
            question = question.strip()
            answer = answer.strip()
            print(question)

            qa_pairs.append({
                "question": question,
                "answer": answer,
                "receipt_info": receipt_info,
                "image_name": image_name  # Include the image name
            })

            # Update the list of previous questions (limit to the last 3)
            previous_questions.append(question)
            if len(previous_questions) > 3:
                previous_questions.pop(0)

Ran 1
Ran 2
Ran 3
generating
Item: Nasi Campur Bali, Quantity: x 1, Price: 75,000
Item: Bbk Bengil Nasi, Quantity: x 1, Price: 125,000
Item: MilkShake Starwb, Quantity: x 1, Price: 37,000
Item: Ice Lemon Tea, Quantity: x 1, Price: 24,000
Item: Nasi Ayam Dewata, Quantity: x 1, Price: 70,000
Item: Free Ice Tea, Quantity: x 3, Price: 0
Item: Organic Green Sa, Quantity: x 1, Price: 65,000
Item: Ice Tea, Quantity: x 1, Price: 18,000
Item: Ice Orange, Quantity: x 1, Price: 29,000
Item: Ayam Suir Bali, Quantity: x 1, Price: 85,000
Item: Tahu Goreng, Quantity: x 2, Price: 36,000
Item: Tempe Goreng, Quantity: x 2, Price: 36,000
Item: Tahu Telor Asin, Quantity: x 1, Price: 40,000.
Item: Nasi Goreng Samb, Quantity: x 1, Price: 70,000
Item: Bbk Panggang Sam, Quantity: x 3, Price: 366,000
Item: Ayam Sambal Hija, Quantity: x 1, Price: 92,000
Item: Hot Tea, Quantity: x 2, Price: 44,000
Item: Ice Kopi, Quantity: x 1, Price: 32,000
Item: Tahu Telor Asin, Quantity: x 1, Price: 40,000
Item: Free Ice Tea,

KeyboardInterrupt: 

In [None]:
output_path = 'qa_dataset.json'
with open(output_path, 'w') as output_file:
    json.dump(qa_pairs, output_file, indent=4)


print(f"QA dataset saved at: {output_path}")
