## Installing Dependencies

In [1]:
!pip install pdf2image

Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pdf2image
Successfully installed pdf2image-1.17.0


In [2]:
!apt install poppler-utils

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 29 not upgraded.
Need to get 186 kB of archives.
After this operation, 696 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.6 [186 kB]
Fetched 186 kB in 1s (266 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 124947 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.6_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.6) ...
Setting up poppler-utils (22.02.0-2ubuntu0.6) ...
Processing triggers for man-db (2.10.2-1) ...


## Global Variables

In [3]:
import json
from google import genai
from pdf2image import convert_from_path
import PIL
import pandas as pd

In [4]:
from google.colab import userdata

api_key = userdata.get('GOOGLE_API_KEY')

In [5]:
client = genai.Client(api_key=api_key)

## Utility Functions

In [6]:
def convert_str_to_json(str):
  return json.loads(str.replace("```json", "").replace("```", ""))

def save_json(data, path):
  with open(path, 'w') as f:
    json.dump(data, f)

  return path

In [7]:
def json_to_table(data):
  df = pd.DataFrame(data)
  return df

## Prompts

In [8]:
extraction_prompt = """
You are given an image of an invoice. Extract as much information as you can infer from the invoice and return the content of the invoice in the following json format:

{
  "table_content": [
    {
      <column_name>: "<column_value>",
      ...
    }
  ],
  "buyer_details" : {
    <buyer_detail_name>: "<buyer_detail_value>",
    ...
  },
  "seller_details" : {
    <seller_detail_name>: "<seller_detail_value>",
    ...
  },
  "invoice_details" : {
    <invoice_detail_name>: "<invoice_detail_value>",
    ...
  }
}
"""

In [60]:
validation_prompt = """
You are given an image of an invoice and the details extracted from it. Evaluate the extraction quality based on the following criteria:
1) Word Error Rate (WER)
2) Character Error Rate (CER)

**Step 1:** Compute WER using Python code internally:
   WER = (Number of word errors) / (Total words in reference text)

**Step 2:** Compute CER using Python code internally:
   CER = (Number of character errors) / (Total characters in reference text)

Do not output any explanations, intermediate calculations, or code. **Only output WER and CER as two numerical values in the json format:**
WER: <value>
CER: <value>


Extracted Invoice Details: {}
"""

## Data Extraction

In [48]:
class DataExtractor:
  def __init__(self, file_save_path="/content/"):
    self.file_save_path = file_save_path

  def __call__(self, path, prompt):
    file_type = self._check_file_type(path)

    if file_type == 'pdf':
      image_path = self._convert_pdf_to_images(path, self.file_save_path)

    else:
      image_path = path

    img = PIL.Image.open(image_path)

    response = client.models.generate_content(
        model = "gemini-2.0-flash",
        contents=[prompt, img],
        )

    return response.text

  def _check_file_type(self, path):
    if path.endswith('.pdf'):
      return 'pdf'

    elif path.endswith('.jpg') or path.endswith('.png'):
      return 'image'

    else:
      raise ValueError('Unsupported file type')

  def _convert_pdf_to_images(self, pdf_path, image_path, return_page=0):
    pdf = convert_from_path(pdf_path)

    for i, page in enumerate(pdf):
      page.save(f'{image_path}/page_{i}.png', 'PNG')

    return f'{image_path}/page_{return_page}.png'


agent = DataExtractor()

In [49]:
data = agent(
    path = "/content/drive/MyDrive/Bryckel AI/invoices.pdf",
    prompt = extraction_prompt
    )

refined = convert_str_to_json(data)

In [50]:
display(refined)

save_json(refined, "/content/invoice_data.json")

{'table_content': [{'Description': '[1356578] WLY602040 400 mAh 3.7V single cell Rechargeable LiPo Battery',
   'HSN': '85076000',
   'Rate': '160.170000',
   'Qty': '4.00',
   'Disc': '0.00',
   'Amount': '640.68',
   'IGST': '115.32 (18.0%)',
   'Total': '756.00'},
  {'Description': '[44255] 3PI miniQ Car wheel Tyre 44mm N20 DC Gear Motor Wheel',
   'HSN': '84799090',
   'Rate': '43.220000',
   'Qty': '4.00',
   'Disc': '0.00',
   'Amount': '172.88',
   'IGST': '31.12 (18.0%)',
   'Total': '204.00'},
  {'Description': '[476675] N20 6V 150RPM Micro Metal Gear Motor With Encoder',
   'HSN': '85013111',
   'Rate': '397.457500',
   'Qty': '8.00',
   'Disc': '0.00',
   'Amount': '3.179.66',
   'IGST': '572.34 (18.0%)',
   'Total': '3,752)'},
  {'Description': '[std_shipping] STANDARD SHIPPING',
   'HSN': '996819',
   'Rate': '0.000000',
   'Qty': '1.00',
   'Disc': '0.00',
   'Amount': '0.00',
   'IGST': '0.00 (18.0%)',
   'Total': '0.00'}],
 'buyer_details': {'Customer_Name': 'Vansh Sach

'/content/invoice_data.json'

In [51]:
json_to_table(refined['table_content'])

Unnamed: 0,Description,HSN,Rate,Qty,Disc,Amount,IGST,Total
0,[1356578] WLY602040 400 mAh 3.7V single cell R...,85076000,160.17,4.0,0.0,640.68,115.32 (18.0%),756.00
1,[44255] 3PI miniQ Car wheel Tyre 44mm N20 DC G...,84799090,43.22,4.0,0.0,172.88,31.12 (18.0%),204.00
2,[476675] N20 6V 150RPM Micro Metal Gear Motor ...,85013111,397.4575,8.0,0.0,3.179.66,572.34 (18.0%),"3,752)"
3,[std_shipping] STANDARD SHIPPING,996819,0.0,1.0,0.0,0.00,0.00 (18.0%),0.00


## Confidence Score Calculation

In [86]:
weights = {
    "WER": 0.8,
    "CER": 0.3
}

def calculate_score(scores):
  weighted_score = (scores['WER'] * weights['WER']) + (scores['CER'] * weights['CER'])
  return 1 - weighted_score / 2

In [87]:
scores = agent(
    path = "/content/page_0.png",
    prompt = validation_prompt.format(data)
    )

scores = convert_str_to_json(scores)

In [89]:
scores

{'WER': 0.016632495821727024, 'CER': 0.003264462809917355}

In [88]:
calculate_score(scores)

0.9928573322498215