## Install Deps

In [None]:
# Install pandas with the %conda magic.
# Earlier variant that also installed tqdm, kept for reference:
# %conda install pandas tqdm
# NOTE(review): pandas does not appear to be used by any active code in the
# cells reviewed — confirm it is still needed.
%conda install pandas

Channels:
 - defaults
Platform: linux-aarch64
Collecting package metadata (repodata.json): done
Solving environment: done


    current version: 25.3.0
    latest version: 25.3.1

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /workspaces/parallel-corpora2/.conda

  removed specs:
    - tqdm


The following packages will be REMOVED:

  tqdm-4.67.1-py312h42ac6d5_0



Downloading and Extracting Packages:

Preparing transaction: done
Verifying transaction: done
Executing transaction: done

Note: you may need to restart the kernel to use updated packages.


In [None]:
# Install the Google Cloud Vision client library.
# The %pip magic installs into the current conda env; pip is used as a
# workaround because google-cloud-vision is not available in conda.
# NOTE(review): the version is not pinned — consider pinning it for reproducibility.

%pip install google-cloud-vision

Collecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.67.1
Note: you may need to restart the kernel to use updated packages.


## Setup Paths

In [2]:
# Base name shared by the source PDF, the intermediate JSON directory and the
# generated Markdown file. Change this one value to process a different text.
ocr_text_name = 'astangahridaya_vol1'

# Local working tree: source PDF, downloaded OCR JSON, generated Markdown.
local_input = f"data/ocr/src/{ocr_text_name}/{ocr_text_name}.pdf"
local_inter = f"data/ocr/inter/{ocr_text_name}/"
local_output = f"data/ocr/out/{ocr_text_name}/{ocr_text_name}.md"

# Cloud Storage root, named once so the input and output URIs cannot drift apart.
gcs_root = "gs://iiith-ska-satamt/ocr"
gcs_input = f"{gcs_root}/src/{ocr_text_name}/{ocr_text_name}.pdf"
# Keep the trailing slash: the download step appends a wildcard directly to this value.
gcs_output = f"{gcs_root}/out/{ocr_text_name}/"

## Copy src file to GCS

In [22]:
# Upload the local source PDF to Cloud Storage. IPython substitutes the
# {local_input} and {gcs_input} variables before running the shell command.
!gcloud storage cp {local_input} {gcs_input}

Copying file://data/ocr/src/astangasangraha/astangasangraha.pdf to gs://iiith-ska-satamt/ocr/src/astangasangraha/astangasangraha.pdf
  Completed files 1/1 | 62.2MiB/62.2MiB | 386.2kiB/s                           

Average throughput: 535.9kiB/s


In [23]:
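# Alternative (disabled): run the OCR from the gcloud CLI instead of the Python client.
# NOTE: the source URI below omits the `ocr/` prefix that gcs_input uses — confirm the path before enabling.
# !gcloud ml vision detect-text-pdf gs://iiith-ska-satamt/src/{ocr_text_name}/{ocr_text_name}.pdf  gs://iiith-ska-satamt/ocr/out/{ocr_text_name}/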

## OCR the src file

In [None]:
from google.cloud import vision_v1 as vision
import time
import sys

def async_detect_document(gcs_source_uri, gcs_destination_uri,
                          batch_size=20, timeout_secs=900, poll_interval=2):
    """Run asynchronous document-text OCR on a PDF stored in Cloud Storage.

    Submits one AsyncAnnotateFileRequest with the DOCUMENT_TEXT_DETECTION
    feature, then polls the returned operation, printing a progress line,
    until it is done or the timeout expires.

    Args:
        gcs_source_uri: gs:// URI of the input PDF.
        gcs_destination_uri: gs:// URI passed as the output destination.
        batch_size: Value passed to OutputConfig.batch_size.
        timeout_secs: Maximum time to wait for the operation, in seconds
            (default 900 = 15 minutes).
        poll_interval: Seconds to sleep between completion checks.

    Returns:
        The value of operation.result() once the operation is done.

    Raises:
        TimeoutError: If the operation is still running after timeout_secs.
            This function does not cancel the operation; the message carries
            its name so it can be inspected later.
    """
    client = vision.ImageAnnotatorClient()

    feature = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)
    input_config = vision.InputConfig(
        gcs_source=vision.GcsSource(uri=gcs_source_uri),
        mime_type='application/pdf',
    )
    output_config = vision.OutputConfig(
        gcs_destination=vision.GcsDestination(uri=gcs_destination_uri),
        batch_size=batch_size,
    )

    async_request = vision.AsyncAnnotateFileRequest(
        features=[feature],
        input_config=input_config,
        output_config=output_config,
    )

    operation = client.async_batch_annotate_files(requests=[async_request])
    operation_name = operation.operation.name

    print(f"📤 Operation started: {operation_name}")

    # Monotonic clock: the elapsed time must not jump if the wall clock is adjusted.
    start = time.monotonic()

    while not operation.done():
        elapsed = time.monotonic() - start
        if elapsed > timeout_secs:
            print("\n⏰ Request timed out.")
            # Fail loudly instead of falling through to the "completed"
            # message and calling result() on an unfinished operation.
            raise TimeoutError(
                f"Operation {operation_name} did not complete within {timeout_secs}s"
            )
        sys.stdout.write(f"\r⏳ Waiting for operation to complete... {elapsed:.2f}s elapsed")
        sys.stdout.flush()
        time.sleep(poll_interval)

    print("\n✅ Operation completed.")
    return operation.result()


# Start OCR for the configured PDF; as the last expression in the cell, the
# returned response is shown as the cell output.
async_detect_document(gcs_input, gcs_output)

📤 Operation started: projects/iiith-docs-ocr/operations/f494bdf786b2d2b7
⏳ Waiting for operation to complete... 115.96s elapsed
✅ Operation completed.


responses {
  output_config {
    gcs_destination {
      uri: "gs://iiith-ska-satamt/ocr/out/astangasangraha/"
    }
    batch_size: 1
  }
}

## Download OCR results

In [25]:
# Create the local intermediate directory, then copy the objects matching the
# OCR output prefix into it. The pattern is single-quoted so the shell passes
# the '**' wildcard through to gcloud unchanged.
!mkdir -p {local_inter}
!gcloud storage cp -r '{gcs_output}**' {local_inter}

Copying gs://iiith-ska-satamt/ocr/out/astangasangraha/output-1-to-1.json to file://data/ocr/inter/astangasangraha/output-1-to-1.json
Copying gs://iiith-ska-satamt/ocr/out/astangasangraha/output-10-to-10.json to file://data/ocr/inter/astangasangraha/output-10-to-10.json
Copying gs://iiith-ska-satamt/ocr/out/astangasangraha/output-100-to-100.json to file://data/ocr/inter/astangasangraha/output-100-to-100.json
Copying gs://iiith-ska-satamt/ocr/out/astangasangraha/output-101-to-101.json to file://data/ocr/inter/astangasangraha/output-101-to-101.json
Copying gs://iiith-ska-satamt/ocr/out/astangasangraha/output-102-to-102.json to file://data/ocr/inter/astangasangraha/output-102-to-102.json
Copying gs://iiith-ska-satamt/ocr/out/astangasangraha/output-103-to-103.json to file://data/ocr/inter/astangasangraha/output-103-to-103.json
Copying gs://iiith-ska-satamt/ocr/out/astangasangraha/output-104-to-104.json to file://data/ocr/inter/astangasangraha/output-104-to-104.json
Copying gs://iiith-ska-sa

## Convert results to MD

In [3]:
# prompt: read this file gcv_test_sample1_outputoutput-1-to-20.json
# read 'responses' field. for each entry in responses
# get fullTextAnnotation.text, context.uri, context.pageNumber
# write to a markdown file with the following format:
# ## Page: <page_number>
# ```
# <fullTextAnnotation.text>
# ```
# and separate each page with a line
# ---

import os
import json
import re
from datetime import datetime


def natural_key(s):
    """Build a sort key that orders embedded digit runs numerically.

    The string is split on runs of digits; digit pieces become ints and all
    other pieces stay strings, e.g. "output-10-to-20.json" becomes
    ['output-', 10, '-to-', 20, '.json'].
    """
    key = []
    for chunk in re.split(r'(\d+)', s):
        if chunk.isdigit():
            key.append(int(chunk))
        else:
            key.append(chunk)
    return key

def process_json_file(filepath):
    """Extract per-page OCR text from one OCR output JSON file.

    Args:
        filepath: Path to a JSON file whose top-level object carries a
            'responses' list.

    Returns:
        A list with one dict per response, each with the keys 'text'
        (fullTextAnnotation.text), 'uri' (context.uri) and 'page_number'
        (context.pageNumber); missing fields become ''. Returns None when the
        file is missing, is not valid JSON, or has no 'responses' entries.
        Problems are reported with print() rather than raised.
    """
    try:
        # JSON is UTF-8 by specification; decode explicitly instead of relying
        # on the platform's default encoding.
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"Error: File not found at {filepath}")
        return None
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in {filepath}")
        return None

    # A top-level value that is not an object has no 'responses' field; treat
    # it like an empty one instead of failing on .get().
    responses = data.get('responses', []) if isinstance(data, dict) else []
    if not responses:
        print("Warning: 'responses' field not found or empty in JSON data.")
        return None

    extracted_data = []
    for response in responses:
        # `or {}` also covers keys that are present but null in the JSON.
        full_text_annotation = response.get('fullTextAnnotation') or {}
        context = response.get('context') or {}
        extracted_data.append({
            'text': full_text_annotation.get('text', ''),
            'uri': context.get('uri', ''),
            'page_number': context.get('pageNumber', ''),
        })

    return extracted_data

def process_and_write_markdown(src_dir, output_file):
    """Merge every OCR JSON file in src_dir into a single Markdown document.

    The output starts with a title line and a '---'-delimited metadata block,
    followed by one "## Page: <n>" section per page with the page text inside
    a fenced block and a '---' separator after it. Pages whose text is empty
    are skipped with a warning, and files that cannot be processed are
    reported and skipped.

    The metadata block is filled from the notebook-level variables
    ocr_text_name, local_input, local_output, local_inter, gcs_output and
    gcs_input, so those must be defined before this function is called.

    Args:
        src_dir: Directory containing the '*.json' OCR result files.
        output_file: Path of the Markdown file to write (overwritten).
    """
    # Natural order so that numbered files sort by page number, not as text.
    files = sorted(
        (name for name in os.listdir(src_dir) if name.endswith(".json")),
        key=natural_key,
    )

    timestamp = datetime.now().isoformat()

    # dirname is '' when output_file has no directory part, and
    # os.makedirs('') raises, so only create a directory when there is one.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Write UTF-8 explicitly so non-ASCII text does not depend on the
    # platform's default encoding.
    with open(output_file, 'w', encoding='utf-8') as md_file:
        md_file.write(f"# {ocr_text_name} \n")

        # Metadata block
        md_file.write("---\n")
        md_file.write(f"ocr_text_name: {ocr_text_name}\n")
        md_file.write(f"local_input: {local_input}\n")
        md_file.write(f"local_output: {local_output}\n")
        md_file.write(f"local_inter: {local_inter}\n")
        md_file.write(f"gcs_output: {gcs_output}\n")
        md_file.write(f"gcs_input: {gcs_input}\n")
        md_file.write(f"ocr_timestamp: {timestamp}\n")
        md_file.write("---\n")

        for filename in files:
            data = process_json_file(os.path.join(src_dir, filename))
            if data is None:
                print(f"Failed to process file: {filename}")
                continue

            print(f"Processing file: {filename}")

            for entry in data:
                page_number = entry.get('page_number')
                text = entry.get('text')

                if not text:
                    print(f"Warning: 'text' field is empty in entry from {filename} on page {page_number}.")
                    continue

                md_file.write(f"\n## Page: {page_number}\n\n")
                md_file.write(f"```\n{text}\n```")
                md_file.write("\n---\n")

# Convert the downloaded OCR JSON files in local_inter into the Markdown file
# at local_output.
process_and_write_markdown(local_inter, local_output)



Processing file: output-1-to-20.json
Processing file: output-21-to-40.json
Processing file: output-41-to-60.json
Processing file: output-61-to-80.json
Processing file: output-81-to-100.json
Processing file: output-101-to-120.json
Processing file: output-121-to-140.json
Processing file: output-141-to-160.json
Processing file: output-161-to-180.json
Processing file: output-181-to-200.json
Processing file: output-201-to-220.json
Processing file: output-221-to-240.json
Processing file: output-241-to-260.json
Processing file: output-261-to-280.json
Processing file: output-281-to-300.json
Processing file: output-301-to-320.json
Processing file: output-321-to-340.json
Processing file: output-341-to-360.json
Processing file: output-361-to-380.json
Processing file: output-381-to-400.json
Processing file: output-401-to-420.json
Processing file: output-421-to-440.json
Processing file: output-441-to-460.json
Processing file: output-461-to-480.json
Processing file: output-481-to-500.json
Processing