In [0]:
%sh 
# Install poppler-utils, the system package providing the PDF rendering tools
# that the pdf2image import later in this notebook depends on.
# apt-get is used for both steps: plain `apt` warns that its command-line
# interface is not stable for scripted use.
apt-get update
apt-get install -y poppler-utils

In [0]:
%pip install -r requirements.txt
# Restart the Python process after the install above so that later cells import
# the freshly installed package versions.
%restart_python

In [0]:
# Text widget "config" (label "Configuration File", default "nuextract_config"):
# the name, without the .yaml extension, of the file under ../configs/ that a
# later cell opens.
dbutils.widgets.text("config", "nuextract_config", "Configuration File")

In [0]:
import yaml
import os

# Name of the YAML config file (without extension), read from the "config" widget.
config_path = dbutils.widgets.get("config")

with open(f"../configs/{config_path}.yaml", "r") as f:
    # An empty YAML file parses to None; fall back to {} so the check below
    # reports the missing keys instead of failing with an AttributeError.
    config = yaml.safe_load(f) or {}

# Each of these keys is interpolated into a filesystem path below. Without this
# check a missing key would silently produce a path containing the text "None".
_path_keys = (
    "catalog_name",
    "schema_name",
    "volume_name",
    "volume_folder",
    "model_name",
    "revision",
)
_missing_keys = [key for key in _path_keys if config.get(key) is None]
if _missing_keys:
    raise ValueError(
        f"Config '{config_path}.yaml' is missing required keys: {_missing_keys}"
    )

catalog_name = config.get("catalog_name")
schema_name = config.get("schema_name")
volume_name = config.get("volume_name")
volume_folder = config.get("volume_folder")
model_name = config.get("model_name")
revision = config.get("revision")
# Optional settings: these stay None when absent from the config.
secret_scope = config.get("secret_scope")
secret_key = config.get("secret_key")
quantization_type = config.get("quantization_type")

# Model files on the Unity Catalog volume, and the local-disk location they are copied to.
cache_volume =  f"/Volumes/{catalog_name}/{schema_name}/{volume_name}/{model_name}/{revision}/{volume_folder}"
cache_hf = "/local_disk0/hf_cache"
cache_local = f"/local_disk0/{volume_folder}" 

# Point the Hugging Face caches at local disk and raise the hub download timeout.
os.environ["HF_HOME"] = cache_hf
os.environ["HF_HUB_CACHE"] = cache_hf
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "True"
os.environ["HF_HUB_DOWNLOAD_TIMEOUT"] = "1000"
# os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = '1'  # Enables optimized download backend

print(f"Loaded Config: {config}")

In [0]:
import requests
from io import BytesIO
from pdf2image import convert_from_bytes

# Sample document: a PDF fetched from arXiv.
url = "https://arxiv.org/pdf/2502.13923"
# requests applies no timeout by default, so bound the call to keep the cell
# from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=60)
# Fail here on an HTTP error instead of handing an error page to the PDF renderer.
response.raise_for_status()
pdf_bytes = response.content

# NOTE: Use files within the Volumes directory by reading with normal python file I/O
# with open("/Volumes/users/will_smith/test_pdfs/2502.13923v1.pdf", "rb") as f:
#     pdf_bytes = f.read()

# Render the PDF bytes to images; the first one is displayed and used for extraction below.
pil_images = convert_from_bytes(pdf_bytes)

### Preview the image that we are going to extract from

In [0]:
# Show the first rendered image (index 0) as this cell's output.
pil_images[0]

# Inference: Generation of the output


In [0]:
import shutil
import os

# Copy the model files from the volume to local disk unless a local copy already exists.
if not os.path.exists(cache_local):
    print(f"Loading model from {cache_volume} to {cache_local}.")
    try:
        # copytree creates cache_local together with any missing parent directories.
        shutil.copytree(cache_volume, cache_local)
    except BaseException as e:
        # BaseException also covers an interrupted cell. A partial copy must not
        # survive: the existence check above would treat it as a complete model
        # on the next run.
        print(f"Error: {e}")
        shutil.rmtree(cache_local, ignore_errors=True)
        # Re-raise so the notebook stops here rather than failing later at model load.
        raise
    print(f"Successfully loaded model from {cache_volume} to {cache_local}!")
else:
    print(f"File already exists locally at {cache_local}")

In [0]:
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq

# The processor and the model weights are both read from the local copy at cache_local.
processor = AutoProcessor.from_pretrained(cache_local)

# Load in bfloat16; device_map="auto" leaves device placement to the loader.
_model_load_options = {"torch_dtype": torch.bfloat16, "device_map": "auto"}
model = AutoModelForVision2Seq.from_pretrained(cache_local, **_model_load_options)

In [0]:
%pip install qwen-vl-utils[decord]==0.0.8
# NOTE(review): this pinned install runs after the model has been loaded and is
# not followed by a Python restart. It is presumably required by the
# inference_functions import in a later cell; confirm, and consider moving the
# pin into requirements.txt.

In [0]:
# Extraction template (JSON text) passed to the chat template via `template=`.
template = '{"extracted_text": "verbatim-string"}'

# A single user turn whose only content item is the first page image.
document = dict(type="image", image=pil_images[0])
messages = [dict(role="user", content=[document])]

# Render the prompt as a string (tokenize=False) with the generation prompt appended.
_chat_template_args = dict(
    template=template,
    tokenize=False,
    add_generation_prompt=True,
)
text = processor.tokenizer.apply_chat_template(messages, **_chat_template_args)

In [0]:

from inference_functions import process_all_vision_info_for_nuextract

# Project helper that gathers the image inputs referenced by `messages`.
image_inputs = process_all_vision_info_for_nuextract(messages)

# Build the model inputs as PyTorch tensors and move them to the device the
# model was placed on. The model is loaded with device_map="auto", so a
# hard-coded "cuda" could disagree with where the model actually is.
inputs = processor(
    text=[text],
    images=image_inputs,
    padding=True,
    return_tensors="pt",
).to(model.device)

In [0]:

# Decoding settings: sampling off, a single beam, at most 2048 new tokens.
generation_config = dict(do_sample=False, num_beams=1, max_new_tokens=2048)

# Inference: Generation of the output
generated_ids = model.generate(**inputs, **generation_config)

# Each output sequence starts with its prompt tokens; keep only what follows them.
generated_ids_trimmed = [
    full_ids[len(prompt_ids) :]
    for prompt_ids, full_ids in zip(inputs.input_ids, generated_ids)
]

In [0]:
# Decode the trimmed token ids to text, dropping special tokens and leaving
# tokenization spaces untouched.
_decode_options = dict(skip_special_tokens=True, clean_up_tokenization_spaces=False)
output_text = processor.batch_decode(generated_ids_trimmed, **_decode_options)
print(output_text)

## Clear out the model to free up VRAM 

In [0]:
# Restart the Python process; per the markdown note above, this is done to free
# the VRAM held by the model.
%restart_python