### Importing Libraries

In [1]:
import os
import requests
import torch
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor

### Hugging Face login

In [None]:
from huggingface_hub import login
login()

### Loading the Model and Tokenizer

In [2]:
# Initialize processor and model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_id = "meta-llama/Llama-3.2-11B-Vision"

model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_id)

The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

### Utilities

In [19]:
# Function for OCR
def ocr_image(src_img):
    # Get the prediction from the OCR model
    prompt = "<|image|>Perform OCR on the image and extract the text string"
    inputs = processor(src_img, prompt, return_tensors="pt").to(device)

    # Generate output
    output_ids = model.generate(**inputs,max_new_tokens=50)
    output_text = processor.decode(output_ids[0], skip_special_tokens=True)
    
    return output_text.strip()

### Loading the cluttering images

In [None]:
parent_dir = '../sentence_images/cluttered'

# looping through folders
folder_list = os.listdir(parent_dir)

### Inferencing the model

In [None]:
page, idx, sentence = [], [], []

for folder in folder_list :
    files = os.listdir(os.path.join(parent_dir, folder))

    img_files = [f for f in files if os.path.isfile(os.path.join(parent_dir, folder, f))]
    for img in files:
        page.append(folder)
        idx.append(img[:-4])

        full_image = Image.open(os.path.join(parent_dir, folder, img)).convert("RGB")
        text = ocr_image(full_image)
        sentence.append(text)

data = {
        'page' : page,
        'index' : idx,
        'sentence' : sentence
    }

df = pd.DataFrame(data)
df.to_csv("Llama_3_2_11B_vision_ocr.csv", mode='a', header=False, index=False)

### Inferencing for the images

In [None]:
path = '../sentence_images_/cluttered_/64/6.png'
full_image = Image.open(path).convert("RGB")
text = ocr_image(full_image)
print(text)

Perform OCR on the image and extract the text string. <OCR/> to return his rammer, to present, prese nt, to to return his rammer, to present, to to to to to to to to to to to to to to to to to to to to to to to


: 