In [34]:
import random

from transformers import LayoutLMv3ForTokenClassification, LayoutLMv3TokenizerFast
import torch
from PIL import Image
import requests
import pandas as pd
import numpy as np

if torch.cuda.is_available():
    device = torch.device("cpu")  # Use the first GPU
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")  # Fallback to CPU
    print("Using CPU")

Using GPU: NVIDIA GeForce RTX 4070 SUPER


In [35]:
df = pd.read_csv('pic6_df.csv')
df

Unnamed: 0,txt,confidence,vertices,boundBox,slopes,width,height,direction,mid_point,crop_idx,Left,Top
0,ated,100,"[(0, 187), (74, 194), (71, 226), (0, 219)]","{'Width': 74, 'Height': 39, 'Left': 0, 'Top': ...","(0.034, -3.832)",32.140317,74.330344,horizontal,"(36.25, 206.5)","(0, 0)",0,187
1,808.3,100,"[(184, 1168), (242, 1169), (242, 1183), (184, ...","{'Width': 58, 'Height': 15, 'Left': 184, 'Top'...","(0.006, -300.0)",14.000000,58.008620,horizontal,"(213.0, 1175.5)","(0, 0)",184,1168
2,BUC,100,"[(185, 1193), (218, 1194), (218, 1207), (185, ...","{'Width': 33, 'Height': 14, 'Left': 185, 'Top'...","(0.011, -300.0)",13.000000,33.015148,horizontal,"(201.5, 1200.0)","(0, 0)",185,1193
3,A,100,"[(219, 349), (219, 384), (187, 384), (187, 349)]","{'Width': 32, 'Height': 35, 'Left': 187, 'Top'...","(-300.0, -0.0)",32.000000,35.000000,vertical,"(203.0, 366.5)","(0, 0)",187,349
4,Writer's,100,"[(219, 398), (221, 557), (189, 557), (187, 398)]","{'Width': 34, 'Height': 159, 'Left': 187, 'Top...","(28.558, -0.0)",32.000000,159.012578,vertical,"(204.0, 477.5)","(0, 0)",187,398
...,...,...,...,...,...,...,...,...,...,...,...,...
421,POETRY,100,"[(3871, 560), (3859, 638), (3839, 635), (3851,...","{'Width': 32, 'Height': 81, 'Left': 3839, 'Top...","(-2.335, 0.054)",20.223748,78.917679,vertical,"(3855.0, 597.5)","(0, 0)",3839,557
422,OF,100,"[(3878, 518), (3874, 543), (3853, 540), (3857,...","{'Width': 25, 'Height': 28, 'Left': 3853, 'Top...","(-2.245, 0.051)",21.213203,25.317978,vertical,"(3865.5, 529.0)","(0, 0)",3853,515
423,BOOK,100,"[(3888, 451), (3880, 503), (3860, 500), (3868,...","{'Width': 28, 'Height': 55, 'Left': 3860, 'Top...","(-2.335, 0.054)",20.223748,52.611786,vertical,"(3874.0, 475.5)","(0, 0)",3860,448
424,THE,100,"[(3896, 396), (3890, 436), (3870, 432), (3876,...","{'Width': 26, 'Height': 43, 'Left': 3870, 'Top...","(-2.395, 0.072)",20.396078,40.447497,vertical,"(3883.0, 414.25)","(0, 0)",3870,393


In [36]:
# Load pre-trained LayoutLMv3 model and tokenizer
model_name = "microsoft/layoutlmv3-base"
model = LayoutLMv3ForTokenClassification.from_pretrained(model_name)
tokenizer = LayoutLMv3TokenizerFast.from_pretrained(model_name)
model.to(device)

# Example data (replace with your actual data)
image_url = './pics/IMG_7940.jpeg'
image = Image.open(image_url)
words = df['txt'].tolist()
vertices = df['vertices'].tolist()
vertices = [eval(lst) for lst in vertices]
boxes = df['boundBox'].tolist()
boxes = [eval(lst) for lst in boxes]

Some weights of LayoutLMv3ForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [50]:
bounding_boxes = []
for item in boxes:
    left = item['Left']
    top = item['Top']
    right = left + item['Width']
    bottom = top + item['Height']
    bounding_boxes.append([left, top, right, bottom])
    
    
def normalize_bbox(bbox, img_width, img_height):
    """
    Normalize bounding box coordinates to a scale of 0-1000.
    Parameters:
        bbox (list): The bounding box in [left, top, right, bottom] format.
        img_width (int): The width of the image.
        img_height (int): The height of the image.
    Returns:
        list: Normalized bounding box.
    """
    left, top, right, bottom = bbox
    normalized_left = (left / img_width) * 1000
    normalized_top = (top / img_height) * 1000
    normalized_right = (right / img_width) * 1000
    normalized_bottom = (bottom / img_height) * 1000
    return [normalized_left, normalized_top, normalized_right, normalized_bottom]

# Assuming maximum values for width and height from the given data
max_width = image.width
max_height = image.height

# Normalize each bounding box
normalized_bboxes = [normalize_bbox(bbox, max_width, max_height) for bbox in bounding_boxes]
normalized_bboxes_long = torch.tensor(normalized_bboxes, dtype=torch.long)
normalized_bboxes_long

tensor([[  0, 129,  18, 156],
        [ 45, 806,  60, 816],
        [ 45, 823,  54, 833],
        ...,
        [957, 309, 964, 347],
        [960, 271, 966, 301],
        [983, 157, 999, 205]])

In [51]:
# Preprocess image and text
encoding = tokenizer(words, boxes=normalized_bboxes_long, return_tensors="pt", truncation=True, padding="max_length")

In [52]:
encoding.keys()

dict_keys(['input_ids', 'attention_mask', 'bbox'])

In [53]:
# Move to the same device as the model
input_ids = encoding["input_ids"].to(device)
attention_mask = encoding["attention_mask"].to(device)
# token_type_ids = encoding["token_type_ids"].to(device)
bbox = encoding["bbox"].to(model.device)

# Inference
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask, bbox=bbox)
    logits = outputs.logits



In [54]:
logits

tensor([[[-0.2103, -0.9563],
         [ 0.3301, -0.9911],
         [ 0.0620, -0.9061],
         ...,
         [-0.1748, -0.1627],
         [-0.4078,  0.2019],
         [-0.2109, -0.9542]]])

In [57]:
import torch.nn.functional as F

def predict_groups(model, tokenizer, words, boxes, threshold=0.9):
    # Initial full prediction
    encoding = tokenizer(words, boxes=boxes, return_tensors="pt", truncation=True, padding="max_length")
    inputs = {k: v.to(model.device) for k, v in encoding.items()}
    with torch.no_grad():
        logits = model(**inputs).logits
    probabilities = F.softmax(logits, dim=-1)

    grouped_words = []
    remaining_words = words[:]
    remaining_boxes = boxes[:]
    all_groups = []

    while remaining_words:
        predicted_classes = torch.argmax(probabilities, dim=-1)[0]
        class_confidences = probabilities.max(dim=-1).values[0]

        # Create groups for this iteration
        current_groups = {}
        for idx, (class_id, confidence) in enumerate(zip(predicted_classes, class_confidences)):
            if idx in encoding.token_to_word(0) and confidence > threshold:
                word_idx = encoding.token_to_word(0)[idx]
                if word_idx < len(remaining_words):  # Check to avoid index errors
                    word = remaining_words[word_idx]
                    group_id = class_id.item()

                    if group_id not in current_groups:
                        current_groups[group_id] = []
                    current_groups[group_id].append(word)
                    grouped_words.append(word_idx)

        # Add current groups to all groups
        all_groups.append(current_groups)

        # Prepare for next iteration by removing grouped words
        remaining_words = [word for i, word in enumerate(remaining_words) if i not in grouped_words]
        remaining_boxes = [box for i, box in enumerate(remaining_boxes) if i not in grouped_words]
        if not remaining_words:
            break  # Exit if no remaining words

        # Repeat prediction with remaining words
        encoding = tokenizer(remaining_words, boxes=remaining_boxes, return_tensors="pt", truncation=True, padding="max_length")
        inputs = {k: v.to(model.device) for k, v in encoding.items()}
        with torch.no_grad():
            logits = model(**inputs).logits
        probabilities = F.softmax(logits, dim=-1)

    return all_groups

In [58]:
# Usage example
all_groups = predict_groups(model, tokenizer, words, boxes)
for iteration, groups in enumerate(all_groups):
    print(f"Iteration {iteration + 1}:")
    for group_id, words in groups.items():
        print(f"  Group {group_id}: {' '.join(words)}")

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`bbox` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [114]:
choice = [(0, 0), (1, 0), (2, 0), (0, -1)]
probs = [0.25, 0.25, 0.25, 0.25]

In [115]:
def random_choices_with_probabilities(choices=choice, prob=probs, n=1):
    """
    Generate n random choices from a list of choices based on given probabilities.

    Parameters:
    choices (list): A list of possible choices.
    probabilities (list): A list of probabilities corresponding to each choice.
    n (int): Number of random choices to generate.

    Returns:
    list: A list of n random choices based on the given probabilities.
    """
    indices = np.random.choice(len(choices), size=n, p=prob)
    return [choices[index] for index in indices]

            
random_choices_with_probabilities()

[(0, -1)]