In [1]:
import os
import torch
from transformers import LayoutLMv3Processor, LayoutLMv3ForSequenceClassification
from torch.utils.data import Dataset, DataLoader, random_split
from PIL import Image, ImageDraw, ImageFont
import pandas as pd


In [2]:


# Load the processor and model
processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base")
model = LayoutLMv3ForSequenceClassification.from_pretrained("microsoft/layoutlmv3-base", num_labels=1)

# Load your document image
image_path = "./pic1/4.jpg"  # Replace with your image path
image = Image.open(image_path).convert("RGB")

# Create dummy text (LayoutLMv3 requires text input)
text = "LMAO"

# Process the image
inputs = processor(images=image, text=text, return_tensors="pt")

print(type(inputs))
# Get the score
with torch.no_grad():
    outputs = model(**inputs)
    score = outputs.logits.item()

# Print the score
print(f"Predicted score for the image: {score}")

Some weights of LayoutLMv3ForSequenceClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<class 'transformers.tokenization_utils_base.BatchEncoding'>




Predicted score for the image: -0.10572996735572815


In [3]:

data_dir = "output"
images = []
labels = []

for subdir in os.listdir(data_dir):
    for i in os.listdir(os.path.join(data_dir, subdir)):
        if '.png' in i:
            images.append(os.path.join(data_dir, subdir, i))
            labels.append(i.split('_')[0])
        if len(images) > 10:
            break




data = pd.DataFrame.from_dict({'image_path': images, 'label': labels})
# data.head()

In [4]:
from datasets import Dataset 

# read dataframe as HuggingFace Datasets object
dataset = Dataset.from_pandas(data)
dataset

Dataset({
    features: ['image_path', 'label'],
    num_rows: 15
})

In [18]:
inputs.keys()

dict_keys(['input_ids', 'attention_mask', 'bbox', 'pixel_values'])

In [43]:
from datasets import Features, Sequence, Value, Array2D, Array3D
# we need to define custom features
features = Features({
    'pixel_values': Array3D(dtype="float32", shape=(3, 224, 224)),
    'input_ids': Sequence(feature=Value(dtype='int64')),
    'attention_mask': Sequence(Value(dtype='int64')),
    # 'token_type_ids': Sequence(Value(dtype='int64')),
    'bbox': Array2D(dtype="int64", shape=(512, 4)),
    'labels': Value(dtype='float32')  # метки для задачи регрессии
})

def preprocess_data(examples):
  # take a batch of images
  images = [Image.open(path).convert("RGB") for path in examples['image_path']]
  
  encoded_inputs = processor(images, padding="max_length", truncation=True)
  
  # add labels
  encoded_inputs["labels"] = [label for label in examples["label"]]

  return encoded_inputs

encoded_dataset = dataset.map(preprocess_data, remove_columns=dataset.column_names, features=features, 
                              batched=True, batch_size=2)

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

In [44]:
from datasets import Dataset

# Assuming 'dataset' is your Dataset object
train_test_split = encoded_dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

In [45]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="test",
                                  max_steps=1000,
                                  per_device_train_batch_size=2,
                                  per_device_eval_batch_size=2,
                                  learning_rate=1e-5,
                                  evaluation_strategy="steps",
                                  eval_steps=100,
                                  load_best_model_at_end=True,
                                  metric_for_best_model="f1", 
                                  dataloader_pin_memory=False
                                  )


from transformers.data.data_collator import default_data_collator

# Initialize our Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=processor,
    data_collator=default_data_collator,
    # compute_metrics=compute_metrics,
)


trainer.train()

max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss,Validation Loss
100,No log,78.932442
200,No log,32.508392
300,No log,14.774152
400,No log,7.689963
500,43.518400,4.256772


KeyError: 'eval_f1'

In [None]:
from transformers import AdamW
from tqdm.notebook import tqdm

optimizer = AdamW(model.parameters(), lr=5e-5)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

global_step = 0
num_train_epochs = 10
t_total = len(dataloader) * num_train_epochs # total number of training steps 

#put the model in training mode
model.train()
for epoch in range(num_train_epochs):
  print("Epoch:", epoch)
  running_loss = 0.0
  correct = 0
  for batch in tqdm(train_dataset):
      print(batch)
      
      # forward pass
      outputs = model(**batch)
      loss = outputs.loss

      running_loss += loss.item()
      predictions = outputs.logits.argmax(-1)
      correct += (predictions == batch['labels']).float().sum()

      # backward pass to get the gradients 
      loss.backward()

      # update
      optimizer.step()
      optimizer.zero_grad()
      global_step += 1
  
  print("Loss:", running_loss / batch["input_ids"].shape[0])
  accuracy = 100 * correct / len(data)
  print("Training accuracy:", accuracy.item())

Epoch: 0


  0%|          | 0/12 [00:00<?, ?it/s]

{'pixel_values': tensor([[[0.9922, 0.9922, 0.9922,  ..., 0.9922, 0.9922, 0.9922],
         [0.9922, 0.9922, 0.9922,  ..., 0.9922, 0.9922, 0.9922],
         [0.9922, 0.9922, 0.9922,  ..., 0.9922, 0.9922, 0.9922],
         ...,
         [0.9922, 0.9922, 0.9922,  ..., 0.9922, 0.9922, 0.9922],
         [0.9922, 0.9922, 0.9922,  ..., 0.9922, 0.9922, 0.9922],
         [0.9922, 0.9922, 0.9922,  ..., 0.9922, 0.9922, 0.9922]],

        [[0.9922, 0.9922, 0.9922,  ..., 0.9922, 0.9922, 0.9922],
         [0.9922, 0.9922, 0.9922,  ..., 0.9922, 0.9922, 0.9922],
         [0.9922, 0.9922, 0.9922,  ..., 0.9922, 0.9922, 0.9922],
         ...,
         [0.9922, 0.9922, 0.9922,  ..., 0.9922, 0.9922, 0.9922],
         [0.9922, 0.9922, 0.9922,  ..., 0.9922, 0.9922, 0.9922],
         [0.9922, 0.9922, 0.9922,  ..., 0.9922, 0.9922, 0.9922]],

        [[0.9922, 0.9922, 0.9922,  ..., 0.9922, 0.9922, 0.9922],
         [0.9922, 0.9922, 0.9922,  ..., 0.9922, 0.9922, 0.9922],
         [0.9922, 0.9922, 0.9922,  ..., 0

ValueError: not enough values to unpack (expected 2, got 1)

In [None]:
import os
import torch
from torch.utils.data import Dataset
from PIL import Image

class CustomDataset(Dataset):
    def __init__(self, data_dir="output", transform=None):
        self.transform = transform
        self.image_paths = []
        self.labels = []
        
        for dir in os.listdir(data_dir):
            dir_path = os.path.join(data_dir, dir)
            if os.path.isdir(dir_path):
                for filename in os.listdir(dir_path):
                    if filename.endswith('.png'):
                        self.image_paths.append(os.path.join(dir_path, filename))
                        label = filename.split('_')[0]
                        self.labels.append(float(label))  # Convert to float here
        
    def __len__(self):
        return len(self.image_paths)
    
    def __getitem__(self, idx):
        img_path = self.image_paths[idx]
        image = Image.open(img_path).convert('RGB')

        if self.transform:
            image = self.transform(image)
        
        label = torch.tensor(self.labels[idx], dtype=torch.float).unsqueeze(0)  # Ensure label is a tensor
        return image, label



In [None]:
from datasets import Dataset as HFDataset
from torchvision import transforms

# Define your transform
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Adjust as needed
    transforms.ToTensor(),
])

# Create an instance of your custom dataset
custom_dataset = CustomDataset(data_dir="output", transform=transform)

# Split the dataset
train_size = 0.8
indices = list(range(len(custom_dataset)))
train_indices, val_indices = train_test_split(indices, train_size=train_size, random_state=42)

train_dataset = torch.utils.data.Subset(custom_dataset, train_indices)
val_dataset = torch.utils.data.Subset(custom_dataset, val_indices)

# Convert to Hugging Face dataset
def dataset_to_dict(dataset):
    data_dict = {"image": [], "label": []}
    for img, lbl in dataset:
        data_dict["image"].append(img.numpy())
        data_dict["label"].append(lbl.item())
    return data_dict

hf_dataset_train = HFDataset.from_dict(dataset_to_dict(train_dataset))
hf_dataset_val = HFDataset.from_dict(dataset_to_dict(val_dataset))



In [None]:
hf_dataset_train[0]

{'image': [[[1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
    1.0,
 

In [None]:
# Load the processor and model
processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base")
model = LayoutLMv3ForSequenceClassification.from_pretrained("microsoft/layoutlmv3-base", num_labels=1)

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Data collator
def collate_fn(batch):
    for item in batch:
        print(item)
    pixel_values = torch.stack([item['image'] for item in batch])
    labels = torch.tensor([item['label'] for item in batch])
    return {
        'pixel_values': pixel_values,
        'labels': labels,
    }

# Convert Hugging Face dataset to PyTorch DataLoader
train_dataloader = torch.utils.data.DataLoader(hf_dataset_train, batch_size=training_args.per_device_train_batch_size, collate_fn=collate_fn)
val_dataloader = torch.utils.data.DataLoader(hf_dataset_val, batch_size=training_args.per_device_eval_batch_size, collate_fn=collate_fn)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=hf_dataset_train,
    eval_dataset=hf_dataset_val,
    data_collator=collate_fn,
    tokenizer=processor,
)

# Train the model
trainer.train()

# Save the model
model.save_pretrained("layoutlmv3-quality-assessment")
processor.save_pretrained("layoutlmv3-quality-assessment")


Some weights of LayoutLMv3ForSequenceClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'label': 65.88999938964844}
{'label': 29.489999771118164}
{'label': 24.899999618530273}
{'label': 50.810001373291016}


KeyError: 'image'

In [None]:
# # Load the model and processor
# model = LayoutLMv3ForSequenceClassification.from_pretrained("layoutlmv3-quality-assessment")
# processor = LayoutLMv3Processor.from_pretrained("layoutlmv3-quality-assessment")
model.to('cpu')
# Load and preprocess the image
image_path = "./pic1/4_blur.png"
image_path = "./pic1/4.jpg"

image = Image.open(image_path).convert("RGB")
inputs = processor(images=image, text="dummy text", return_tensors="pt")

# Get the score
with torch.no_grad():
    outputs = model(**inputs)
    score = outputs.logits.item()

# Print the score
print(f"Predicted score for the image: {score}")




Predicted score for the image: 16.908662796020508
