# Document Classification with UDOP
In this notebook, we walk step by step through the data preprocessing and the model inputs/outputs for a document classification task.

In [1]:
import torch

from core.models import UdopUnimodelForConditionalGeneration, UdopConfig, \
    UdopTokenizer

# Change this single path to point at your model checkpoints.
# It must resolve to a local folder (or a valid model identifier); otherwise
# `from_pretrained` raises an OSError.
MODEL_PATH = "udop-unimodel-large-224"

# Load the configuration, tokenizer and model weights from the same checkpoint.
config = UdopConfig.from_pretrained(MODEL_PATH)
tokenizer = UdopTokenizer.from_pretrained(MODEL_PATH)
model = UdopUnimodelForConditionalGeneration.from_pretrained(MODEL_PATH)

OSError: udop-unimodel-large-224 is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo with `use_auth_token` or log in with `huggingface-cli login` and pass `use_auth_token=True`.

In [2]:
from PIL import Image
from core.common.utils import img_trans_torchvision, get_visual_bbox
import json
import numpy as np
import torch

# Load the example page image and turn it into the model's image input.
# The context manager closes the underlying image file once the RGB copy exists.
with Image.open('examples/00070353.png') as raw_image:
    page_image = raw_image.convert('RGB')
im = img_trans_torchvision(page_image).unsqueeze(0)  # add a leading (batch) dimension
# Visual boxes are built from the last dimension of the image tensor
# (presumably the image side length -- confirm against get_visual_bbox).
visual_seg_data = get_visual_bbox(im.shape[-1]).unsqueeze(0)

# Read the annotation of the same page. Close the file deterministically and
# decode it as UTF-8 rather than relying on the platform default encoding.
with open('examples/00070353.json', encoding='utf-8') as annot_file:
    annot = json.load(annot_file)

all_bboxes = []
all_text = []

# Process task prefix, here we use document classification.
# Every prefix sub-token is paired with an all-zero box.
task_prefix = 'document classification.'
sub_tokens = tokenizer.tokenize(task_prefix)
all_text.extend(sub_tokens)
all_bboxes.extend([0, 0, 0, 0] for _ in sub_tokens)

# Tokenize each annotated text item; all sub-tokens of an item share its box.
for item in annot['form']:
    # Scale the box coordinates by 1/1000
    # (presumably annotations use a 0-1000 coordinate grid -- TODO confirm).
    bb = np.array(item['box']) / 1000.0
    sub_tokens = tokenizer.tokenize(item['text'])
    all_text.extend(sub_tokens)
    all_bboxes.extend([bb] * len(sub_tokens))

# Convert tokens to ids and add a leading (batch) dimension to both inputs.
all_text_ids = tokenizer.convert_tokens_to_ids(all_text)
input_ids = torch.tensor(np.array(all_text_ids)).unsqueeze(0)
input_seg_data = torch.tensor(np.array(all_bboxes)).unsqueeze(0)

In [3]:
# Generate the class label from the text ids, token boxes, image and visual boxes.
generation_options = dict(
    seg_data=input_seg_data,
    image=im,
    visual_seg_data=visual_seg_data,
    use_cache=True,
    decoder_start_token_id=None,
    num_beams=1,
    max_length=10,
)
output_ids = model.generate(input_ids, **generation_options)

# Take the first generated sequence and drop its first and last ids before
# decoding (presumably the decoder start token and the end-of-sequence token).
first_sequence = output_ids[0]
output_text = tokenizer.decode(first_sequence[1:-1])
output_text



'form'