# Setting up LLM connection

Let's first set up an OpenAI client.


In [1]:
import openai

In [2]:
import json

# Read the client settings from the local secrets file. The context manager
# closes the file handle, which a bare `json.load(open(...))` leaves open.
with open("secrets.json", encoding="utf-8") as secrets_file:
    config = json.load(secrets_file)

# Every key of the JSON object is forwarded as a keyword argument to the client.
openai_client = openai.AzureOpenAI(**config)

This should work and respond something like "The capital of France is Paris."


In [None]:
# Smoke test: send one trivial question through the client. The expression is
# kept as the cell's last statement so the notebook displays the reply text.
openai_client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the capital of France?"},
    ],
).choices[0].message.content

# Imports & Cloning repository


In [57]:
import base64
import io
import pickle
import sys
from copy import deepcopy
from dataclasses import dataclass
from os import path

import PIL

In [6]:
# Class names in label order: `class_names[label]` is the name that belongs to
# a document's integer label.
class_names = [
    "email",
    "form",
    "handwritten",
    "invoice",
    "advertisement",
]
NUM_CLASSES = len(class_names)

In [7]:
from src import download_dataset

# Local directory the dataset pickles are read from later in the notebook.
dataset_path = "dataset"

# NOTE(review): presumably fetches and unpacks every split ("all") into
# `dataset_path` — confirm against src/download_dataset.
download_dataset.download_and_extract("all", dataset_path)

In [None]:
def _load_split(split: str):
    """Unpickle `<dataset_path>/<split>.pkl` and return the stored object."""
    # NOTE: unpickling can execute arbitrary code; only load dataset files that
    # come from a trusted source.
    with open(path.join(dataset_path, f"{split}.pkl"), "rb") as split_file:
        return pickle.load(split_file)


train_dataset = _load_split("train")
test_dataset = _load_split("test")
validation_dataset = _load_split("validation")

# Report the size of every split, then show the keys of one document.
for split_name, split_dataset in (
    ("train", train_dataset),
    ("test", test_dataset),
    ("validation", validation_dataset),
):
    print(f"{split_name}_dataset contains {len(split_dataset)} documents")
train_dataset[0].keys()

Each `dataset` object is a `list` of documents. A document is a `dict` with the following structure:

```json
{
  "id": "Unique document identifier",
  "image": "A PIL.Image object containing the document's image",
  "label": "A number in [0 .. 4] representing the class of the document",
  "words": "A list of strings (not words !) extracted from the image with an OCR",
  "boxes": "A list of tuples of numbers providing the position of each word in the document"
}
```


Although API-based LLMs can be trained, it turns out to be very costly. However, thanks to their very extensive training, LLMs can understand a natural-language description of the task (called a prompt) and perform the task successfully.


Write a function that uses only the text from the document to perform a prediction with OpenAI GPT-4o mini.


In [None]:
def text_prediction(document: dict) -> str:
    """Process a document and return the predicted class name.

    Exercise placeholder: the body consists of this docstring only, so calling
    the function as written returns None. The implementation is meant to
    classify the document from its text alone.
    """

In [13]:
# @title


def text_prediction(document: dict) -> str:
    """Classify a document with GPT-4o-mini using only its OCR strings.

    Args:
        document: Dataset entry; only its "words" value is sent to the model.

    Returns:
        The model's reply exactly as received. It is not normalised or
        validated, so it may differ from every entry of `class_names`.
    """
    system_prompt = f"You are a helpful assistant. You are helping a user with a text classification task. Respond with a single word which is the class of the document. Available classes are {class_names}."
    user_prompt = f"The document contains the following words: {document['words']}"

    completion = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

Let's now evaluate our model on the validation set. Make the evaluation reusable by taking the `prediction_fn` as input.


In [None]:
def evaluate(
    prediction_fn: Callable[[dict], str], dataset: list[dict]
) -> tuple[float, list[str], list[str]]:
    """Evaluate the prediction function on the dataset and return the accuracy, predictions and targets.

    Exercise placeholder: the body consists of this docstring only, so calling
    the function as written returns None.
    """

In [69]:
def evaluate(
    prediction_fn: Callable[[dict], str], dataset: list[dict]
) -> tuple[float, list[str], list[str]]:
    """Evaluate the prediction function on the dataset.

    A prediction counts as correct only when it is exactly equal to the target
    class name. A progress bar shows the running accuracy during evaluation.

    Args:
        prediction_fn: Maps one document to a predicted class name.
        dataset: Documents to score. Each needs an integer "label" that indexes
            `class_names`.

    Returns:
        A tuple `(accuracy, predictions, targets)`: the fraction of exact
        matches, the raw outputs of `prediction_fn`, and the expected class
        names, both lists in dataset order. An empty dataset gives
        `(0.0, [], [])` instead of dividing by zero.
    """
    targets = [class_names[document["label"]] for document in dataset]
    if not targets:
        # Nothing to score: avoid the 0 / 0 below.
        return 0.0, [], []

    predictions = []
    # Running count of exact matches, so the live metric does not re-sum the
    # whole history on every iteration.
    correct = 0
    with tqdm.tqdm(desc="Evaluation", total=len(dataset)) as pbar:
        for document, target in zip(dataset, targets):
            prediction = prediction_fn(document)
            predictions.append(prediction)
            correct += bool(prediction == target)
            pbar.update(1)
            pbar.set_postfix(metric=f"{correct / len(predictions):.2f}")

    return correct / len(predictions), predictions, targets

Let's evaluate GPT-4o-mini on text input and take a look at its predictions. Let's try to build the confusion matrix on the validation set.


In [None]:
# Score the text-only classifier on the validation split, keeping the raw
# predictions and the targets alongside the accuracy.
acc, preds, tgts = evaluate(text_prediction, validation_dataset)

Depending on your prompt and luck, the following code might break. What's the issue?


In [None]:
from sklearn.metrics import confusion_matrix

# Replies that are not exactly one of the class names go into an extra bucket
# whose index comes right after the real classes.
unknown_idx = len(class_names)
index_of_class = {name: idx for idx, name in enumerate(class_names)}

preds_idx = [index_of_class.get(pred, unknown_idx) for pred in preds]
tgts_idx = list(map(class_names.index, tgts))

confusion_matrix(tgts_idx, preds_idx)

Some LLMs like GPT-4o(-mini) are multimodal. They can process multiple modalities of input and some can also produce different output modalities.

GPT-4o-mini can process both text and images for example. Let's write a prediction function providing an image to GPT-4o-mini.

Use the OpenAI documentation to figure out how to write the messages.


In [62]:
def image_to_b64(image: "PIL.Image.Image") -> str:
    """Convert a PIL image to a base64-encoded JPEG thumbnail.

    The image is shrunk to fit within 128x128 pixels and encoded as JPEG. All
    work happens on a copy, so the caller's image keeps its original size.

    Args:
        image: The image to encode. It is not modified.

    Returns:
        The JPEG bytes as a base64 string (no `data:` URL prefix).
    """
    # `thumbnail` resizes in place, so operate on a private copy.
    thumbnail = image.copy()
    thumbnail.thumbnail((128, 128))

    io_buf = io.BytesIO()
    try:
        thumbnail.save(io_buf, format="jpeg")
    except OSError:
        # JPEG cannot store every image mode (e.g. alpha or palette images).
        # Retry as RGB with a fresh buffer so no partial output is kept.
        io_buf = io.BytesIO()
        thumbnail.convert("RGB").save(io_buf, format="jpeg")
    return base64.b64encode(io_buf.getvalue()).decode("utf-8")

In [63]:
# @title


def image_prediction(document: dict) -> str:
    """Classify a document with GPT-4o-mini using only its image.

    Args:
        document: Dataset entry; only its "image" value is sent to the model.

    Returns:
        The model's reply exactly as received. It is not normalised or
        validated, so it may differ from every entry of `class_names`.
    """
    # Encode a copy so the document's own image is never resized.
    base64_image = image_to_b64(deepcopy(document["image"]))

    # The second literal needs the `f` prefix: without it the model receives
    # the text "{class_names}" and never sees the list of classes.
    system_prompt = (
        "You are a helpful assistant. You are helping a user with a text classification task. "
        f"Respond with a single word which is the class of the document. Available classes are {class_names}."
    )

    return (
        openai_client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": system_prompt,
                },
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "This is an image document."},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}"
                            },
                        },
                    ],
                },
            ],
        )
        .choices[0]
        .message.content
    )

In [None]:
# Sanity check: show the true class name of the first training document next to
# the image-based prediction for that same document.
class_names[train_dataset[0]["label"]], image_prediction(train_dataset[0])

In [None]:
import random
