# Setting up LLM connection

Let's first set up an Azure OpenAI client.

In [1]:
import os

import openai

In [5]:
# Connection settings are read from environment variables so that real
# credentials never have to be saved inside the notebook. The literal fallbacks
# are the original placeholders: either export the variables or replace the
# placeholders before running the request cell below.
openai_client = openai.AzureOpenAI(
    base_url=os.environ.get("AZURE_OPENAI_BASE_URL", "API_ENDPOINT"),
    api_key=os.environ.get("AZURE_OPENAI_API_KEY", "API_KEY"),
    api_version=os.environ.get("AZURE_OPENAI_API_VERSION", "API_VERSION"),
)

Sending a test request should now work and return something like "The capital of France is Paris."

In [9]:
# Smoke test: send one short question through the client and show the reply text.
smoke_test_messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is the capital of France?"},
]
smoke_test_completion = openai_client.chat.completions.create(
    model="gpt-4o-mini",
    messages=smoke_test_messages,
)
# Last expression of the cell, so the notebook displays the first choice's text.
smoke_test_completion.choices[0].message.content

'The capital of France is Paris.'

# Imports & Cloning repository

In [11]:
import os
import pickle
import sys
from dataclasses import dataclass
from os import path

In [12]:
# Human-readable names of the document categories.
# NOTE(review): presumably the integer `label` of a document indexes into this
# list — confirm against the dataset before relying on the order.
class_names = [
    "email",
    "form",
    "handwritten",
    "invoice",
    "advertisement",
]

# Number of categories, derived from the list so the two cannot drift apart.
NUM_CLASSES = len(class_names)

In [13]:
# Project-local helper module (lives in this repository's `src` package).
from src import download_dataset

# Directory, relative to the notebook's working directory, that the following
# cell reads the `train.pkl`, `test.pkl` and `validation.pkl` files from.
dataset_path = "dataset"

# NOTE(review): presumably downloads and unpacks every split ("all") into
# `dataset_path` — confirm against `src/download_dataset.py`.
download_dataset.download_and_extract("all", dataset_path)

In [14]:
def _load_split(directory, split_name):
    """Load one pickled dataset split.

    Args:
        directory: Folder that contains the split files.
        split_name: Base name of the split; the file read is
            ``<directory>/<split_name>.pkl``.

    Returns:
        The object stored in the pickle file.

    Raises:
        FileNotFoundError: If the split file does not exist.
    """
    # SECURITY: pickle.load can execute arbitrary code while deserializing.
    # Only load split files that come from a source you trust.
    with open(path.join(directory, f"{split_name}.pkl"), "rb") as split_file:
        return pickle.load(split_file)


train_dataset = _load_split(dataset_path, "train")
test_dataset = _load_split(dataset_path, "test")
validation_dataset = _load_split(dataset_path, "validation")

# Report the size of every split as a quick sanity check on the download.
for split_name, split_dataset in zip(
    ["train", "test", "validation"], [train_dataset, test_dataset, validation_dataset]
):
    print(f"{split_name}_dataset contains {len(split_dataset)} documents")

# Last expression of the cell: show which fields a single document provides.
train_dataset[0].keys()

train_dataset contains 5000 documents
test_dataset contains 1000 documents
validation_dataset contains 500 documents


dict_keys(['id', 'image', 'label', 'words', 'boxes'])

Each `dataset` object is a `list` of documents. A document is a `dict` with the following structure:

```json
{
  "id": "Unique document identifier",
  "image": "A PIL.Image object containing the document's image",
  "label": "A number in [0 .. 4] representing the class of the document",
  "words": "A list of strings (not words!) extracted from the image by OCR",
  "boxes": "A list of tuples of numbers providing the position of each word in the document"
}
```
