In [5]:
# prepare_dataset.py

import json
import re

import torch
from transformers import CamembertTokenizer

# Sample French clinical sentences used to build the NER dataset.
texts = [
    "Le patient se plaint de fatigue et de maux de tête.",
    "Il a de la fièvre et des frissons.",
    "Elle ressent des douleurs abdominales et des nausées."
]

# One BIO tag per word or punctuation mark of the matching sentence (see
# WORD_PATTERN below for the exact split). The first and third rows were one
# "O" short, which shifted every symptom tag one word to the left.
labels = [
    # Le patient se plaint de | fatigue | et de | maux de tête | .
    ["O", "O", "O", "O", "O", "B-SYMPTOM", "O", "O",
     "B-SYMPTOM", "I-SYMPTOM", "I-SYMPTOM", "O"],
    # Il a de la | fièvre | et des | frissons | .
    ["O", "O", "O", "O", "B-SYMPTOM", "O", "O", "B-SYMPTOM", "O"],
    # Elle ressent des | douleurs abdominales | et des | nausées | .
    ["O", "O", "O", "B-SYMPTOM", "I-SYMPTOM", "O", "O", "B-SYMPTOM", "O"]
]

# Load the tokenizer
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")

# Define label mapping
tag_values = ["O", "B-SYMPTOM", "I-SYMPTOM"]
tag2idx = {t: i for i, t in enumerate(tag_values)}

# Label id that torch.nn.CrossEntropyLoss skips by default (ignore_index).
IGNORE_INDEX = -100

# A run of word characters or a single punctuation mark, so that "tête."
# becomes ["tête", "."] and each piece gets its own label.
WORD_PATTERN = re.compile(r"\w+|[^\w\s]")


def encode_with_labels(text, word_labels, tokenizer, tag2idx):
    """Tokenize *text* and align its word-level labels to the sub-tokens.

    The sentence is split into words and punctuation marks, and each of them
    is tokenized on its own so the number of sub-tokens per word is known.
    The first sub-token of a word carries the word's label id; the remaining
    sub-tokens and the two special tokens get IGNORE_INDEX.

    Args:
        text: The raw sentence.
        word_labels: One tag per word/punctuation mark of *text*.
        tokenizer: Tokenizer providing ``tokenize``, ``convert_tokens_to_ids``,
            ``cls_token_id`` and ``sep_token_id``.
        tag2idx: Mapping from tag string to integer label id.

    Returns:
        A ``(input_ids, label_ids)`` pair of equal-length lists of ints.

    Raises:
        ValueError: If the number of labels differs from the number of words.
        KeyError: If a label is missing from *tag2idx*.
    """
    words = WORD_PATTERN.findall(text)
    if len(words) != len(word_labels):
        raise ValueError(
            f"{len(word_labels)} labels given for {len(words)} words in {text!r}"
        )

    piece_ids = []
    piece_labels = []
    for word, label in zip(words, word_labels):
        pieces = tokenizer.tokenize(word)
        piece_ids.extend(tokenizer.convert_tokens_to_ids(pieces))
        # Only the first sub-token is scored; continuation pieces are masked
        # so a long word does not count several times in the loss.
        piece_labels.extend(
            tag2idx[label] if position == 0 else IGNORE_INDEX
            for position in range(len(pieces))
        )

    # Wrap the sequence in the CLS/SEP special tokens, which are never scored.
    input_ids = [tokenizer.cls_token_id, *piece_ids, tokenizer.sep_token_id]
    label_ids = [IGNORE_INDEX, *piece_labels, IGNORE_INDEX]
    return input_ids, label_ids


if len(texts) != len(labels):
    raise ValueError(f"{len(texts)} texts but {len(labels)} label sequences")

# Tokenize the input texts and adjust label encoding
tokenized_texts = []
adjusted_labels = []

for text, doc_labels in zip(texts, labels):
    tokenized_text, adjusted_doc_labels = encode_with_labels(
        text, doc_labels, tokenizer, tag2idx
    )
    tokenized_texts.append(tokenized_text)
    adjusted_labels.append(adjusted_doc_labels)

# Save the tokenized texts and adjusted labels
torch.save(tokenized_texts, "tokenized_texts.pt")
torch.save(adjusted_labels, "adjusted_labels.pt")

# Save the original texts and labels
with open("dataset.json", "w", encoding="utf-8") as f:
    json.dump({"texts": texts, "labels": labels}, f)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
