In [15]:
from datasets import load_dataset
from transformers import BertTokenizer

# Load the Swissprot-families dataset
DATASET_ID = "khairi/Swissprot-families"
dataset = load_dataset(DATASET_ID)

# Use the pretrained BERT tokenizer
TOKENIZER_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

def tokenize_function(examples):
    """Tokenize a batch of sequences with the module-level ``tokenizer``.

    Every sequence is rewritten with one space between consecutive characters
    (residues) before it is tokenized. BERT's WordPiece tokenizer first splits
    on whitespace and replaces any resulting "word" longer than 100 characters
    with the unknown token, so an unspaced protein string would otherwise be
    reduced to a single ``[UNK]`` (or to arbitrary word pieces when short).

    Args:
        examples: Batch mapping whose ``"Sequence"`` entry is a list of strings.

    Returns:
        Whatever ``tokenizer`` returns for the spaced sequences when called
        with ``padding="max_length"`` and ``truncation=True``.
    """
    # "MKV" -> "M K V": one whitespace-delimited token per residue.
    spaced_sequences = [" ".join(sequence) for sequence in examples["Sequence"]]
    return tokenizer(spaced_sequences, padding="max_length", truncation=True)

# Tokenize the sequences, feeding tokenize_function whole batches at a time
tokenized_datasets = dataset.map(function=tokenize_function, batched=True)


In [16]:
# Collect the distinct "Organism" values found across all three splits
all_labels = {
    organism
    for split_name in ("train", "validation", "test")
    for organism in tokenized_datasets[split_name]["Organism"]
}

# Sort them so that every run assigns the same position to each label
labels = sorted(all_labels)
num_labels = len(labels)

# Encode the labels
def encode_labels(examples):
    """Add an integer ``"label"`` entry derived from ``"Organism"``.

    Each organism name is replaced by its position in the module-level
    ``labels`` list. The name-to-position table is built once per call so
    every example costs a dictionary lookup instead of a linear
    ``list.index`` scan over all labels.

    Args:
        examples: Batch mapping whose ``"Organism"`` entry is a list of names.

    Returns:
        The same ``examples`` object, with ``examples["label"]`` set to the
        list of integer ids (mutated in place and returned).

    Raises:
        KeyError: If a name in the batch is not present in ``labels``.
    """
    label_to_id = {name: position for position, name in enumerate(labels)}
    examples["label"] = [label_to_id[name] for name in examples["Organism"]]
    return examples

# Attach the encoded labels to the tokenized dataset
tokenized_datasets = tokenized_datasets.map(function=encode_labels, batched=True)


Map: 100%|██████████| 440156/440156 [00:45<00:00, 9729.77 examples/s] 
Map: 100%|██████████| 100/100 [00:00<00:00, 1375.76 examples/s]
Map: 100%|██████████| 5000/5000 [00:00<00:00, 10114.54 examples/s]


In [17]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

# Load pretrained BERT with a sequence-classification head that has one
# output per entry in `labels`
MODEL_CHECKPOINT = 'bert-base-uncased'
model = BertForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=num_labels,
)

# Configure the training run
training_options = {
    "output_dir": './results',
    "evaluation_strategy": "epoch",
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    "num_train_epochs": 3,
    "weight_decay": 0.01,
}
training_args = TrainingArguments(**training_options)

def compute_metrics(eval_pred):
    """Compute top-1 accuracy from the Trainer's evaluation predictions.

    Without a ``compute_metrics`` callback the Trainer reports only the loss,
    which says little about how often the predicted class is correct.

    Args:
        eval_pred: Evaluation result exposing ``predictions`` (per-class
            scores, one row per example) and ``label_ids`` (integer targets).

    Returns:
        A dict with a single ``"accuracy"`` entry (fraction of examples whose
        highest-scoring class equals the target).
    """
    import numpy as np  # local import: this cell does not import numpy itself

    predicted_ids = np.argmax(eval_pred.predictions, axis=-1)
    return {"accuracy": float((predicted_ids == eval_pred.label_ids).mean())}


# Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 8/82530 [10:09<1713:26:42, 74.75s/it]

KeyboardInterrupt: 

In [None]:
# Evaluate on the test split and show the returned metrics
results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
print(results)
