In [1]:
!pip install -U transformers

Collecting transformers
  Downloading transformers-4.55.4-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.55.4-py3-none-any.whl (11.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m57.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.55.2
    Uninstalling transformers-4.55.2:
      Successfully uninstalled transformers-4.55.2
Successfully installed transformers-4.55.4


In [2]:
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType

In [3]:
import pandas as pd

In [6]:
# Load the semicolon-delimited challenge data. Malformed rows are still dropped, but
# "warn" reports each of them instead of discarding them silently, so unexpected data
# loss is visible in the cell output.
df_challenge = pd.read_csv('/content/challenge_data.csv', on_bad_lines='warn', sep=';')

In [7]:
# Preview the first rows to check that the columns were parsed as expected.
df_challenge.head()

Unnamed: 0,title,abstract,group
0,Adrenoleukodystrophy: survey of 303 cases: bio...,Adrenoleukodystrophy ( ALD ) is a genetically ...,neurological|hepatorenal
1,endoscopy reveals ventricular tachycardia secrets,Research question: How does metformin affect c...,neurological
2,dementia and cholecystitis: organ interplay,Purpose: This randomized controlled study exam...,hepatorenal
3,The interpeduncular nucleus regulates nicotine...,Partial lesions were made with kainic acid in ...,neurological
4,guillain-barre syndrome pathways in leukemia,Hypothesis: statins improves stroke outcomes v...,neurological


In [8]:
# List every distinct value of the "group" column (labels are joined with "|").
print(df_challenge['group'].unique())

['neurological|hepatorenal' 'neurological' 'hepatorenal' 'cardiovascular'
 'neurological|oncological' 'cardiovascular|hepatorenal' 'oncological'
 'neurological|cardiovascular' 'cardiovascular|oncological'
 'neurological|hepatorenal|oncological'
 'neurological|cardiovascular|hepatorenal' 'hepatorenal|oncological'
 'cardiovascular|hepatorenal|oncological'
 'neurological|cardiovascular|hepatorenal|oncological'
 'neurological|cardiovascular|oncological']


In [9]:
# Class names. The position of each name is significant: it is the index used for the
# multi-hot target vector and for the model's id2label / label2id mappings.
labels = ["neurological", "hepatorenal", "cardiovascular", "oncological"]

In [10]:
# Hub checkpoint shared by the tokenizer (here) and the classification model (loaded later).
model_checkpoint = "dmis-lab/biobert-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def encode_labels(label_str, label_names=None):
    """Convert a "|"-separated label string into a multi-hot vector.

    Args:
        label_str: Label names joined with "|", e.g. "neurological|hepatorenal".
            Whitespace around each name and empty segments (such as a trailing
            separator) are ignored.
        label_names: Ordered list of known label names. Defaults to the
            notebook-level ``labels`` list.

    Returns:
        A list of 0/1 integers with one position per entry of ``label_names``.

    Raises:
        ValueError: If ``label_str`` contains a name that is not in ``label_names``.
    """
    if label_names is None:
        label_names = labels  # notebook-level list of class names

    onehot = [0] * len(label_names)
    for raw_name in label_str.split("|"):
        name = raw_name.strip()
        if not name:
            continue
        if name not in label_names:
            # Same exception type as list.index(), but with an actionable message.
            raise ValueError(
                f"Unknown label {name!r} in {label_str!r}; expected one of {list(label_names)}"
            )
        onehot[label_names.index(name)] = 1
    return onehot

# Derive the multi-hot target vector of every row from its "group" string.
df_challenge["labels"] = df_challenge["group"].map(encode_labels)

# Build the Hugging Face Dataset from the DataFrame.
dataset = Dataset.from_pandas(df_challenge)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/462 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [11]:
# Show the dataset summary (feature names and number of rows).
dataset

Dataset({
    features: ['title', 'abstract', 'group', 'labels'],
    num_rows: 3565
})

In [12]:
def tokenize(batch):
    """Tokenize a batch of articles as "<title> <abstract>".

    Args:
        batch: Mapping with parallel "title" and "abstract" sequences. A missing
            (None) or empty field is treated as an empty string so that one
            incomplete row does not abort the whole mapping.

    Returns:
        The tokenizer output for the combined texts, padded to the tokenizer's
        maximum length and truncated to it.
    """
    # `x or ''` leaves present strings untouched and replaces None with "".
    batch["text"] = [
        f"{title or ''} {abstract or ''}"
        for title, abstract in zip(batch["title"], batch["abstract"])
    ]
    return tokenizer(batch["text"], padding="max_length", truncation=True)

# Run `tokenize` over the whole dataset, one batch of rows per call.
tokenized_dataset = dataset.map(tokenize, batched=True)

Map:   0%|          | 0/3565 [00:00<?, ? examples/s]

In [13]:
# Load the checkpoint with a classification head sized to the label list.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(labels),
    # Required so that the loss and activation match a multi-label task.
    problem_type="multi_label_classification",
    # Index <-> name mappings, both derived from the order of `labels`.
    id2label=dict(enumerate(labels)),
    label2id={name: idx for idx, name in enumerate(labels)},
)

pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# LoRA adapter settings for a sequence-classification task.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,  # task type the adapter is built for
    r=16,                        # rank of the adaptation matrices
    lora_alpha=32,               # scaling factor
    lora_dropout=0.1,
    bias="none",
)

# Wrap the base model with the adapter and report how many parameters remain trainable.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()



trainable params: 592,900 || all params: 108,906,248 || trainable%: 0.5444


In [15]:
# 3. Training configuration consumed by the Trainer
training_args = TrainingArguments(
    output_dir="./results",
    report_to="none",
    # Optimisation settings
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint once per epoch; load the best checkpoint when training ends
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [16]:
import torch # Import torch
from transformers import DataCollatorWithPadding

# Switch the dataset's output format to torch tensors for the listed columns.
# Note: set_format modifies `tokenized_dataset` in place rather than returning a copy.
tokenized_dataset.set_format("torch", columns=['input_ids', 'attention_mask', 'labels'])

# Collator variant that guarantees float32 labels in every batch.
class FloatLabelsDataCollator(DataCollatorWithPadding):
    """DataCollatorWithPadding whose batches carry their "labels" as float32."""

    def __call__(self, features):
        # Let the parent collator assemble the batch, then cast only the labels.
        padded = super().__call__(features)
        padded["labels"] = padded["labels"].float()
        return padded

# Collator instance handed to the Trainer (casts the labels to float32).
data_collator = FloatLabelsDataCollator(tokenizer=tokenizer)


# Hold out 20% of the rows for validation. The fixed seed makes the split identical on
# every run, so validation metrics are comparable between runs.
train_test_split = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
tokenized_train_dataset = train_test_split['train']
tokenized_val_dataset = train_test_split['test']

# Define a simple compute_metrics function (you might want to customize this)
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score


def compute_metrics_function(eval_pred):
    """Compute multi-label evaluation metrics from raw model outputs.

    Args:
        eval_pred: ``(predictions, label_ids)`` pair. ``predictions`` holds the
            raw logits (or a tuple whose first element holds them) and
            ``label_ids`` the multi-hot targets with the same shape.

    Returns:
        dict with:
            - "accuracy": element-wise accuracy over the flattened label matrix
              (each sample/label cell counts once), not exact-match accuracy.
            - "f1_micro" / "f1_macro": F1 at a 0.5 probability threshold.
            - "roc_auc_macro": macro-averaged ROC AUC of the probabilities, or
              0.0 (with a warning) when it cannot be computed.
    """
    import warnings

    # Local names deliberately avoid `labels`, which is the notebook-level list of class names.
    logits, y_true = eval_pred
    if isinstance(logits, tuple):
        # Some models return extra outputs alongside the logits; the logits come first.
        logits = logits[0]

    # Sigmoid turns each logit into an independent per-label probability.
    probabilities = 1 / (1 + np.exp(-logits))
    # Binarise at the 0.5 threshold.
    y_pred = np.where(probabilities > 0.5, 1, 0)

    accuracy = accuracy_score(y_true.flatten(), y_pred.flatten())
    # zero_division=0 yields the same value as the default for a label with no
    # predicted/true positives, without emitting a warning on every evaluation.
    f1_micro = f1_score(y_true, y_pred, average='micro', zero_division=0)
    f1_macro = f1_score(y_true, y_pred, average='macro', zero_division=0)

    try:
        auc_score = roc_auc_score(y_true, probabilities, average='macro')
    except ValueError as exc:
        # Keep the 0.0 fallback so evaluation never aborts, but make it visible.
        warnings.warn(f"roc_auc_macro could not be computed ({exc}); reporting 0.0")
        auc_score = 0.0

    return {
        "accuracy": accuracy,
        "f1_micro": f1_micro,
        "f1_macro": f1_macro,
        "roc_auc_macro": auc_score,
    }


# Build the Trainer from the split datasets, the metrics function and the custom collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    processing_class=tokenizer,  # replaces the deprecated `tokenizer=` argument
    compute_metrics=compute_metrics_function,
    data_collator=data_collator,  # custom collator that casts labels to float32
)

  trainer = Trainer(


In [17]:
# 4. Start training (the returned TrainOutput is displayed as the cell result)
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,Roc Auc Macro
1,No log,0.594319,0.69986,0.311897,0.147193,0.601957
2,0.608300,0.567887,0.733871,0.46662,0.286119,0.698993
3,0.557300,0.473856,0.795582,0.615689,0.390477,0.843281
4,0.557300,0.398334,0.838008,0.709434,0.548701,0.882921
5,0.439400,0.349438,0.871318,0.77958,0.635892,0.904518
6,0.364000,0.318708,0.887097,0.812135,0.7486,0.91347
7,0.364000,0.302531,0.897265,0.831706,0.796252,0.917845
8,0.324200,0.291167,0.90007,0.837422,0.807223,0.921323
9,0.302100,0.286469,0.902174,0.840662,0.813369,0.922764
10,0.299000,0.2845,0.901823,0.840547,0.813601,0.923354


TrainOutput(global_step=3570, training_loss=0.4109358624583867, metrics={'train_runtime': 2267.0867, 'train_samples_per_second': 12.58, 'train_steps_per_second': 1.575, 'total_flos': 7556008056913920.0, 'train_loss': 0.4109358624583867, 'epoch': 10.0})

In [23]:
from huggingface_hub import notebook_login

In [24]:
# Show the Hugging Face login widget; authenticate here before pushing to the Hub.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [26]:
# Upload the model and its tokenizer to the same Hugging Face Hub repository
hub_repo_id = "Hiver77/MDT"
model.push_to_hub(hub_repo_id)
tokenizer.push_to_hub(hub_repo_id)

# Next step: create a Space at huggingface.co/spaces

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...pjrn4tveo/adapter_model.safetensors:  22%|##2       |  534kB / 2.38MB            

README.md: 0.00B [00:00, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Hiver77/MDT/commit/91140b25c9a7c81e91fc4c2b4d36b12f660f346e', commit_message='Upload tokenizer', commit_description='', oid='91140b25c9a7c81e91fc4c2b4d36b12f660f346e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Hiver77/MDT', endpoint='https://huggingface.co', repo_type='model', repo_id='Hiver77/MDT'), pr_revision=None, pr_num=None)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [80]:
def _resolver_categoria(label_key, label_names):
    """Translate a generic key such as 'LABEL_0' into its class name.

    Returns ``label_key`` unchanged when it does not end in ``_<index>`` with a
    valid, non-negative index into ``label_names``.
    """
    try:
        # 'LABEL_0' -> 0; str() keeps non-string values (e.g. NaN) from raising.
        label_index = int(str(label_key).split('_')[-1])
    except ValueError:
        return label_key  # key format is unexpected
    # The lower bound matters: a negative index would silently wrap to the end of the list.
    if 0 <= label_index < len(label_names):
        return label_names[label_index]
    return label_key  # index is out of bounds


def mostrar_resultados(df_resultados, num_ejemplos=5, label_names=None, model=None, top_k=3):
    """Print the first classification results in a readable form.

    Args:
        df_resultados: DataFrame with "title" and "abstract" columns plus
            "top_<j>_categoria" / "top_<j>_probabilidad" column pairs.
        num_ejemplos: Maximum number of rows to print.
        label_names: Ordered class names used to translate keys such as
            'LABEL_0'. Defaults to the notebook-level ``labels`` list.
        model: Model whose ``config.id2label`` is printed for reference.
            Defaults to the notebook-level ``model_iver`` when it exists.
        top_k: Number of ranked predictions to print per row (rank columns
            that are absent are skipped).
    """
    if label_names is None:
        label_names = labels  # notebook-level list of class names
    if model is None:
        # Fall back to the notebook-level model if one has been loaded.
        model = globals().get("model_iver")

    # Report the number of rows actually shown, which may be fewer than requested.
    n_mostrar = max(0, min(num_ejemplos, len(df_resultados)))
    print(f"\n=== PRIMEROS {n_mostrar} RESULTADOS ===")

    # Print the model's id2label mapping when available, otherwise the label list.
    if hasattr(model, 'config') and hasattr(model.config, 'id2label'):
        print("\nModel's id2label mapping:")
        print(model.config.id2label)
    else:
        print("\nModel does not have an id2label mapping in config. Using the 'labels' list.")
        print(f"Labels list: {label_names}")

    for i in range(n_mostrar):
        row = df_resultados.iloc[i]
        print(f"\n--- Artículo {i+1} ---")
        print(f"Título: {row['title']}")
        print(f"Abstract: {str(row['abstract'])[:150]}...")
        print("Clasificaciones:")

        # Show the top-ranked classifications
        for j in range(1, top_k + 1):
            categoria_col = f'top_{j}_categoria'
            prob_col = f'top_{j}_probabilidad'
            if categoria_col not in row:
                continue
            categoria = _resolver_categoria(row[categoria_col], label_names)
            prob = row[prob_col]
            print(f"  {j}. {categoria}: {prob:.4f} ({prob*100:.1f}%)")

Cargando datos desde /content/test.csv...
Datos cargados: 9 artículos encontrados
Combinando títulos y abstracts...
Iniciando clasificación...


Clasificando artículos: 100%|██████████| 1/1 [00:08<00:00,  8.71s/it]

Procesando resultados...
Resultados guardados en: clasificaciones_resultados.csv

=== PRIMEROS 5 RESULTADOS ===

Model's id2label mapping:
{0: 'LABEL_0', 1: 'LABEL_1', 2: 'LABEL_2', 3: 'LABEL_3'}

--- Artículo 1 ---
Título: Adrenoleukodystrophy: survey of 303 cases: biochemistry, diagnosis, and therapy.
Abstract: Adrenoleukodystrophy ( ALD ) is a genetically determined disorder associated with progressive central demyelination and adrenal cortical insufficiency...
Clasificaciones:
  1. neurological: 0.9416 (94.2%)
  2. hepatorenal: 0.1459 (14.6%)
  3. oncological: 0.1244 (12.4%)

--- Artículo 2 ---
Título: endoscopy reveals ventricular tachycardia secrets
Abstract: Research question: How does metformin affect cancer through pituitary adenoma mechanisms? Methods: randomized controlled study with 53 elderly patient...
Clasificaciones:
  1. neurological: 0.9397 (94.0%)
  2. hepatorenal: 0.0872 (8.7%)
  3. oncological: 0.0861 (8.6%)

--- Artículo 3 ---
Título: dementia and cholecystitis: o


