In [1]:
# Install required packages
!pip install transformers datasets sentencepiece



In [18]:
# TaskC_Train_BART.ipynb

from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
import torch

# 1. Load data
train_df = pd.read_csv("../dataset/task_b+c/data/challenge_data/train.csv")
val_df = pd.read_csv("../dataset/task_b+c/data/challenge_data/valid.csv")

# 2. Prepare data
train_df = train_df[['note', 'dialogue']].rename(columns={'note': 'input_text', 'dialogue': 'target_text'})
val_df = val_df[['note', 'dialogue']].rename(columns={'note': 'input_text', 'dialogue': 'target_text'})

train_df['input_text'] = "Note: " + train_df['input_text'] + "\n\nGenerate a doctor-patient conversation:"
val_df['input_text'] = "Note: " + val_df['input_text'] + "\n\nGenerate a doctor-patient conversation:"

# 3. HuggingFace dataset
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# 4. Load BART model and tokenizer
model_name = "facebook/bart-base"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# 5. Tokenize function
def tokenize(example):
    model_inputs = tokenizer(
        example["input_text"],
        max_length=512,
        truncation=True,
        padding="max_length"
    )
    labels = tokenizer(
        example["target_text"],
        max_length=512,
        truncation=True,
        padding="max_length"
    )

    # Important: mask PAD tokens in the labels
    labels["input_ids"] = [
        token if token != tokenizer.pad_token_id else -100
        for token in labels["input_ids"]
    ]
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# 6. Apply tokenization
tokenized_train = train_dataset.map(tokenize)
tokenized_val = val_dataset.map(tokenize)

# 7. Training arguments
training_args = TrainingArguments(
    output_dir="./results_taskC_bart",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=8,
    weight_decay=0.01,
    logging_dir='./logs_taskC_bart',
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to="none"
)

# 8. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer
)

# 9. Train
trainer.train()

# 10. Save the model
model.save_pretrained("./results_taskC_bart")
tokenizer.save_pretrained("./results_taskC_bart")


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Map: 100%|██████████| 67/67 [00:01<00:00, 46.47 examples/s]
Map: 100%|██████████| 20/20 [00:00<00:00, 44.74 examples/s]
  trainer = Trainer(
  attn_output = torch.nn.functional.scaled_dot_product_attention(


Epoch,Training Loss,Validation Loss
1,3.1743,2.709176
2,2.744,2.481676
3,2.4195,2.411508
4,2.298,2.366663
5,2.2679,2.350805
6,2.148,2.342839
7,2.0951,2.32012
8,1.9945,2.317264


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


('./results_taskC_bart\\tokenizer_config.json',
 './results_taskC_bart\\special_tokens_map.json',
 './results_taskC_bart\\vocab.json',
 './results_taskC_bart\\merges.txt',
 './results_taskC_bart\\added_tokens.json')

In [17]:
model.save_pretrained("./results_taskC")
tokenizer.save_pretrained("./results_taskC")


('./results_taskC\\tokenizer_config.json',
 './results_taskC\\special_tokens_map.json',
 './results_taskC\\spiece.model',
 './results_taskC\\added_tokens.json')