In [None]:
!pip install kubeflow-training

In [None]:
!pip install -U transformers datasets accelerate evaluate

In [None]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    AdamW,
    get_scheduler,
)

import transformers


from datasets import (
    Dataset,
    load_dataset
)

import torch

from torch.utils.data import DataLoader

import evaluate

import numpy as np

In [None]:
def train_func():
    """Fine-tune ``FacebookAI/roberta-large`` as a topic classifier on Yahoo Answers.

    Intended to be passed as the training function to ``TrainingClient().create_job``.
    Every import and helper lives inside the function body so that it does not rely
    on notebook-level state (presumably required because the function is shipped to
    remote workers -- confirm against the Kubeflow SDK).

    Steps:
      1. Load 20% of the ``yahoo_answers_topics`` train split and 5% of its test
         split; join ``question_title``/``question_content``/``best_answer`` into one
         text per example and tokenize it.
      2. Hold out 10% of the tokenized train data as a validation split.
      3. Train for two epochs with exactly one of two interchangeable back-ends,
         selected by ``use_hf_trainer`` below:
           * a hand-written PyTorch DistributedDataParallel loop (default), or
           * the Hugging Face ``Trainer``.
      4. (PyTorch loop only) print the validation F1 from rank 0.

    Returns:
        None.
    """
    import inspect
    import os

    import evaluate
    import numpy as np
    import torch
    from datasets import DatasetDict, load_dataset
    from torch.utils.data import DataLoader
    from torch.utils.data.distributed import DistributedSampler
    from transformers import (
        AutoModelForSequenceClassification,
        AutoTokenizer,
        DataCollatorWithPadding,
        Trainer,
        TrainingArguments,
        get_scheduler,
    )

    # ---- configuration -------------------------------------------------------
    model_name = "FacebookAI/roberta-large"
    dataset_name = "yahoo_answers_topics"
    raw_columns = ["id", "topic", "question_title", "question_content", "best_answer"]
    batch_size = 8
    num_epochs = 2
    # The task has more than two classes, so F1 needs an explicit averaging mode
    # (the metric's default, "binary", rejects multi-class labels).
    f1_average = "weighted"
    # The two training back-ends are alternatives; running both would train twice.
    use_hf_trainer = False

    def preprocess_dataset(examples):
        """Tokenize one batch of raw rows and attach the topic id as ``labels``."""
        texts = [
            f"{qt} {qc}? {ba}"
            for qt, qc, ba in zip(
                examples["question_title"],
                examples["question_content"],
                examples["best_answer"],
            )
        ]
        # Padding is deferred to the data collator so each batch is padded only
        # to its own longest sequence.
        tokenized = tokenizer(texts, truncation=True, padding=False)
        return {
            "input_ids": tokenized["input_ids"],
            "attention_mask": tokenized["attention_mask"],
            "labels": examples["topic"],
        }

    def compute_metrics(eval_pred):
        """Turn ``(logits, labels)`` from the ``Trainer`` into an ``{"f1": ...}`` dict."""
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        return metric.compute(
            predictions=predictions, references=labels, average=f1_average
        )

    metric = evaluate.load("f1")

    # ---- data and model ------------------------------------------------------
    dataset = load_dataset(dataset_name, split="train[:20%]")
    label_names = dataset.features["topic"].names
    id2label = dict(enumerate(label_names))
    label2id = {label: i for i, label in id2label.items()}

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=len(label_names),
        id2label=id2label,
        label2id=label2id,
    )
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    tokenized_dataset = dataset.map(
        preprocess_dataset, batched=True, remove_columns=raw_columns
    )
    train_validation = tokenized_dataset.train_test_split(test_size=0.1, seed=0)

    ready_dataset = DatasetDict({
        "train": train_validation["train"],
        "validation": train_validation["test"],
        # Held-out split; built here but not consumed by either training path.
        "test": load_dataset(dataset_name, split="test[:5%]").map(
            preprocess_dataset, batched=True, remove_columns=raw_columns
        ),
    })

    def train_with_hf_trainer():
        """Train with the Hugging Face ``Trainer``.

        The model is passed unwrapped and no process group is created here:
        the ``Trainer`` is left to set up distributed training itself.
        """
        training_kwargs = {
            "output_dir": "models-roberta",
            "save_strategy": "epoch",
            "learning_rate": 2e-5,
            "per_device_train_batch_size": batch_size,
            "per_device_eval_batch_size": batch_size,
            "num_train_epochs": num_epochs,
            "weight_decay": 0.01,
            "metric_for_best_model": "f1",
            "load_best_model_at_end": True,
            "report_to": "none",
        }
        # NOTE(review): recent transformers releases renamed `evaluation_strategy`
        # to `eval_strategy` and the Trainer's `tokenizer` to `processing_class`;
        # pick whichever keyword the installed version actually accepts.
        args_params = inspect.signature(TrainingArguments.__init__).parameters
        eval_key = (
            "eval_strategy" if "eval_strategy" in args_params else "evaluation_strategy"
        )
        training_kwargs[eval_key] = "epoch"

        trainer_kwargs = {
            "model": model,
            "args": TrainingArguments(**training_kwargs),
            "train_dataset": ready_dataset["train"],
            "eval_dataset": ready_dataset["validation"],
            "data_collator": data_collator,
            "compute_metrics": compute_metrics,
        }
        trainer_params = inspect.signature(Trainer.__init__).parameters
        tokenizer_key = (
            "processing_class" if "processing_class" in trainer_params else "tokenizer"
        )
        trainer_kwargs[tokenizer_key] = tokenizer

        Trainer(**trainer_kwargs).train()

    def train_with_torch_ddp():
        """Train and evaluate with a plain PyTorch DistributedDataParallel loop."""
        use_cuda = torch.cuda.is_available()
        if use_cuda:
            # LOCAL_RANK is expected from the launcher; fall back to GPU 0 when absent.
            local_rank = int(os.environ.get("LOCAL_RANK", 0)) % torch.cuda.device_count()
            device = torch.device("cuda", local_rank)
            torch.cuda.set_device(device)
        else:
            device = torch.device("cpu")

        # NCCL only works with CUDA tensors; use Gloo when no GPU is visible.
        torch.distributed.init_process_group(backend="nccl" if use_cuda else "gloo")
        try:
            # The model must already sit on its device when DDP wraps it.
            ddp_model = torch.nn.parallel.DistributedDataParallel(
                model.to(device),
                device_ids=[device.index] if use_cuda else None,
            )

            ready_dataset.set_format("torch")
            train_split = ready_dataset["train"]
            # Shard the training data so that ranks do not all see the same samples.
            train_sampler = DistributedSampler(train_split, shuffle=True)
            train_dataloader = DataLoader(
                train_split,
                sampler=train_sampler,
                batch_size=batch_size,
                collate_fn=data_collator,
            )
            eval_dataloader = DataLoader(
                ready_dataset["validation"],
                batch_size=batch_size,
                collate_fn=data_collator,
            )

            optimizer = torch.optim.AdamW(ddp_model.parameters(), lr=5e-5)
            lr_scheduler = get_scheduler(
                "linear",
                optimizer=optimizer,
                num_warmup_steps=0,
                num_training_steps=num_epochs * len(train_dataloader),
            )

            ddp_model.train()
            for epoch in range(num_epochs):
                # Re-seed the sampler so each epoch uses a different shuffle.
                train_sampler.set_epoch(epoch)
                for batch in train_dataloader:
                    batch = {k: v.to(device) for k, v in batch.items()}
                    loss = ddp_model(**batch).loss
                    loss.backward()

                    optimizer.step()
                    lr_scheduler.step()
                    optimizer.zero_grad()

            # Every rank scores the full validation split, so all ranks issue the
            # same sequence of forward passes through the DDP wrapper.
            ddp_model.eval()
            for batch in eval_dataloader:
                batch = {k: v.to(device) for k, v in batch.items()}
                with torch.no_grad():
                    logits = ddp_model(**batch).logits
                metric.add_batch(
                    predictions=torch.argmax(logits, dim=-1).cpu(),
                    references=batch["labels"].cpu(),
                )

            scores = metric.compute(average=f1_average)
            if torch.distributed.get_rank() == 0:
                print(f"validation metrics: {scores}")
        finally:
            torch.distributed.destroy_process_group()

    if use_hf_trainer:
        train_with_hf_trainer()
    else:
        train_with_torch_ddp()

In [None]:
# Start PyTorchJob with 100 Workers and 2 GPUs per Worker.
from kubeflow.training import TrainingClient

# `train_func` imports these packages at run time inside the worker, so they have
# to be installed in the worker container before the function starts
# (scikit-learn is required by the `evaluate` F1 metric).
# NOTE(review): assumes the worker base image does not already ship them -- confirm.
# NOTE(review): newer kubeflow-training releases appear to name the `func` keyword
# `train_func` -- confirm against the installed SDK version.
TrainingClient().create_job(
    name="pytorch-ddp",
    func=train_func,
    num_workers=100,
    resources_per_worker={"gpu": "2"},
    packages_to_install=[
        "transformers",
        "datasets",
        "accelerate",
        "evaluate",
        "scikit-learn",
    ],
)