## Intro

Let's take an existing dataset from our local Prodigy database and, step by step, fine-tune a Hugging Face `transformers` model on it.

In [4]:
from prodigy_hf.textcat import produce_train_eval_datasets,into_hf_format

# Dataset spec passed to prodigy_hf: "hmwk2-train" is listed first and
# "hmwk2-eval-review" carries the "eval:" prefix.
data = "hmwk2-train,eval:hmwk2-eval-review"

# Unpacks into the training examples, the evaluation examples and the task variant
# (presumably "binary" or "multi", derived from the annotations; confirm in prodigy_hf).
train_examples, valid_examples, variant = produce_train_eval_datasets(data)

# Peek at one raw evaluation example to see which annotation fields are available.
valid_examples[0]

{'text': "Don't tell that to Anthony Bourdain.",
 'meta': {'source': 'homework2-DSBA6188-UNCC'},
 '_input_hash': -1138854585,
 '_task_hash': -1083063769,
 'options': [{'id': 'RELEVANT', 'text': 'RELEVANT'},
  {'id': 'NOT_RELEVANT', 'text': 'NOT_RELEVANT'}],
 '_view_id': 'choice',
 'config': {'choice_style': 'single'},
 'accept': ['NOT_RELEVANT'],
 'answer': 'accept',
 '_timestamp': 1708026240,
 '_annotator_id': 'hmwk2-eval-ryan',
 '_session_id': 'hmwk2-eval-ryan',
 'sessions': ['hmwk2-eval-chang', 'hmwk2-eval-ryan'],
 'versions': [{'text': "Don't tell that to Anthony Bourdain.",
   'meta': {'source': 'homework2-DSBA6188-UNCC'},
   '_input_hash': -1138854585,
   '_task_hash': -1083063769,
   'options': [{'id': 'RELEVANT', 'text': 'RELEVANT'},
    {'id': 'NOT_RELEVANT', 'text': 'NOT_RELEVANT'}],
   '_view_id': 'choice',
   'config': {'choice_style': 'single'},
   'accept': ['NOT_RELEVANT'],
   'answer': 'accept',
   '_timestamp': 1708026240,
   '_annotator_id': 'hmwk2-eval-ryan',
   '_se

In [5]:
# Report how many annotated examples ended up in each split.
print(f"Training examples have {len(train_examples)} examples")
print(f"Validation examples have {len(valid_examples)} examples")

Training examples have 500 examples
Validation examples have 200 examples


In [7]:
from typing import Dict, Iterable, List, Literal, Optional

def get_label_names(examples: List[Dict], variant: Literal["binary", "multi"]) -> List[str]:
    """Return the label names for a text classification dataset.

    For the ``"multi"`` variant (exclusive textcat driven by choice options) the
    names are the option ids gathered across every example, in first-seen order.
    When the first example already lists all options, the result is exactly that
    example's option ids in their original order. Examples without an
    ``"options"`` key contribute nothing, and an empty ``examples`` list yields
    an empty list instead of raising ``IndexError``.

    For any other variant the fixed pair ``["accept", "reject"]`` is returned.

    Args:
        examples: Annotated examples; for ``"multi"`` each may carry an
            ``"options"`` list of dicts with an ``"id"`` entry.
        variant: ``"multi"`` to read labels from the options, otherwise binary.

    Returns:
        The label names, in a stable order suitable for building id maps.
    """
    if variant == "multi":
        # dict.fromkeys de-duplicates while preserving insertion order, so the
        # label -> id assignment stays stable across runs.
        ordered = dict.fromkeys(
            option["id"] for ex in examples for option in ex.get("options", [])
        )
        return list(ordered)
    return ["accept", "reject"]

def into_hf_format2(train_examples: List[Dict], valid_examples: List[Dict], variant: Literal["binary", "multi"]):
    """Turn Prodigy examples into the records and label maps Hugging Face expects.

    Args:
        train_examples: Annotated examples used for training. The label names
            are derived from these via ``get_label_names``.
        valid_examples: Annotated examples used for evaluation.
        variant: ``"binary"`` uses each example's ``"answer"`` as its label;
            ``"multi"`` uses the first entry of the example's ``"accept"`` list.

    Returns:
        A tuple ``(train_out, valid_out, label_names, id2label, label2id)`` where
        the first two items are lists of ``{"text": ..., "label": int}`` records.

    Examples that do not yield a label are dropped rather than raising:
    in the binary case, an answer that is not one of the label names (for
    example ``"ignore"``) or a missing answer; in the multi case, a missing or
    empty ``"accept"`` list. An accepted option that is not a known label still
    raises ``KeyError`` so that labelled data is never discarded silently.
    """
    label_names = get_label_names(train_examples, variant)
    id2label = dict(enumerate(label_names))
    label2id = {name: idx for idx, name in id2label.items()}

    def generator(examples) -> Iterable[Dict]:
        for ex in examples:
            label = None
            if variant == "binary":
                # Answers outside the label set (e.g. "ignore") map to None and are skipped.
                label = label2id.get(ex.get("answer"))
            elif variant == "multi":
                # It could be that the example was accepted but didn't have anything selected.
                accepted = ex.get("accept")
                if accepted:
                    label = label2id[accepted[0]]
            if label is not None:
                yield {
                    "text": ex["text"],
                    "label": label
                }

    train_out = list(generator(train_examples))
    valid_out = list(generator(valid_examples))
    return train_out, valid_out, label_names, id2label, label2id

# Convert both splits; the label list and the id<->label maps are derived from the training examples.
gen_train, gen_valid, label_list, id2lab, lab2id = into_hf_format2(train_examples, valid_examples, variant)
# The converted counts can be lower than the raw counts because examples without a label are dropped.
print(f"Training examples in the file were {len(train_examples)} and {len(gen_train)} examples were converted to HF format.")
print(f"Validation examples in the file were {len(valid_examples)} and {len(gen_valid)} examples were converted to HF format.")
# Show the id -> label and label -> id mappings.
print(id2lab)
print(lab2id)


Training examples in the file were 500 and 500 examples were converted to HF format.
Validation examples in the file were 200 and 200 examples were converted to HF format.
{0: 'NOT_RELEVANT', 1: 'RELEVANT'}
{'NOT_RELEVANT': 0, 'RELEVANT': 1}


In [8]:
# Wrap the converted records in a Hugging Face DatasetDict with "train" and "eval" splits
from datasets import Dataset, DatasetDict

prodigy_dataset = DatasetDict(
        train=Dataset.from_list(gen_train),
        eval=Dataset.from_list(gen_valid)
    )

# Inspect the first converted training record (text plus integer label).
prodigy_dataset['train'][0]

{'text': "A ss pan isn't going to hold it's heat as well as cast iron. If you want the best browning, it's gotta be cast iron. If you're making a pan sauce aftward, then ss is a good choice.",
 'label': 1}

In [9]:
from transformers import AutoTokenizer, DataCollatorWithPadding

# Name of the pretrained checkpoint; the tokenizer is loaded from it below.
model_name = "distilbert-base-uncased"

# Load the tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Inputs longer than the tokenizer's maximum length are truncated. No padding
    is applied here: it is left to the data collator that assembles each
    training batch, so sequences are padded only to the longest item of their
    own batch rather than to the longest item of the whole ``map`` batch.
    """
    return tokenizer(examples["text"], truncation=True)

# batched=True hands preprocess_function batches of examples instead of one example at a time.
tokenized_dataset = prodigy_dataset.map(preprocess_function, batched=True)

Map: 100%|██████████| 500/500 [00:00<00:00, 8508.61 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 8617.05 examples/s]


[Data collators](https://huggingface.co/docs/transformers/v4.37.2/en/main_classes/data_collator#data-collator) are objects that will form a batch by using a list of dataset elements as input.

In [10]:
# Collator that pads the examples of each batch using the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Display the collator's configuration.
data_collator

DataCollatorWithPadding(tokenizer=DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}, padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

In [11]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from prodigy_hf.textcat import build_metrics_func

# Load the base checkpoint for sequence classification with one output per label.
# The id<->label maps are passed along so the model can report label names instead of bare ids.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=len(id2lab), id2label=id2lab, label2id=lab2id
)

training_args = TrainingArguments(
    output_dir="./output/experiment-5/", # output directory
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch so every evaluation has a matching
    # checkpoint that load_best_model_at_end can restore.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Without an explicit metric, "best" means lowest eval loss, which can select
    # a checkpoint with poor accuracy. Rank checkpoints by the "accuracy" metric
    # reported at evaluation time instead; higher is better.
    metric_for_best_model="accuracy",
    greater_is_better=True,
    push_to_hub=False,
)

# Wire the model, training arguments, tokenized splits, tokenizer, collator and metrics together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["eval"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    # Metric function built by prodigy_hf from our label names.
    compute_metrics=build_metrics_func(label_list),
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Downloading builder script: 100%|██████████| 4.20k/4.20k [00:00<00:00, 17.4MB/s]


In [12]:
import time
print("RECIPE: Starting training.")
# Wall-clock timestamps taken immediately before and after the training run.
tic = time.time()
trainer.train()
toc = time.time()
# Elapsed time is rounded to whole seconds for display.
print(f"RECIPE: Total training time: {round(toc - tic)}s.")

RECIPE: Starting training.


  0%|          | 0/160 [00:00<?, ?it/s]

                                                
 20%|██        | 32/160 [01:49<05:29,  2.57s/it]

{'eval_loss': 0.8563199043273926, 'eval_accuracy': 0.315, 'eval_runtime': 11.7239, 'eval_samples_per_second': 17.059, 'eval_steps_per_second': 1.109, 'epoch': 1.0}


                                                
 40%|████      | 64/160 [03:39<03:41,  2.31s/it]

{'eval_loss': 0.9309298992156982, 'eval_accuracy': 0.59, 'eval_runtime': 11.9687, 'eval_samples_per_second': 16.71, 'eval_steps_per_second': 1.086, 'epoch': 2.0}


                                                
 60%|██████    | 96/160 [05:32<02:34,  2.41s/it]

{'eval_loss': 1.1233174800872803, 'eval_accuracy': 0.615, 'eval_runtime': 11.8562, 'eval_samples_per_second': 16.869, 'eval_steps_per_second': 1.096, 'epoch': 3.0}


                                                 
 80%|████████  | 128/160 [07:24<01:18,  2.45s/it]

{'eval_loss': 1.0534493923187256, 'eval_accuracy': 0.625, 'eval_runtime': 11.9322, 'eval_samples_per_second': 16.761, 'eval_steps_per_second': 1.089, 'epoch': 4.0}


                                                 
100%|██████████| 160/160 [09:19<00:00,  2.61s/it]

{'eval_loss': 1.0957821607589722, 'eval_accuracy': 0.62, 'eval_runtime': 11.7455, 'eval_samples_per_second': 17.028, 'eval_steps_per_second': 1.107, 'epoch': 5.0}


100%|██████████| 160/160 [09:23<00:00,  3.52s/it]

{'train_runtime': 563.3242, 'train_samples_per_second': 4.438, 'train_steps_per_second': 0.284, 'train_loss': 0.353403377532959, 'epoch': 5.0}
RECIPE: Total training time: 564s.





In [13]:
from transformers import pipeline

# Build a text-classification pipeline from the checkpoint-128 directory under the training output directory.
# NOTE(review): this assignment rebinds the imported `pipeline` function to the object it returns,
# so the factory is only available again after the import above is re-executed.
pipeline = pipeline("text-classification", "./output/experiment-5/checkpoint-128")

# Classify two ad-hoc sentences as a quick sanity check.
pipeline(["My recipes includes bananas, apples, and pears.", "Basketball is my favorite sport."])

[{'label': 'RELEVANT', 'score': 0.972578227519989},
 {'label': 'NOT_RELEVANT', 'score': 0.7652834057807922}]

## Possible Next Steps

1. Upload your datasets to the Hugging Face Hub as HF datasets (see the [Prodigy docs](https://prodi.gy/docs/plugins#hf-hub)) so that you can train on Colab with a GPU.

```
# Make sure you're logged in with huggingface-cli login first
# change username/reponame to your own HF username and new reponame
prodigy hf.upload hmwk2-train,eval:hmwk2-eval-review username/reponame
```

2. Upload your model to HF Hub

```
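# Push model to the Hub. Note: the string passed to trainer.push_to_hub is used as the commit message;
# the repo name comes from `hub_model_id` in TrainingArguments (by default, the name of the output directory).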
# Make sure you're logged in with huggingface-cli login first
trainer.push_to_hub("my-awesome-model")
```

3. Swap in a different base model by changing `model_name` (search the Hugging Face Hub for candidate models).