## Sample training on 2 sentences

In [16]:
import torch
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load a BERT sequence-classification model and its matching tokenizer.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

sequence = [
    "Well hello there, how's the weather?",
    "I can't wait for the weekend!"
]

# Pad to the longest sentence in the batch and return PyTorch tensors.
batch = tokenizer(sequence, padding=True, truncation=True, return_tensors='pt')
print('Tokenized batch: \n',batch)

## Training

# Toy targets: both sentences are assigned class 1.
batch["labels"] = torch.tensor([1,1])
print('\nBatch with labels: \n',batch)

optimizer = AdamW(model.parameters())
print('\n',optimizer)

# from_pretrained() hands the model back in evaluation mode (dropout disabled);
# switch to training mode before computing a training loss.
model.train()

# Passing `labels` along with the inputs makes the model output carry a loss.
loss = model(**batch).loss
print("\n Training loss: \n", loss)

loss.backward()
optimizer.step()
# Clear the gradients so that another step (or a re-run of the lines above on
# the same model) does not accumulate on top of this one.
optimizer.zero_grad()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenized batch: 
 {'input_ids': tensor([[ 101, 2092, 7592, 2045, 1010, 2129, 1005, 1055, 1996, 4633, 1029,  102],
        [ 101, 1045, 2064, 1005, 1056, 3524, 2005, 1996, 5353,  999,  102,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]])}

Batch with labels: 
 {'input_ids': tensor([[ 101, 2092, 7592, 2045, 1010, 2129, 1005, 1055, 1996, 4633, 1029,  102],
        [ 101, 1045, 2064, 1005, 1056, 3524, 2005, 1996, 5353,  999,  102,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]]), 'labels': tensor([1, 1])}

 AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    decoupled_weight_decay: True

## data preprocessing

In [1]:
from datasets import load_dataset

# Load MRPC (Microsoft Research Paraphrase Corpus), one of the classification tasks of the GLUE benchmark.

data = load_dataset("glue", "mrpc")
# Display the resulting DatasetDict (splits, columns and row counts).
data

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [None]:
# Keep a handle on the training split and look at its first example.
train_set = data['train']
print(train_set[0])

## get label mapping
## NOTE(review): presumably the 'label' entry of `features` carries the class names — confirm in the cell output

train_set.features

In [None]:
## preprocess and turn the text into numbers
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained('bert-base-uncased')

# Passing the two sentences as separate arguments encodes them together as one pair.
inp = tok(train_set[0]['sentence1'],train_set[0]['sentence2'])
print('tokenized input: \n',inp)

# Map the ids back to token strings to see what the tokenizer produced.
print('\ndecoded tokens: \n',tok.convert_ids_to_tokens(inp.input_ids))


In [2]:
## on the entire dataset
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained('bert-base-uncased')

# Tokenize every training pair in one call. With padding=True all pairs end up
# with the same length (the output below shows num_tokens=103 for each one).
inputs = tok(
    data['train']['sentence1'][:],
    data['train']['sentence2'][:], padding=True, truncation=True)
# Peek at the first ten encodings only.
inputs[:10]


[Encoding(num_tokens=103, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=103, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=103, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=103, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=103, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=103, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=103, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=103, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=103

In [3]:
def tokenize_fn(example):
    """Tokenize the sentence pair(s) held in ``example`` without padding.

    Padding is left out on purpose: padding every sentence up front can be
    inefficient.

    Args:
        example: Mapping with 'sentence1' and 'sentence2' entries.

    Returns:
        Whatever the module-level tokenizer ``tok`` returns for the pair when
        called with ``truncation=True``.
    """
    first_sentence = example['sentence1']
    second_sentence = example['sentence2']
    encoded = tok(first_sentence, second_sentence, truncation=True)
    return encoded
    

In [4]:
## batched=True passes a batch of examples to tokenize_fn per call, which speeds up tokenization
tok_data = data.map(tokenize_fn, batched=True)

In [5]:
# Display the mapped DatasetDict; the output lists the tokenizer columns added next to the original ones.
tok_data

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [None]:
# Peek at the token ids of the first few training rows only; displaying the
# whole column would print every row of the split.
tok_data['train'][:3]['input_ids']

## Collate

In [None]:
## collate function: responsible for putting together samples inside a batch, passed when building a DataLoader.
## Here we define a collate function that will apply the correct amount of padding
## to the items of a dataset we want batched together.

from transformers import DataCollatorWithPadding, AutoTokenizer

tok = AutoTokenizer.from_pretrained("bert-base-uncased")

# The collator is built around the tokenizer it should pad with.
data_collator = DataCollatorWithPadding(tokenizer=tok)

In [87]:
## Take the first 8 training rows we would like to batch together, leaving out
## 'idx' and the raw sentence columns.
first_rows = tok_data['train'][:8]
dropped_columns = ('idx', 'sentence1', 'sentence2')
samples = {
    column: values
    for column, values in first_rows.items()
    if column not in dropped_columns
}

# Token count of each selected sample, before any padding is applied.
list(map(len, samples['input_ids']))

[50, 59, 47, 67, 59, 50, 62, 32]

In [91]:
## check to confirm padding is occurring correctly

# Run the selected samples through the collator and print the resulting batch.
collated_sample = data_collator(samples)
print(collated_sample)

{'input_ids': tensor([[  101,  2572,  3217,  5831,  5496,  2010,  2567,  1010,  3183,  2002,
          2170,  1000,  1996,  7409,  1000,  1010,  1997,  9969,  4487, 23809,
          3436,  2010,  3350,  1012,   102,  7727,  2000,  2032,  2004,  2069,
          1000,  1996,  7409,  1000,  1010,  2572,  3217,  5831,  5496,  2010,
          2567,  1997,  9969,  4487, 23809,  3436,  2010,  3350,  1012,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [  101,  9805,  3540, 11514,  2050,  3079, 11282,  2243,  1005,  1055,
          2077,  4855,  1996,  4677,  2000,  3647,  4576,  1999,  2687,  2005,
          1002,  1016,  1012,  1019,  4551,  1012,   102,  9805,  3540, 11514,
          2050,  4149, 11282,  2243,  1005,  1055,  1999,  2786,  2005,  1002,
          6353,  2509,  2454,  1998,  2853,  2009,  2000,  3647,  4576,  2005,
          1002,  1015,  1012,  1022,  4551,  1999,  2687, 

In [93]:
# Shape of every tensor in the collated batch (the output below shows the sequences padded to a common length of 67).
{k:v.shape for k,v in collated_sample.items()}

{'input_ids': torch.Size([8, 67]),
 'token_type_ids': torch.Size([8, 67]),
 'attention_mask': torch.Size([8, 67]),
 'labels': torch.Size([8])}

## Fine-tuning Trainer API

In [6]:
import os

# Intended to disable a Gradio UI.
# NOTE(review): confirm that the training stack used here actually reads this variable.
os.environ["TRL_GRADIO_ENABLED"] = "0"      # disables Gradio UI (per the author; unverified)


In [9]:
## Trainer class helps fine-tune any pretrained models with modern best practices

In [7]:
## define a TrainingArguments instance to contain all the hyperparameters the Trainer will use for training/evaluation

from transformers import TrainingArguments

# help(TrainingArguments)

train_args = TrainingArguments(
    output_dir='misc/files/training_out/',  # directory handed to the Trainer for its output
    num_train_epochs=1,                     # a single pass over the training split
    per_device_train_batch_size=4,
    report_to="none"                        # no reporting integration
)

In [8]:
## define the model

from transformers import AutoModelForSequenceClassification

# Two output labels. The warning printed below reports that the classifier
# weights/bias are newly initialized rather than loaded from the checkpoint.
classifier = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# help(Trainer)

In [None]:
# List the keys of tok_data, i.e. the names of the available splits.
tok_data.keys()

In [9]:
## Author's observation: Trainer doesn't work with a docker instance on a private network; it tries to launch gradio on localhost and fails to connect.
## NOTE(review): confirm which component launches gradio.
from transformers import Trainer, DataCollatorWithPadding, AutoTokenizer


tok = AutoTokenizer.from_pretrained("bert-base-uncased")
data_collator = DataCollatorWithPadding(tokenizer=tok)

# Wire the model, hyperparameters, splits and batching logic together.
trainer = Trainer(
    model=classifier,
    args=train_args,
    train_dataset=tok_data['train'],
    eval_dataset=tok_data['validation'],
    data_collator=data_collator,
    # processing_class specifies the tokenizer to use for processing. When it is
    # included, data_collator defaults to DataCollatorWithPadding, so the explicit
    # data_collator argument above could be left out.
    processing_class=tok
)

# from trl import SFTTrainer

# trainer = SFTTrainer(
#     model=classifier,
#     args=train_args,
#     train_dataset=tok_data['train'],
#     eval_dataset=tok_data['validation'],
#     data_collator=data_collator,
#     processing_class=tok
    
# )

In [9]:
# help(SFTTrainer)

In [11]:
## fine-tune the model
## This run gives no info on how well/badly the model performed, because:
##  - the Trainer wasn't given an evaluation schedule: "eval_strategy" was not set to either "steps" (evaluate every eval_steps) or "epoch" (evaluate at the end of each epoch)
##  - the Trainer wasn't provided a "compute_metrics()" function to calculate metrics during the selected eval strategy; without one, the loss would be the only thing printed.

trainer.train()



Step,Training Loss


TrainOutput(global_step=115, training_loss=0.3758869005286175, metrics={'train_runtime': 24.6572, 'train_samples_per_second': 148.76, 'train_steps_per_second': 4.664, 'total_flos': 150071968200960.0, 'train_loss': 0.3758869005286175, 'epoch': 1.0})

### compute-metrics

In [13]:
## function must take an EcalPrediction obj (named tuple w/ a predictions field and a label_ids field) and return a dictionary mapping strings to floats (string=names of metrics returned, floats=values)

# Run the trained model over the validation split and keep the whole prediction output.
sample_prediction = trainer.predict(tok_data['validation'])
print(sample_prediction.predictions.shape, sample_prediction.label_ids.shape) # predictions holds the logits: one row per example, one column per class

(408, 2) (408,)


In [18]:
# Metrics attached to the prediction output (loss and runtime statistics in the output below).
print(sample_prediction.metrics)

{'test_loss': 0.37257859110832214, 'test_runtime': 0.7771, 'test_samples_per_second': 525.024, 'test_steps_per_second': 9.008}


In [22]:
# Inspect the full PredictionOutput; note that its repr prints the entire predictions array.
sample_prediction

PredictionOutput(predictions=array([[-2.313618  ,  2.3366494 ],
       [ 0.81547284, -0.8704968 ],
       [ 0.7014161 , -0.75827986],
       [-1.4260672 ,  1.4813273 ],
       [ 0.72665334, -0.8391645 ],
       [-1.5905647 ,  1.6342868 ],
       [-1.6954768 ,  1.7934313 ],
       [-2.0704958 ,  2.1199925 ],
       [-2.1379464 ,  2.1585248 ],
       [-2.4118977 ,  2.3258774 ],
       [-2.2651725 ,  2.358853  ],
       [ 0.60039145, -0.7614914 ],
       [ 1.0158762 , -0.99282056],
       [-2.1160028 ,  2.1457396 ],
       [-2.4138045 ,  2.3346686 ],
       [-1.2120703 ,  1.2329226 ],
       [-2.3607144 ,  2.3910458 ],
       [ 0.0327654 , -0.21085873],
       [-2.3052642 ,  2.4219184 ],
       [ 0.7642257 , -0.7667096 ],
       [ 0.9869936 , -1.0831897 ],
       [-0.21998173,  0.07506118],
       [ 0.2530248 , -0.3443814 ],
       [-2.2821505 ,  2.227227  ],
       [-2.0993223 ,  2.1675234 ],
       [-0.29222855,  0.07752119],
       [-0.28678158,  0.20813818],
       [-2.2870302 ,  2.27

In [24]:
import numpy as np

## take the index with the maximum value on the last axis to compare to the labels

preds = np.argmax(sample_prediction.predictions, axis=-1) # predicted class index per example
# Display the predicted class ids.
preds

array([1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1,

In [30]:
!pip install --quiet evaluate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [31]:
import evaluate

## load the metrics associated with the MRPC dataset

metric = evaluate.load("glue", "mrpc")
# Score the predicted class ids against the reference labels; the output below reports accuracy and F1.
metric.compute(predictions=preds, references=sample_prediction.label_ids)

{'accuracy': 0.8357843137254902, 'f1': 0.8858603066439523}

## All together 

In [1]:
import os

# Set before the training libraries are imported in the next cell.
# NOTE(review): presumably meant to disable a Gradio UI — confirm that the training stack reads this variable.
os.environ["TRL_GRADIO_ENABLED"] = "0"  

In [2]:
## Trainer doesn't work with docker instance and private network, tries to launch gradio on localhost and fails to connect.
from transformers import Trainer, DataCollatorWithPadding, AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments
from datasets import load_dataset
import evaluate
import numpy as np

# Load MRPC (Microsoft Research Paraphrase Corpus), one of the classification tasks of the GLUE benchmark.

data = load_dataset("glue", "mrpc")

tok = AutoTokenizer.from_pretrained("bert-base-uncased")

# No up-front, fully padded encoding of the training split is built here:
# tokenization goes through tokenize_fn + data.map, and padding is applied
# per batch by the data collator.

def tokenize_fn(example):
    """Encode the 'sentence1'/'sentence2' pair(s) of ``example`` with ``tok``.

    Truncation is enabled; no padding is requested here.

    Args:
        example: Mapping with 'sentence1' and 'sentence2' entries.

    Returns:
        The value returned by the module-level tokenizer ``tok``.
    """
    sentence_pair = (example['sentence1'], example['sentence2'])
    return tok(*sentence_pair, truncation=True)

# Apply tokenize_fn to every split, one batch of examples per call.
tok_data = data.map(tokenize_fn, batched=True)

def compute_metrics_mrpc(eval_preds):
    """Score model outputs with the GLUE/MRPC metric.

    Args:
        eval_preds: A pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of the metric's ``compute`` call on the arg-max class of
        each row of logits versus the reference labels.
    """
    mrpc_metric = evaluate.load("glue","mrpc")
    raw_scores, gold_labels = eval_preds
    # The predicted class is the index of the largest logit on the last axis.
    predicted_classes = np.argmax(raw_scores, axis=-1)
    scores = mrpc_metric.compute(predictions=predicted_classes,references=gold_labels)
    return scores
    
# Model with a two-label classification head.
classifier = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)


# Pads each batch with the tokenizer at collation time.
data_collator = DataCollatorWithPadding(tokenizer=tok)

# Hyperparameters for the run: one epoch, batches of 4 per device, no reporting integration.
train_arguments = TrainingArguments(
    output_dir='misc/files/training_out/',
    num_train_epochs=1,
    per_device_train_batch_size=4,
    report_to="none"
)

# Same wiring as a plain Trainer, plus compute_metrics so that metrics beyond
# the loss are reported (see the accuracy/F1 entries in the prediction output below).
trainer = Trainer(
    model=classifier,
    args=train_arguments,
    train_dataset=tok_data['train'],
    eval_dataset=tok_data['validation'],
    data_collator=data_collator,
    processing_class=tok,
    compute_metrics=compute_metrics_mrpc
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# NOTE: despite its name, `logits` holds the whole object returned by
# trainer.predict(); its `.metrics` attribute is what gets printed below.
# NOTE(review): no trainer.train() call is visible between building the Trainer
# and this cell, so these metrics presumably come from a classifier that has
# not been fine-tuned — confirm.
logits = trainer.predict(tok_data['validation'])

print(logits.metrics)



{'test_loss': 0.6493242979049683, 'test_accuracy': 0.6838235294117647, 'test_f1': 0.8122270742358079, 'test_runtime': 1.4099, 'test_samples_per_second': 289.377, 'test_steps_per_second': 4.965}


In [5]:
### mixed precision training

# The first positional argument is the output directory; fp16=True is the
# mixed-precision switch this cell demonstrates.
training_args = TrainingArguments(
    'misc/files/training_out/',
    eval_strategy='epoch',
    fp16=True)

In [6]:
## gradient accumulation

# Note: this rebinds `training_args`, replacing the value assigned in the previous cell.
training_args = TrainingArguments(
    'misc/files/training_out/',
    eval_strategy='epoch',
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4) # Effective batch size = 4*4 = 16 (per device)

In [None]:
training_args = TrainingArguments(
    'misc/files/training_out/',
    eval_strategy='epoch',
    learning_rate=2e-5,
    lr_scheduler_type='cosine'