In [None]:
from datasets import load_dataset
# Load the "harmless-base" data directory of the "Anthropic/hh-rlhf" dataset.
datasets = load_dataset(
    path="Anthropic/hh-rlhf",
    data_dir="harmless-base",
)
# Bare last expression so the notebook shows the dataset's repr.
datasets

In [None]:
# Turn one raw 'chosen' transcript into a flat list of utterances with the
# speaker tags removed.
instance = datasets["train"][0]['chosen']

# Fragments are separated by blank lines; trim each one and drop the empty ones.
dialogue_list = [part.strip() for part in instance.split('\n\n') if part.strip()]

speaker_tags = ('Human:', 'Assistant:')
res = []
for dialogue in dialogue_list:
    for tag in speaker_tags:
        if dialogue.startswith(tag):
            # Slice the exact prefix off. str.lstrip(chars) treats its argument
            # as a *set* of characters, so it can also eat the first letters of
            # the utterance itself (e.g. 'Human:name' -> 'e').
            res.append(dialogue[len(tag):].strip())
            break
    else:
        # No speaker tag: the fragment is a continuation of the previous
        # utterance that was cut at an embedded blank line, so glue it back.
        if res:
            res[-1] += '\n\n' + dialogue
        else:
            # Nothing to attach to yet (text before the first tag): keep it as
            # its own entry instead of failing on res[-1].
            res.append(dialogue)

print(res)

In [1]:
from datasets import load_dataset

# Load the dataset found at 'data/Anthropic' (presumably a local, already
# pre-processed copy -- the displayed columns are 'query' and 'reference').
datasets = load_dataset(path='data/Anthropic')
# Bare last expression so the notebook shows the dataset's repr.
datasets

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['query', 'reference'],
        num_rows: 104054
    })
    test: Dataset({
        features: ['query', 'reference'],
        num_rows: 5756
    })
})

In [2]:
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM

# Every artifact below comes from the same checkpoint.
checkpoint = "microsoft/DialoGPT-small"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Use the end-of-sequence token for padding as well.
tokenizer.pad_token = tokenizer.eos_token

config = AutoConfig.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint, config=config)

In [3]:
def tokenize_and_align_labels(examples):
    """
    Tokenize a batch of examples for training.

    The 'query' column becomes the model inputs and the 'reference' column
    becomes the labels. Both are tokenized with truncation enabled and without
    padding, using the module-level ``tokenizer``.

    Args:
        examples: A batch with 'query' and 'reference' entries (the function is
            mapped with ``batched=True``).

    Returns:
        The tokenizer output for the queries, with an extra 'labels' entry
        holding the token ids of the references.
    """
    encode_options = dict(padding=False, truncation=True)
    encoded_queries = tokenizer(examples['query'], **encode_options)
    encoded_references = tokenizer(examples['reference'], **encode_options)
    # Only the reference token ids are kept; their attention mask is dropped.
    encoded_queries['labels'] = encoded_references['input_ids']
    return encoded_queries

# Take the 300-example sample *before* tokenizing. Mapping the whole train
# split first and then keeping 300 rows yields the same rows but tokenizes
# every other example for nothing.
train_subset = datasets['train'].select(range(300))

# Tokenize the sample and drop the original text columns, leaving only the
# columns produced by tokenize_and_align_labels.
train_dataset = train_subset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=train_subset.column_names,
)

train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 300
})

In [4]:
from torch.utils.data import DataLoader
from transformers import DataCollatorForSeq2Seq

# Labels are padded with the tokenizer's pad id.
# NOTE(review): this is a real token id rather than an ignore index such as
# -100, so padded label positions would count toward a standard cross-entropy
# loss unless they are masked downstream -- confirm.
label_pad_token_id = tokenizer.pad_token_id

# Collator that pads inputs and labels per batch (no rounding of the padded
# length to a multiple).
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    pad_to_multiple_of=None,
    label_pad_token_id=label_pad_token_id,
)

# Shuffled batches of 3 examples each.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=3,
    shuffle=True,
    collate_fn=data_collator,
)
# To inspect a single collated batch:
# inputs = next(iter(train_dataloader))
# print(inputs)

2024-02-09 10:28:47.002895: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-02-09 10:28:47.150089: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-02-09 10:28:47.150122: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2024-02-09 10:28:47.833078: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2024-

In [8]:
import sys 
sys.dont_write_bytecode = True
from models import _prepare_decoding_inputs
from transformers import AdamW, get_scheduler

# Sanity check on the un-split model: take one optimizer step on a single
# batch and confirm that the loss on that same batch changes afterwards.
optimizer = AdamW(model.parameters(), lr=1e-4)
scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=100,
)

# Draw the batch here. Otherwise this cell only works when `inputs` happens to
# be left in the kernel by another cell, and fails with NameError on a fresh
# Restart & Run All.
inputs = next(iter(train_dataloader))
new_inputs = _prepare_decoding_inputs(inputs)

# Forward pass for CLM
outputs = model(**new_inputs)
loss = outputs.loss
print('decoding loss: ', loss)
loss.backward()

# Apply the gradients, advance the learning-rate schedule, then reset the
# gradients so they do not accumulate into a later backward pass.
optimizer.step()
scheduler.step()
optimizer.zero_grad()

# Another forward pass for CLM on the same batch, after the update
outputs = model(**new_inputs)
loss = outputs.loss
print('decoding loss: ', loss)



decoding loss:  tensor(4.9014, grad_fn=<NllLossBackward0>)
decoding loss:  tensor(2.4446, grad_fn=<NllLossBackward0>)


In [7]:
import torch
from typing import Iterable
from models import get_stages, stages_decoding
from collections import defaultdict
import sys 
sys.dont_write_bytecode = True
from transformers import get_scheduler

# Build the pipeline stages with `get_stages` from the local `models` module:
# 4 stages requested, starting at device 0, with a fresh defaultdict(list) for
# timing information.
stages = get_stages(
    config=config,
    token=None,
    model_name_or_path="microsoft/DialoGPT-small",
    num_stages=4,
    init_device=0,
    timing_info=defaultdict(list),
)

# A single optimizer drives all stages: gather every stage's parameters into
# one flat list and hand that to AdamW.
all_parameters = [param for stage in stages for param in stage.parameters()]
optimizer = torch.optim.AdamW(all_parameters, lr=1e-4)

# Linear learning-rate schedule: no warmup, 100 training steps.
schedulers = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_training_steps=100,
    num_warmup_steps=0,
)

# Per-step loss values collected by the training loop.
losses = []

@torch.no_grad()
def custom_step(lr: float):
    """
    Apply one plain gradient-descent update, in place, to every stage.

    Walks the module-level ``stages`` list and subtracts ``lr * grad`` from
    each parameter that has a gradient. Parameters whose ``.grad`` is ``None``
    are left untouched. The whole update runs under ``torch.no_grad()`` so it
    is not tracked by autograd.

    Args:
        lr (float): The learning rate to use for the update.
    """
    every_param = (param for stage in stages for param in stage.parameters())
    for param in every_param:
        if param.grad is None:
            # Nothing was back-propagated into this parameter; skip it.
            continue
        param -= lr * param.grad

# One pass over train_dataloader, one optimizer step per batch. With 300
# examples and batch_size=3 that is 100 steps, matching num_training_steps.
for inputs in train_dataloader:
    # Use the batch the loop just drew. Calling next(iter(train_dataloader))
    # here would build a brand-new shuffled iterator on every step and throw
    # the loop's own batch away.
    outputs = stages_decoding(stages, inputs)
    loss = outputs[0]
    losses.append(loss.item())
    print('decoding loss: ', loss)
    loss.backward()

    # Apply the accumulated gradients to every stage's parameters.
    optimizer.step()
    # Alternative: plain gradient descent with the current scheduled rate.
    # custom_step(schedulers.get_last_lr()[0])

    # Update learning rate
    schedulers.step()

    # Clear gradients so they do not accumulate into the next step
    optimizer.zero_grad()

# Guard against an empty loader so the summary never divides by zero.
print('average loss: ', sum(losses) / len(losses) if losses else float('nan'))


Put stage GPTStartingStage (60647424 parameters) on device 0
Put stage GPTIntermediateStage (21263616 parameters) on device 1
Put stage GPTIntermediateStage (21263616 parameters) on device 2
Put stage GPTEndingStage (59862528 parameters) on device 3
Forward pass for stage 0 on device 0
Forward pass for stage 1 on device 1
Forward pass for stage 2 on device 2
Forward pass for stage 3 on device 3
decoding loss:  tensor(6.1026, device='cuda:3', grad_fn=<BackwardHookFunctionBackward>)
Forward pass for stage 0 on device 0
Forward pass for stage 1 on device 1
Forward pass for stage 2 on device 2
Forward pass for stage 3 on device 3
decoding loss:  tensor(4.0280, device='cuda:3', grad_fn=<BackwardHookFunctionBackward>)
Forward pass for stage 0 on device 0
Forward pass for stage 1 on device 1
Forward pass for stage 2 on device 2
Forward pass for stage 3 on device 3
decoding loss:  tensor(5.0473, device='cuda:3', grad_fn=<BackwardHookFunctionBackward>)
Forward pass for stage 0 on device 0
Forwa

In [14]:
# optimizer.state_dict()