In [22]:
%pip install accelerate==0.26.1
%pip install bitsandbytes==0.42.0
%pip install datasets==2.16.1
%pip install peft==0.8.1
%pip install transformers==4.37.2
%pip install einops==0.7.0
%pip install torch==2.1.0
%pip uninstall -y transformers && pip install git+https://github.com/huggingface/transformers

Found existing installation: transformers 4.37.2
Uninstalling transformers-4.37.2:
  Successfully uninstalled transformers-4.37.2
Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-bmhfw10b
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-bmhfw10b
  Resolved https://github.com/huggingface/transformers to commit 8c12690cecbb97e187861e386f7a0ac790e4236c
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tokenizers<0.20,>=0.19 (from transformers==4.41.0.dev0)
  Downloading tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m44.1 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packag

In [23]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
import torch
# Load model
modelpath = "microsoft/phi-2"
model = AutoModelForCausalLM.from_pretrained(
    modelpath,
    device_map="auto",
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4",
    ),
    torch_dtype=torch.bfloat16,
    # FA2 does not work yet
    # attn_implementation="flash_attention_2",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [24]:
from transformers import AutoTokenizer

# fast tokenizer sometimes ignores the added tokens
tokenizer = AutoTokenizer.from_pretrained(modelpath, use_fast=False)

# add special tokens for ChatML formatting and a pad token
tokenizer.add_tokens(["<|im_start|>", "<PAD>"])
tokenizer.pad_token = "<PAD>"
tokenizer.add_special_tokens(dict(eos_token="<|im_end|>"))
model.config.eos_token_id = tokenizer.eos_token_id

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [25]:
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

# Adapter settings
lora_config = LoraConfig(
    r=32,
    lora_alpha=32,
    target_modules = [ "q_proj", "k_proj", "v_proj", "dense" ],
    modules_to_save = ["lm_head", "embed_tokens"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)

model.config.use_cache = False

from datasets import load_dataset

# load the dataset created in Part 1
dataset = load_dataset("g-ronimo/riddles_evolved")

# split into training (90%) and test set (10%)
dataset = dataset["train"].train_test_split(test_size=0.1)

In [26]:
import pandas as pd
posts_df = pd.read_csv('reddit.csv')

posts_df

Unnamed: 0,title,body,comments,q,a
0,Ford And GM Are Running Back To Trucks As EV D...,,"[""Ah yes, we all remember when GM and Ford lef...",Ford And GM Are Running Back To Trucks As EV D...,"Ah yes, we all remember when GM and Ford left ..."
1,Toyota's Hydrogen Future Is Crumbling As Owner...,,"[""Wow I never knew how bad the fuel problem ha...",Toyota's Hydrogen Future Is Crumbling As Owner...,Wow I never knew how bad the fuel problem had ...
2,F1 Drivers Surprised by China Track That's Bee...,,"[""First world power with third world practical...",F1 Drivers Surprised by China Track That's Bee...,First world power with third world practicalit...
3,All the Incredible Cars Destroyed in Cash for ...,**Link:** https://www.youtube.com/watch?v=BUWj...,"[""times were tough. I hade to trade is my gas ...",All the Incredible Cars Destroyed in Cash for ...,times were tough. I hade to trade is my gas gu...
4,Have I lost my mind? I kind of want a Camry as...,I just watched SavageGeese's video review on i...,"[""“I want one of the most common daily drivers...",Have I lost my mind? I kind of want a Camry as...,“I want one of the most common daily drivers a...
...,...,...,...,...,...
960,Do you think we have been spoiled by modern cars?,So I find myself at an interesting point in my...,"[""In terms of sheer power, absolutely, we're s...",Do you think we have been spoiled by modern ca...,"In terms of sheer power, absolutely, we're spo..."
961,Hyundai’s Genesis brand is a dark horse in U.S...,,"[""Until Hyundai, Kia, and Genesis start enforc...",Hyundai’s Genesis brand is a dark horse in U.S...,"Until Hyundai, Kia, and Genesis start enforcin..."
962,"Now that Toyota has LC300/LX, Prado/GX, Tundra...",I think Toyota is the only manufacturer where ...,"[""Probably update them at some point? They’ve ...","Now that Toyota has LC300/LX, Prado/GX, Tundra...",Probably update them at some point? They’ve al...
963,Toyota USA museum is crushing a mint condition...,,"[""Article mentions potential liability reasons...",Toyota USA museum is crushing a mint condition...,Article mentions potential liability reasons t...


In [27]:
from sklearn.model_selection import train_test_split
train_df,test_df = train_test_split(posts_df,test_size=0.2,random_state=42)

In [28]:
import datasets
dataset_dict = datasets.DatasetDict()
dataset_dict['train'] = datasets.Dataset.from_pandas(train_df)
dataset_dict['test'] = datasets.Dataset.from_pandas(test_df)
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['title', 'body', 'comments', 'q', 'a', '__index_level_0__'],
        num_rows: 772
    })
    test: Dataset({
        features: ['title', 'body', 'comments', 'q', 'a', '__index_level_0__'],
        num_rows: 193
    })
})

In [29]:
import os
from functools import partial

# ChatML format
templates = [
    "<|im_start|>assistant\n{msg}<|im_end|>",      # message by assistant
    "<|im_start|>user\n{msg}<|im_end|>"           # message by user
]

# This special index is used to ignore certain tokens during loss calculation.
IGNORE_INDEX = -100

def tokenize(input, max_length):
    input_ids, attention_mask, labels, labels_mask = [], [], [],[]

    # Iterate over each message in the dataset
    for i, (msg,msg_a) in enumerate(zip(input['q'],input['a'])):

        # Check if the message is from human (user) or assistant, apply ChatML template
        isHuman = i%2==0
        msg_chatml = templates[isHuman].format(msg=msg)
        msg_chatml_a = templates[isHuman].format(msg=msg_a)

        # tokenize all, truncate later
        msg_tokenized = tokenizer(
          msg_chatml,
          truncation=False,
          add_special_tokens=False)
        msg_tokenized_a = tokenizer(
          msg_chatml_a,
          truncation=False,
          add_special_tokens=False)
        # Copy tokens and attention mask without changes
        input_ids += msg_tokenized["input_ids"]
        attention_mask += msg_tokenized["attention_mask"]

        # Adapt labels for loss calculation: if user->IGNORE_INDEX, if assistant->input_ids  (=ignore human messages, calculate loss only for assistant messages since these are the reponses we want to learn)
        # labels += [IGNORE_INDEX]*len(msg_tokenized["input_ids"]) if isHuman else msg_tokenized["input_ids"]
        labels += msg_tokenized_a['input_ids']
        labels_mask += msg_tokenized_a['attention_mask']

    # truncate to max. length
    return {
        "input_ids": input_ids[:max_length],
        "attention_mask": attention_mask[:max_length],
        "labels": labels[:max_length],
        'labels_mask':labels_mask[:max_length]
    }

dataset_tokenized = dataset_dict.map(
    # cut samples at 1024 tokens
    # enough for the riddles dataset (max. length ~1000 tokens)
    # has to be adapted for other datasets, higher = more VRAM needed
    partial(tokenize, max_length=1024),
    batched = False,
    num_proc = os.cpu_count(),    # multithreaded
    # remove_columns = dataset_dict["train"].column_names  # Remove original columns, no longer needed
)

  self.pid = os.fork()


Map (num_proc=2):   0%|          | 0/772 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/193 [00:00<?, ? examples/s]

In [30]:
# collate function - to transform list of dictionaries [ {input_ids: [123, ..]}, {.. ] to a single dictionary forming a batch { input_ids: [..], labels: [..], attention_mask: [..] }
def collate(elements):

    # Extract input_ids from each element and find the maximum length among them
    tokens = [e["input_ids"] for e in elements]
    tokens_maxlen = max([len(t) for t in tokens])

    for i, sample in enumerate(elements):
        input_ids = sample["input_ids"]
        labels = sample["labels"]
        attention_mask = sample["attention_mask"]

        # Calculate the padding length required to match the maximum token length
        pad_len = tokens_maxlen-len(input_ids)

        # Pad 'input_ids' with the pad token ID, 'labels' with IGNORE_INDEX, and 'attention_mask' with 0
        input_ids.extend( pad_len * [tokenizer.pad_token_id] )
        labels.extend( pad_len * [IGNORE_INDEX] )
        attention_mask.extend( pad_len * [0] )

    # create and return batch with all the data in elements
    batch={
        "input_ids": torch.tensor( [e["input_ids"] for e in elements] ),
        "labels": torch.tensor( [e["labels"] for e in elements] ),
        "attention_mask": torch.tensor( [e["attention_mask"] for e in elements] ),
    }
    return batch

In [31]:
from transformers import TrainingArguments, Trainer

bs=1         # batch size
ga_steps=16  # gradient acc. steps
epochs=20
lr=0.00002

steps_per_epoch=len(dataset_tokenized["train"])//(bs*ga_steps)

args = TrainingArguments(
    output_dir="out",
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=16,
    evaluation_strategy="steps",
    logging_steps=1,
    eval_steps=steps_per_epoch//2,      # eval twice per epoch
    save_steps=steps_per_epoch,         # save once per epoch
    gradient_accumulation_steps=ga_steps,
    num_train_epochs=epochs,
    lr_scheduler_type="constant",
    optim="paged_adamw_32bit",      # val_loss will go NaN with paged_adamw_8bit
    learning_rate=lr,
    group_by_length=False,
    bf16=True,
    ddp_find_unused_parameters=False,
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=collate,
    train_dataset=dataset_tokenized["train"],
    eval_dataset=dataset_tokenized["test"],
)

trainer.train()

ValueError: Your setup doesn't support bf16/gpu. You need torch>=1.10, using Ampere GPU with cuda>=11.0

In [None]:
torch.__version__