In [2]:
# Installation of dependencies:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets
!pip install -q wandb

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m75.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m62.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m4.0 MB/

In [3]:
# Loading of the model, which in this case, will be GPT2-xl.
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

"""
Using bfloat16 instead of fp16 since it consumes less memory and is faster to download.
The model will subsequently be loaded in 4bit precision.
Normal-float 4bit (and not default fp4) will be used for better optimization.
Usage of double quantization, and bfloat16 as the compute type, since computation is
still occuring in higher precision.
"""
model_path = "crumbly/gpt2-linear-xl-sharded-bf16"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, device_map={"":0}, quantization_config=bnb_config, trust_remote_code=True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

Downloading (…)nfiguration_gpt2l.py:   0%|          | 0.00/12.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/crumbly/gpt2-linear-xl:
- configuration_gpt2l.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading (…)in/modeling_gpt2l.py:   0%|          | 0.00/42.3k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/crumbly/gpt2-linear-xl:
- modeling_gpt2l.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading (…)model.bin.index.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00003.bin:   0%|          | 0.00/1.06G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00003.bin:   0%|          | 0.00/1.06G [00:00<?, ?B/s]

Downloading (…)l-00003-of-00003.bin:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

In [4]:
# Generation, to see if the model was loaded in correctly.
inputs = {k:v.cuda() for k, v in tokenizer("Once upon a time,", return_tensors='pt').items()}
outputs = model.generate(**inputs, max_new_tokens=32, temperature=0.7, do_sample=True)
tokenizer.decode(outputs[0])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'Once upon a time, there was a large and varied galaxy, with many different star systems, inhabited planets, and even other galaxies.\n\nThis galaxy was a lot more complex,'

In [5]:
# this isn't supported yet with the GPT2 model we use, but for other models:
# uncomment these lines and run them
# from peft import prepare_model_for_kbit_training
# model.gradient_checkpointing_enable()
# model = prepare_model_for_kbit_training(model)

In [6]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [7]:
# Lora configutation for finetuning.
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    # ReLoRA uses r=128 by default in their code, but r=1 will even work to a degree
    r=8,
    lora_alpha=32,
    # c_attn is our qkv
    target_modules=["c_attn"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 2457600 || all params: 822788800 || trainable%: 0.2986914746530337


In [8]:
# Lets now download the Open Orca dataset, and finetune our model on the set.
# New line to delimit between the system prompt, question, and response for simplicity.
from datasets import load_dataset

# Lets set streaming=True, to stream examples over the internet, rather than downloading the
# entire dataset for processing.
data = load_dataset("Open-Orca/OpenOrca", streaming=True)

# Removal of trailing spaces or newlines from the prompts in the dataset.
def strip(batch):
  return [i.strip() for i in list(batch)]

# Process function that takes a batch from the dataset as input, use strip function on the
# batch keys that correspond to the system prompt, questions and responses. It then zip
# them into a tuple, join the tuples with newlines, then strips training spaces.
def process(batch):
  systems = [i for i in strip(batch['system_prompt'])]
  questions = [i for i in strip(batch['question'])]
  responses = [i for i in strip(batch['response'])]
  prompts = zip(systems, questions, responses)
  prompts = ["\n".join(i) for i in prompts]
  prompts = strip(prompts)
  return prompts

# Set the max length to a value thats lower than normal, so its not out of memory.
tokenizer.model_max_length = 768
data = data.map(lambda samples: tokenizer(process(samples), truncation=True), batched=True)

Downloading readme:   0%|          | 0.00/9.37k [00:00<?, ?B/s]

In [9]:
# Training code.
import transformers

tokenizer.pad_token = tokenizer.eos_token

trainer = transformers.Trainer(
    model=model,
    train_dataset=data['train'],
    args=transformers.TrainingArguments(
        # Your 'effective batch size' is the product of these two values:
        per_device_train_batch_size = 1,
        gradient_accumulation_steps=8,

        # You can count the examples you are going to train on by multiplying
        # max_steps by your effective batch size. In this example, will train
        # 512 examples.
        max_steps=64,
        warmup_steps=16,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=4,
        output_dir="outputs",
        optim="paged_adamw_8bit",

        # If you want to log the loss graph to your wandb, change "none" to "wandb".
        report_to="none"
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

# Silence the warnings. Please re-enable for inference!
model.config.use_cache=False
trainer.train()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
4,2.8494
8,2.5073
12,2.743
16,2.5348
20,2.8063
24,2.6181
28,2.5188
32,2.5385
36,2.4373
40,2.3642


TrainOutput(global_step=64, training_loss=2.499387666583061, metrics={'train_runtime': 295.7301, 'train_samples_per_second': 1.731, 'train_steps_per_second': 0.216, 'total_flos': 802220553600000.0, 'train_loss': 2.499387666583061, 'epoch': 1.0})

In [15]:
prompt = """Classify the text into neutral, negative, or positive.

Text: I think the food was okay.

Sentiment:"""

inputs = {k:v.cuda() for k, v in tokenizer("prompt", return_tensors='pt').items()}
outputs = model.generate(**inputs, max_new_tokens=32, temperature=0.7, do_sample=True)
tokenizer.decode(outputs[0])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'prompt:\n\nA story or a series of events, usually involving a main character, that explain or explain away some phenomenon.\n\nThe story of a plane'