# Setup

### Install System Dependencies

In [1]:
# Refresh the APT package index before installing anything.
!apt-get update

# Install the CUDA 11.8 toolkit non-interactively (-y), skipping recommended extras.
!apt-get install -y --no-install-recommends cuda-toolkit-11-8

0% [Working]            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Hit:6 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:8 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Fetched 229 kB in 2s (138 kB/s)
Reading package lists... Done
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
cuda-toolkit-11-8 is already the newest version (11.8.0-1).
0

### Mount Google Drive

In [2]:
# Colab-only helper; this import is unavailable outside Google Colab.
from google.colab import drive

# Mount Google Drive at /content/drive; later cells read inputs from paths under it.
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Install Python Dependencies

In [3]:
# Install/upgrade the Python packages listed in the requirements file on Drive,
# bounded by the accompanying constraints file, with the PyTorch cu118 wheel
# index as an additional package source.
# NOTE(review): this runs pip through `python3` in a subshell; confirm that is
# the same interpreter the kernel uses.
!python3 -m pip install \
    --upgrade \
    --requirement "/content/drive/MyDrive/2024-01-26/requirements.fine_tuning.txt" \
    --constraint "/content/drive/MyDrive/2024-01-26/constraints.fine_tuning.txt" \
    --extra-index-url "https://download.pytorch.org/whl/cu118"

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu118


### CUDA Configuration

In [4]:
import os

# Directories that should be visible through LD_LIBRARY_PATH, in lookup order.
cuda_library_directories = [
    "/usr/local/cuda-11/lib64",
    "/usr/local/cuda-11.8/lib64",
    "/usr/lib64-nvidia",
]

# Read the current value with a default so the cell also works when the
# variable is not set at all (plain `os.environ[...] +=` raises KeyError then).
# Empty entries are dropped so no stray leading/trailing ':' is produced.
existing_library_directories = [
    entry for entry in os.environ.get("LD_LIBRARY_PATH", "").split(":") if entry
]

# dict.fromkeys keeps first-seen order while removing duplicates, so running
# this cell repeatedly leaves the variable unchanged instead of growing it.
# NOTE(review): updating the variable here only affects code that reads it
# after this point (and child processes) -- confirm that is sufficient.
os.environ["LD_LIBRARY_PATH"] = ":".join(
    dict.fromkeys(existing_library_directories + cuda_library_directories)
)

# Import Python Packages

In [5]:
import gc
import json
import pathlib
import shutil
import time

import datasets
import peft
import torch
import transformers
import trl

# Create Dataset Splits

In [6]:
# Name of the zip archive (without extension) and the local directory that
# hold the Hugging Face formatted copy of the dataset.
hugging_face_dataset_archive_name = "hugging_face_dataset_archive"
hugging_face_dataset_path = pathlib.Path("hugging_face_dataset_directory")

# JSON file with the raw documents, read from the mounted Google Drive.
raw_dataset_path = pathlib.Path(
    "/content/drive/MyDrive/2024-01-26",
    "json_documents.json",
)

In [7]:
# Read the whole UTF-8 encoded file and parse it as JSON in one step.
raw_dataset = json.loads(raw_dataset_path.read_text(encoding="utf-8"))

In [8]:
# One bucket per split name; a document naming any other split raises KeyError.
dataset_splits = {"train": [], "validation": [], "test": []}

# Route every tuning document to the bucket named by its own "split" field.
for tuning_document in raw_dataset["tuning_documents"]:
    split_documents = dataset_splits[tuning_document["split"]]
    split_documents.append(tuning_document)

In [9]:
# Wrap each list of documents in a Dataset and collect them under their split name.
hugging_face_dataset = datasets.DatasetDict()

for split_type, split_data in dataset_splits.items():
    hugging_face_dataset[split_type] = datasets.Dataset.from_list(split_data)

### Backup Dataset

In [10]:
# Persist all splits to the local dataset directory; the tuning section reloads
# the dataset from this same path.
hugging_face_dataset.save_to_disk(hugging_face_dataset_path)

Saving the dataset (0/1 shards):   0%|          | 0/1703 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/577 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/574 [00:00<?, ? examples/s]

In [11]:
# Zip the saved dataset directory; the archive is written to the current
# working directory as "<archive name>.zip". Binding the result to `_` keeps
# the returned archive path out of the cell output.
# NOTE(review): the archive stays on local disk -- confirm whether it is meant
# to be copied somewhere persistent.
_ = shutil.make_archive(
    base_name=hugging_face_dataset_archive_name,
    format="zip",
    base_dir=hugging_face_dataset_path,
)

### Cleanup

In [12]:
# Drop the in-memory dataset objects; the saved copy on disk is reloaded later.
del (
    hugging_face_dataset,
    dataset_splits,
    raw_dataset,
)

In [13]:
# Run a full garbage-collection pass; assigning to `_` hides the returned count.
_ = gc.collect()

# NOTE(review): fixed 30-second pause between the two collection passes --
# presumably to let memory settle; confirm it is actually required.
time.sleep(30)

# Second pass after the pause.
_ = gc.collect()

# Tune Pretrained Model

In [14]:
# Hugging Face Hub identifier of the pretrained model that gets tuned.
base_model_identifier = "facebook/opt-350m"

# Trainer checkpoints: zip archive name (no extension) and local directory.
tuning_checkpoint_archive_name = "opt_tuning_checkpoints_archive"
tuning_checkpoint_directory_path = pathlib.Path("opt_tuning_checkpoints_directory")

# Final tuned adapter: zip archive name (no extension) and local directory.
tuned_adapter_archive_name = "tuned_opt_adapter_archive"
tuned_adapter_directory_path = pathlib.Path("tuned_opt_adapter_directory")

In [15]:
# Reload the dataset that was saved to disk earlier in the notebook.
hugging_face_dataset = datasets.load_from_disk(hugging_face_dataset_path)

# Only the train and validation splits are used for tuning.
training_subset, validation_subset = (
    hugging_face_dataset[split_type] for split_type in ("train", "validation")
)

In [16]:
# 4-bit loading configuration shared by every model load in this notebook:
# NF4 quantisation type, double quantisation enabled, float16 compute dtype.
quantisation_configuration = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

In [17]:
# Load the base causal language model with the 4-bit quantisation settings,
# letting the library choose device placement.
model = transformers.AutoModelForCausalLM.from_pretrained(
    base_model_identifier,
    device_map="auto",
    quantization_config=quantisation_configuration,
)

# NOTE(review): `pretraining_tp` looks like a setting from another model
# family -- confirm this model's configuration actually reads it.
model.config.pretraining_tp = 1
# Turn off the generation cache on the model configuration for training.
model.config.use_cache = False

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  return self.fget.__get__(instance, owner)()


In [18]:
# Tokeniser that matches the base model.
tokeniser = transformers.AutoTokenizer.from_pretrained(base_model_identifier)

# Pad on the right, and reuse the end-of-sequence token as the padding token.
tokeniser.padding_side = "right"
tokeniser.pad_token = tokeniser.eos_token

In [19]:
# LoRA adapter settings for a causal language modelling task.
peft_configuration = peft.LoraConfig(
    task_type=peft.TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="lora_only",
)

# Wrap the quantised base model with the adapter configuration.
peft_model = peft.get_peft_model(model=model, peft_config=peft_configuration)

In [20]:
# Trainer settings, grouped by concern. Values are unchanged; only the keyword
# order differs.
training_configuration = transformers.TrainingArguments(
    # Checkpoint output.
    output_dir=str(tuning_checkpoint_directory_path),
    overwrite_output_dir=True,
    save_strategy="epoch",
    save_safetensors=True,
    # Evaluation cadence and best-checkpoint selection.
    evaluation_strategy="epoch",
    load_best_model_at_end=True,
    # Run length (max_steps=-1 leaves the epoch count in charge).
    num_train_epochs=20,
    max_steps=-1,
    # Optimiser and learning-rate schedule.
    # NOTE(review): warmup_ratio may have no effect together with the
    # reduce-on-plateau scheduler -- confirm against the library documentation.
    optim="paged_adamw_32bit",
    learning_rate=1e-3,
    weight_decay=0.001,
    max_grad_norm=0.3,
    lr_scheduler_type=transformers.SchedulerType.REDUCE_ON_PLATEAU,
    warmup_ratio=0.03,
    # Batching.
    gradient_accumulation_steps=1,
    auto_find_batch_size=True,
    group_by_length=True,
    # Device and numeric precision.
    use_cpu=False,
    fp16=True,
    bf16=False,
    # Reproducibility.
    seed=0,
    data_seed=0,
    # Logging integrations.
    report_to=["none"],
)

In [21]:
# Supervised fine-tuning trainer over the adapter-wrapped model. Text is read
# from the "instruction_without_context" field of each example, without
# packing, using a maximum sequence length of 512.
supervised_trainer = trl.SFTTrainer(
    model=peft_model,
    tokenizer=tokeniser,
    args=training_configuration,
    train_dataset=training_subset,
    eval_dataset=validation_subset,
    dataset_text_field="instruction_without_context",
    max_seq_length=512,
    packing=False,
)

Map:   0%|          | 0/1703 [00:00<?, ? examples/s]

Map:   0%|          | 0/577 [00:00<?, ? examples/s]

In [22]:
# Run the tuning loop; as the last expression in the cell, the returned
# training summary is displayed as the cell output.
supervised_trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.885279
2,No log,0.658896
3,0.980700,0.507731
4,0.980700,0.447947
5,0.504000,0.406625
6,0.504000,0.37667
7,0.504000,0.368012
8,0.395100,0.335787
9,0.395100,0.315037
10,0.335000,0.299525


TrainOutput(global_step=4260, training_loss=0.41160730890265096, metrics={'train_runtime': 1228.8467, 'train_samples_per_second': 27.717, 'train_steps_per_second': 3.467, 'total_flos': 4570919643660288.0, 'train_loss': 0.41160730890265096, 'epoch': 20.0})

### Backup Model

In [23]:
# Write the trainer's model to the adapter directory in safetensors format;
# the "Use Tuned Model" section loads the adapter back from this directory.
supervised_trainer.model.save_pretrained(tuned_adapter_directory_path, safe_serialization=True)

In [24]:
# Zip the tuned adapter directory first, then the checkpoint directory. Each
# archive is written to the current working directory as "<name>.zip".
for archive_name, source_directory_path in (
    (tuned_adapter_archive_name, tuned_adapter_directory_path),
    (tuning_checkpoint_archive_name, tuning_checkpoint_directory_path),
):
    # Binding to `_` keeps the returned archive path out of the cell output.
    _ = shutil.make_archive(archive_name, "zip", base_dir=source_directory_path)

### Cleanup

In [25]:
# Drop every object created for tuning so its memory can be reclaimed below.
del (
    supervised_trainer,
    training_configuration,
    peft_model,
    tokeniser,
    model,
    validation_subset,
    training_subset,
    hugging_face_dataset,
)

In [26]:
# Collect unreachable Python objects, then ask PyTorch to empty its CUDA cache.
# Assigning to `_` hides the count returned by gc.collect().
_ = gc.collect()
torch.cuda.empty_cache()

# NOTE(review): fixed 30-second pause between the two passes -- presumably to
# let memory settle; confirm it is actually required.
time.sleep(30)

# Repeat both steps after the pause.
_ = gc.collect()
torch.cuda.empty_cache()

# Use Untuned Model

In [27]:
# Single prompt sent to both the untuned and the tuned pipeline for comparison.
prompt = "Can you name the modules in the root package?"

In [28]:
# Load the base model again, with the same 4-bit quantisation settings, to
# generate a baseline answer without the adapter.
untuned_model = transformers.AutoModelForCausalLM.from_pretrained(
    base_model_identifier,
    device_map="auto",
    quantization_config=quantisation_configuration,
)

In [29]:
# Tokeniser that matches the base model, configured as it was for tuning.
tokeniser = transformers.AutoTokenizer.from_pretrained(base_model_identifier)

# Pad on the right, and reuse the end-of-sequence token as the padding token.
tokeniser.padding_side = "right"
tokeniser.pad_token = tokeniser.eos_token

In [30]:
# Text-generation pipeline around the untuned model.
# NOTE(review): the model is passed already loaded, so the loading options
# (device_map, torch_dtype, model_kwargs) may be redundant -- confirm.
# NOTE(review): do_sample=True with top_k=1 presumably always picks the single
# most likely token -- confirm that near-greedy decoding is intended.
untuned_pipeline = transformers.pipeline(
    task="text-generation",
    model=untuned_model,
    tokenizer=tokeniser,
    # Loading options.
    model_kwargs={"low_cpu_mem_usage": True},
    torch_dtype=torch.float16,
    device_map="auto",
    # Generation options.
    do_sample=True,
    top_k=1,
    max_new_tokens=256,
)

In [31]:
# Generate with the untuned model and print the text of the first result.
untuned_response = untuned_pipeline(prompt)
untuned_generated_text = untuned_response[0]["generated_text"]

print(untuned_generated_text)

Can you name the modules in the root package?
I can't, but I can tell you that the modules are in the root package.
I'm not sure what you mean by root package.
The package is the package that contains the modules.
Oh, I see. I'm not sure what you mean by the package.
The package is the package that contains the modules.
Oh, I see. I'm not sure what you mean by the package.
The package is the package that contains the modules.
Oh, I see. I'm not sure what you mean by the package.
The package is the package that contains the modules.
Oh, I see. I'm not sure what you mean by the package.
The package is the package that contains the modules.
Oh, I see. I'm not sure what you mean by the package.
The package is the package that contains the modules.
Oh, I see. I'm not sure what you mean by the package.
The package is the package that contains the modules.
Oh, I see. I'm not sure what you mean by the package.
The package is the package that contains the modules.
Oh, I see. I'm not sure what y

### Cleanup

In [32]:
# Drop the baseline pipeline, tokeniser and model before loading the tuned setup.
del (
    untuned_pipeline,
    tokeniser,
    untuned_model,
)

In [33]:
# Collect unreachable Python objects, then ask PyTorch to empty its CUDA cache.
# Assigning to `_` hides the count returned by gc.collect().
_ = gc.collect()
torch.cuda.empty_cache()

# NOTE(review): fixed 30-second pause between the two passes -- presumably to
# let memory settle; confirm it is actually required.
time.sleep(30)

# Repeat both steps after the pause.
_ = gc.collect()
torch.cuda.empty_cache()

# Use Tuned Model

In [34]:
# Fresh copy of the quantised base model; the tuned adapter is attached to it
# in a following cell.
untuned_model = transformers.AutoModelForCausalLM.from_pretrained(
    base_model_identifier,
    device_map="auto",
    quantization_config=quantisation_configuration,
)

In [35]:
# Tokeniser that matches the base model, configured as it was for tuning.
tokeniser = transformers.AutoTokenizer.from_pretrained(base_model_identifier)

# Pad on the right, and reuse the end-of-sequence token as the padding token.
tokeniser.padding_side = "right"
tokeniser.pad_token = tokeniser.eos_token

In [36]:
# Attach the adapter saved in the tuned adapter directory to the freshly loaded
# base model.
tuned_model = peft.PeftModel.from_pretrained(untuned_model, tuned_adapter_directory_path)

In [37]:
# Text-generation pipeline around the adapter-wrapped model, using the same
# options as the untuned pipeline so the two answers are comparable.
# NOTE(review): the model is passed already loaded, so the loading options
# (device_map, torch_dtype, model_kwargs) may be redundant -- confirm.
tuned_pipeline = transformers.pipeline(
    task="text-generation",
    model=tuned_model,
    tokenizer=tokeniser,
    # Loading options.
    model_kwargs={"low_cpu_mem_usage": True},
    torch_dtype=torch.float16,
    device_map="auto",
    # Generation options.
    do_sample=True,
    top_k=1,
    max_new_tokens=256,
)

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PersimmonF

In [38]:
# Generate with the tuned model and print the text of the first result.
tuned_response = tuned_pipeline(prompt)
tuned_generated_text = tuned_response[0]["generated_text"]

print(tuned_generated_text)

Can you name the modules in the root package?

Yes, the root package is 'package_name_to_import_with'.

What is the parent package?

The root package is 'package_name_to_import_with'.

What is the parent package?

The root package is 'package_name_to_import_with'.

What is the parent package?

The root package is 'package_name_to_import_with'.

What is the parent package?

The root package is 'package_name_to_import_with'.

What is the parent package?

The parent package is 'package_name_to_import_with'.

What is the parent package?

The parent package is 'package_name_to_import_with'.

What is the parent package?

The root package is 'package'.

What is the parent package?

The root package is 'package'.

What is the parent package?

The root package.

What is the parent package?

The parent package.

What is the parent package?

The root package.

What is the parent package?

What


### Cleanup

In [39]:
# Drop the tuned pipeline, adapter-wrapped model, tokeniser and base model.
del (
    tuned_pipeline,
    tuned_model,
    tokeniser,
    untuned_model,
)

In [40]:
# Collect unreachable Python objects, then ask PyTorch to empty its CUDA cache.
# Assigning to `_` hides the count returned by gc.collect().
_ = gc.collect()
torch.cuda.empty_cache()

# NOTE(review): fixed 30-second pause between the two passes -- presumably to
# let memory settle; confirm it is actually required.
time.sleep(30)

# Repeat both steps after the pause.
_ = gc.collect()
torch.cuda.empty_cache()