# Setup

In [None]:
# additional google colab setup
import sys


def colab_install():
    %pip install transformers
    %pip install datasets
    %pip install peft
    %pip install bitsandbytes
    %pip install torch
    return


# Detect Colab by checking whether the `google.colab` module is already loaded.
in_colab = "google.colab" in sys.modules
print("Running in Google Colab" if in_colab else "Not running in Google Colab")
if in_colab:
    # Only the Colab runtime gets its packages installed on the fly.
    colab_install()

In [None]:
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, TaskType, prepare_model_for_kbit_training, get_peft_model
import torch

# Seed the random number generators through transformers' helper so that
# repeated runs of the notebook start from the same random state.
SEED = 24
transformers.set_seed(SEED)

# Prepare dataset

In [None]:
# TODO: implement survey datasets
from datasets import load_dataset

# Load only the training split of the IMDB dataset.
imdb_repo = "stanfordnlp/imdb"
dataset = load_dataset(path=imdb_repo, split="train")
dataset  # last expression: shown as the cell output

# Loading the model

In [None]:
model_id = "EleutherAI/pythia-70m"

# Load the tokenizer for the checkpoint, padding on the left.
# `trust_remote_code` is deliberately not enabled: it allows code shipped in
# the model repository to run locally, and this checkpoint is presumably
# served by transformers' built-in classes (verify if `model_id` changes).
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    padding_side="left",
)

# Fall back to the end-of-sequence token for padding when the tokenizer does
# not define a pad token. The `None` default makes a missing attribute behave
# the same as an unset one instead of raising AttributeError.
if getattr(tokenizer, "pad_token_id", None) is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id


# Explicit cap on tokens per example. Without a `max_length`, truncation and
# "max_length" padding both depend on the tokenizer's `model_max_length`,
# which may be unset (no truncation at all) or large (every example padded to
# the full window). 2048 is assumed to be the model's context window —
# TODO confirm against the model config if `model_id` changes.
MAX_SEQ_LENGTH = min(tokenizer.model_max_length, 2048)

# Tokenize in batches and drop the raw columns in the same pass. No padding is
# applied here: the language-modeling data collator passed to the Trainer pads
# each batch to its own longest sequence, which avoids storing and training on
# rows of padding tokens.
dataset = dataset.map(
    lambda batch: tokenizer(
        batch["text"], truncation=True, max_length=MAX_SEQ_LENGTH
    ),
    batched=True,
    remove_columns=["text", "label"],
)
dataset

In [None]:
# Quantization settings for loading the base model: 4-bit weights with the
# "nf4" quantization type, double quantization enabled, and bfloat16 as the
# compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the causal-LM checkpoint with the quantization config applied and let
# `device_map="auto"` decide device placement.
# `trust_remote_code` is deliberately not enabled: it allows code shipped in
# the model repository to run locally, and this checkpoint is presumably
# served by transformers' built-in classes (verify if `model_id` changes).
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
)

# Prepare the quantized model for training with PEFT's helper.
model = prepare_model_for_kbit_training(model)

# Keep the model's pad token in sync with the tokenizer's. The `None` default
# treats a config without the attribute the same as one where it is unset,
# instead of raising AttributeError.
if getattr(model.config, "pad_token_id", None) is None:
    model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
# LoRA hyperparameters; rank and alpha share the same value.
lora_rank = lora_alpha = 8

# Adapter configuration for causal language modeling, targeting every linear
# layer ("all-linear"), with 5% dropout and no bias training.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules="all-linear",
    r=lora_rank,
    lora_alpha=lora_alpha,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the model with the LoRA adapters, switch off the generation cache for
# training, and report how many parameters are trainable.
model = get_peft_model(model, lora_config)
model.config.use_cache = False
model.print_trainable_parameters()

In [None]:
# Training configuration: checkpoints go to ./results, gradient checkpointing
# is enabled in non-reentrant mode, the optimizer is paged 8-bit AdamW, and
# external experiment reporting is disabled.
training_args = transformers.TrainingArguments(
    output_dir="./results",
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    # NOTE(review): fp16 mixed precision is requested here while the
    # quantization config uses a bfloat16 compute dtype — confirm this
    # combination is intended for the target GPU.
    fp16=True,
    optim="paged_adamw_8bit",
    report_to="none",
)

# Language-modeling collator with masked-LM disabled (mlm=False).
causal_lm_collator = transformers.DataCollatorForLanguageModeling(
    tokenizer, mlm=False
)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=causal_lm_collator,
    tokenizer=tokenizer,
)

# Run fine-tuning; the returned training summary is the cell output.
trainer.train()