In [1]:
from google.colab import drive
# Attach Google Drive at the path the rest of the notebook writes under.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Install exact (==) versions of accelerate, peft, bitsandbytes, transformers and trl
# into the kernel's environment; -q keeps pip's output quiet.
%pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7


In [3]:
import os, torch, logging
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, pipeline
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
from huggingface_hub import login

# Authenticate with the Hugging Face Hub without embedding a credential in the notebook.
# The token is read from the HF_TOKEN environment variable; when the variable is unset,
# None is passed and login() falls back to its interactive prompt.
# SECURITY: an access token was previously hardcoded on this line. Treat it as leaked
# and revoke it in the Hugging Face account settings.
login(token=os.environ.get("HF_TOKEN"))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [4]:
# Directory that training checkpoints and the final model are written under
save_dir = "/content/drive/MyDrive/llama_2_code_generate/"
# Identifier used to load both the base model and its tokenizer
llama_model_path = "meta-llama/Llama-2-7b-chat-hf"
# Identifier passed to load_dataset for the training data
dataset_path = "greengerong/leetcode"

In [5]:
def preprocess(sample):
    """Add a ``text`` field holding one instruction-formatted training example.

    The value is ``"<s>[INST] " + content + " [/INST] " + c++ + " </s>"``, built
    from the sample's ``"content"`` and ``"c++"`` entries.

    Args:
        sample: mapping with string values under ``"content"`` and ``"c++"``.

    Returns:
        The same ``sample`` object, modified in place with the new ``"text"`` key.
    """
    pieces = ['<s>[INST] ', sample["content"], " [/INST] ", sample["c++"], " </s>"]
    # str.join rejects non-string pieces, just as string concatenation would.
    sample["text"] = "".join(pieces)
    return sample

# Load the "train" split of the configured dataset.
dataset = load_dataset(dataset_path, split="train")

# Source columns dropped once the combined "text" field has been built.
source_columns = ['id', 'slug', 'title', 'difficulty', 'content', 'java', 'c++', 'python', 'javascript']

# Shuffle with a fixed seed, add the "text" field, then drop the listed source columns.
full_dataset = (
    dataset
    .shuffle(seed=77)
    .map(preprocess)
    .remove_columns(source_columns)
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
# Tokenizer: loaded from the same identifier as the base model
tokenizer = AutoTokenizer.from_pretrained(
    llama_model_path,
    trust_remote_code=True,
)
tokenizer.padding_side = "right"  # Fix for fp16
# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token
# Quantization Config: 4-bit loading, "nf4" quant type, float16 compute dtype,
# double quantization switched off
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)
# Model: load the base checkpoint with the quantization settings above,
# using a device map that assigns the root module ("") to device 0
model = AutoModelForCausalLM.from_pretrained(
    llama_model_path,
    device_map={"": 0},
    quantization_config=quantization_config,
)
# Config overrides applied before training
model.config.pretraining_tp = 1
model.config.use_cache = False



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
# LoRA Config: rank 8, alpha 16, dropout 0.1, no bias training, causal-LM task
lora_parameters = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)
# Training Params
# NOTE(review): warmup_ratio presumably has no effect with the "constant" scheduler
# ("constant_with_warmup" is the variant that applies warmup) -- confirm the intent.
train_params = TrainingArguments(
    # Output location and checkpoint / logging cadence
    output_dir=f"{save_dir}result",
    save_steps=25,
    logging_steps=25,
    # Run length and batching
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    group_by_length=True,
    # Optimizer and learning-rate schedule
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    lr_scheduler_type="constant",
    warmup_ratio=0.03,
    weight_decay=0.001,
    max_grad_norm=0.35,
)
# Trainer: supervised fine-tuning on the "text" column with the LoRA config applied
fine_tuning = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=train_params,
    peft_config=lora_parameters,
    train_dataset=full_dataset,
    dataset_text_field="text",
)




In [8]:
# Training: run the fine-tuning loop and display the returned result
train_output = fine_tuning.train()
train_output



You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
25,1.0702
50,0.8559
75,0.8043
100,0.7821
125,0.7721
150,0.7697
175,0.7672
200,0.7446
225,0.7459
250,0.7476




Step,Training Loss
25,1.0702
50,0.8559
75,0.8043
100,0.7821
125,0.7721
150,0.7697
175,0.7672
200,0.7446
225,0.7459
250,0.7476




TrainOutput(global_step=590, training_loss=0.7593496274139921, metrics={'train_runtime': 6870.2872, 'train_samples_per_second': 0.344, 'train_steps_per_second': 0.086, 'total_flos': 3.924414643588301e+16, 'train_loss': 0.7593496274139921, 'epoch': 1.0})

In [9]:
# Save Model: write the trainer's model to the "model" path under save_dir
model_output_dir = f"{save_dir}model"
fine_tuning.model.save_pretrained(model_output_dir)
