In [1]:
%%capture
%pip install -U transformers 
%pip install -U datasets 
%pip install -U accelerate 
%pip install -U peft 
%pip install -U trl 
%pip install -U bitsandbytes 
%pip install -U wandb

In [2]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch, wandb
from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format

2024-07-14 18:53:34.099804: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-14 18:53:34.099921: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-14 18:53:34.263190: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()

hf_token = user_secrets.get_secret("HUGGINGFACE_TOKEN")

login(token = hf_token)

wb_token = user_secrets.get_secret("wandb")

wandb.login(key=wb_token)
run = wandb.init(
    project='Fine-tune Llama 3 8B on trial Dataset', 
    job_type="training", 
    anonymous="allow"
)
#Output redacted

In [4]:
base_model = "meta-llama/Meta-Llama-3-8B-Instruct"
dataset_name = "/kaggle/input/pjgpt-data"
new_model = "..." # Redacted

In [5]:
torch_dtype = torch.float16
attn_implementation = "eager"

In [6]:
# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)

# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

In [7]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model)
model, tokenizer = setup_chat_format(model, tokenizer)

tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
)
model = get_peft_model(model, peft_config)

In [31]:
dataset = load_dataset(dataset_name, split="all")[1:-1]

In [9]:
import json

In [10]:
# Load the list of dictionaries from the JSON file
with open('/kaggle/input/pjpktxt/pjtxt.json', 'r', encoding='utf-8') as json_file:
    pj = json.load(json_file)
    
with open('/kaggle/input/pjpktxt/pktxt.json', 'r', encoding='utf-8') as json_file:
    pk = json.load(json_file)

In [12]:
dataset_final=[]
for i in range(len(pj)):
    raww=[{"role": "user", "content": pj[i]},
           {"role": "assistant", "content": pk[i]}]
    dataset_final.append(tokenizer.apply_chat_template(raww, tokenize=False))
    if(i%1001==0):
        print(i)

0
1001
2002
3003
4004
5005
6006
7007
8008
9009
10010
11011
12012
13013
14014
15015
16016
17017
18018
19019
20020
21021
22022
23023
24024
25025
26026


In [14]:
len(dataset_final) # 100000 Messages combined so multiple responses considered one as long as other doesn't reply

26410

In [70]:
from sklearn.model_selection import train_test_split

# Split the list into training and testing sets
train, test = train_test_split(dataset_final, test_size=0.1)

In [72]:
len(train)

23769

In [15]:
dff=[]
for i in range(len(dataset_final)):
    dff.append({'text':dataset_final[i]})

In [16]:
from datasets import Dataset

dataset_f = Dataset.from_list(dff)

In [17]:
dataset_f

Dataset({
    features: ['text'],
    num_rows: 26410
})

In [18]:
dataset_f = dataset_f.train_test_split(test_size=0.01)

In [20]:
training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=1,
    evaluation_strategy="steps",
    eval_steps=0.05,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="wandb",
    save_steps=1000,  # Save less frequently
    save_total_limit=3  # Keep only the 3 most recent checkpoints
)



In [21]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset_f["train"],
    eval_dataset=dataset_f["test"],
    peft_config=peft_config,
    max_seq_length=512,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/26145 [00:00<?, ? examples/s]

Map:   0%|          | 0/265 [00:00<?, ? examples/s]

In [22]:
trainer.train()

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


Step,Training Loss,Validation Loss
654,11.5543,11.037816
1308,3.6128,2.923364
1962,3.3452,2.849795
2616,4.0106,2.832273
3270,3.7098,2.790813
3924,2.8467,2.750839
4578,3.1236,2.715968
5232,2.872,2.696605
5886,2.8324,2.662241
6540,1.6021,2.655752




TrainOutput(global_step=13072, training_loss=2.7019014160993486, metrics={'train_runtime': 15729.1112, 'train_samples_per_second': 1.662, 'train_steps_per_second': 0.831, 'total_flos': 3.894722285278003e+16, 'train_loss': 2.7019014160993486, 'epoch': 0.9999617517689807})

In [None]:
wandb.finish()
model.config.use_cache = True

#Output redacted

In [25]:
trainer.model.save_pretrained(new_model)


