In [1]:
import pandas as pd
import json

In [2]:
# Load the tab-separated Galaxy Q&A conversations into a DataFrame.
# Input files used with this notebook:
#   all-conv-galaxy-q-a.csv, conversations-galaxy-q-a.csv
conv_dataframe = pd.read_csv(
    "../data/all-conv-galaxy-q-a.csv",
    sep="\t",
)
# Leave the frame as the last expression so the notebook renders a preview.
conv_dataframe

Unnamed: 0,conversations,tensor_size
0,\n[INST]\nI have a very basic notebook running...,349
1,\n[INST]\nAny ideas yet what is going on with ...,405
2,"\n[INST]\nyes I much prefer disucssing here, t...",475
3,"\n[INST]\nthink so. Where would I set that, an...",524
4,"\n[INST]\nThanks a lot for getting back to me,...",473
...,...,...
1984,"\n[INST]\nhello, . i am working with candida g...",175
1985,\n[INST]\nI have been trying to create a accou...,85
1986,\n[INST]\nI am trying to follow this transcrip...,172
1987,\n[INST]\nTwo questions Does anyone know wheth...,242


In [3]:
# Number of conversation rows that were loaded.
conv_dataframe.shape[0]

1989

In [4]:
from datasets import load_dataset
from datasets import Dataset

# Rows before `tr_index` are used for fine-tuning (and are further split into
# train/validation below); rows from `tr_index` onwards are held out.
tr_index = 1800
final_index = len(conv_dataframe)
tr_conv = conv_dataframe[:tr_index]
# Slices are end-exclusive, so `tr_conv` stops at row tr_index - 1. The hold-out
# slice therefore has to start at `tr_index` itself; starting at tr_index + 1
# would silently drop row `tr_index` from both sets.
eval_conv = conv_dataframe[tr_index:final_index]
assert len(tr_conv) + len(eval_conv) == final_index, "train/eval split lost or duplicated rows"
# 80/20 train/validation split of the fine-tuning portion, with a fixed seed.
dataset = Dataset.from_pandas(tr_conv).train_test_split(test_size=0.2, seed=42)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Persist the held-out conversations as a tab-separated file.
# `index` is a boolean flag: pass False explicitly instead of relying on None
# happening to be falsy.
eval_conv.to_csv("../data/eval_dataset.csv", sep="\t", index=False)

In [6]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)

# Base checkpoint to fine-tune, plus a name for the fine-tuned model.
# NOTE(review): `new_model` says "3b" while the base checkpoint is a 7b model,
# and the only use of it visible in this notebook is commented out — confirm the intended name.
model_name = "NousResearch/Llama-2-7b-chat-hf"
new_model = "llama-2-3b-galaxy-help"

# dtype handed to bitsandbytes as the 4-bit compute dtype.
compute_dtype = torch.float16

# 4-bit NF4 quantization settings with double quantization switched on.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=compute_dtype,
)

# Load the base model with the quantization config above; the KV cache is
# disabled and device placement is left to `device_map="auto"`.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    use_cache=False,
    quantization_config=bnb_config,
)
model.config.pretraining_tp = 1

# Tokenizer of the same checkpoint: the EOS token doubles as the padding
# token and padding is applied on the right.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

[2023-11-30 15:22:39,386] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:20<00:00, 10.04s/it]


In [7]:
from peft import get_peft_config, prepare_model_for_kbit_training, get_peft_model, LoraConfig
from trl import SFTTrainer
import sys
import time

# Modules that receive LoRA adapters. Only the query/value projections are
# active; the other candidate sets are kept here for reference:
#   ['q_proj','k_proj','v_proj','o_proj','gate_proj','down_proj','up_proj','lm_head']
#   ['q_proj','v_proj', 'k_proj', 'o_proj']
target_modules = ["q_proj", "v_proj"]

# LoRA configuration for causal language modelling.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=target_modules,
    r=64,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
)

print("Extracting parameter efficient model ...")
s_time = time.time()
# Prepare the quantized base model for k-bit training, then wrap it with the
# LoRA adapters described by `peft_config`.
refined_model = get_peft_model(prepare_model_for_kbit_training(model), peft_config)
e_time = time.time()
refined_model.print_trainable_parameters()
print("PEFT loading time: {} seconds".format(e_time - s_time))

# Output directory for the training run; a later cell reloads the model and
# tokenizer from this same path.
base_dir = "llama-test-all-conv"

print("Setting up Training arguments ...")

training_arguments = TrainingArguments(
    output_dir=base_dir,
    report_to="tensorboard",
    # Run length and batch sizes
    num_train_epochs=1,  # remove "#"
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,
    # Optimizer and learning-rate schedule
    # NOTE(review): the scheduler is "constant" (not "constant_with_warmup");
    # confirm that `warmup_ratio` has the intended effect with it.
    optim="adamw_hf",  # "paged_adamw_32bit",
    learning_rate=1e-4,
    lr_scheduler_type="constant",
    warmup_ratio=0.03,
    max_grad_norm=0.3,
    fp16=True,
    # Evaluation, logging and checkpoint cadence (all counted in steps)
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=50,  # change to 200
    logging_steps=50,  # change to 100
    save_steps=50,  # change to 500
)

print("Setting up SFTTrainer ...")

s_time = time.time()

# Supervised fine-tuning trainer: trains on the "conversations" text column of
# the train split and evaluates on the test split, with sequences capped at
# 700 tokens.
trainer = SFTTrainer(
    model=refined_model,
    args=training_arguments,
    tokenizer=tokenizer,
    peft_config=peft_config,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    dataset_text_field="conversations",
    max_seq_length=700,
)

e_time = time.time()
print("SFTTTrainer setting up time: {} seconds".format(e_time - s_time))

print("Start training ...")
trainer.train()

#trainer.save_model("saved-model")
#trainer.model.save_pretrained(new_model)
# Write the model config next to the adapter weights so the model can be
# reconstructed from the checkpoint folder.
saved_model_dir = "saved-model"
# `to_json_file` writes straight to the given path and does not create missing
# parent directories, so on a fresh run the folder must exist first; otherwise
# the export fails only after the whole training run has finished.
os.makedirs(saved_model_dir, exist_ok=True)
refined_model.config.to_json_file(os.path.join(saved_model_dir, "config.json"))
refined_model.save_pretrained(saved_model_dir)

# Also save through the trainer, using its own output location.
trainer.save_model()



Extracting parameter efficient model ...
trainable params: 33,554,432 || all params: 6,771,970,048 || trainable%: 0.49548996469513035
PEFT loading time: 0.731330156326294 seconds
Setting up Training arguments ...
Setting up SFTTrainer ...


Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1440/1440 [00:01<00:00, 1385.36 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 360/360 [00:00<00:00, 1724.50 examples/s]
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


SFTTTrainer setting up time: 1.3763797283172607 seconds
Start training ...




Step,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Bare expression: shows the tokenizer's repr as the cell output for inspection.
tokenizer

In [None]:
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

# Reload the fine-tuned PEFT model from the training output directory
# (`base_dir`), in 4-bit with float16 as the torch dtype.
re_model = AutoPeftModelForCausalLM.from_pretrained(
    base_dir,
    load_in_4bit=True,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)
# The matching tokenizer is read from the same directory.
re_tokenizer = AutoTokenizer.from_pretrained(base_dir)

In [None]:
# Disabled evaluation loop: the whole cell body is one bare triple-quoted
# string, so none of the code inside it executes (the cell simply evaluates to
# that string).
# As written, the quoted code would iterate over `eval_conv`, split each
# conversation on the '<s>[INST]' / '[/INST]' markers into a prompt and a
# reference answer, sample a completion from `refined_model`, and write the
# collected results to ../data/generated_answers.csv.
# NOTE(review): the loop ends with an unconditional `break`, so only the first
# row would be processed, and it uses `tokenizer` / `refined_model` rather than
# the reloaded `re_tokenizer` / `re_model` — confirm before re-enabling.
'''predictions = []
original_instructions = []
ground_truth_answer = []

start_marker = '<s>[INST]'
end_marker = '[/INST]'
end_tag = "</s>"

for ri, row in eval_conv.iterrows():
    entire_conv = row["conversations"]
    start_index = entire_conv.find(start_marker)
    end_index = entire_conv.find(end_marker)
    instruction = entire_conv[start_index + len(start_marker):end_index].strip()
    prompt = entire_conv[start_index:end_index + len(end_marker)].strip()
    original_answer = entire_conv[end_index + len(end_marker): len(entire_conv) - len(end_tag) - 1].strip()
    original_instructions.append(instruction)
    ground_truth_answer.append(original_answer)
    print("encoding prompt ...")
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to('cuda')
    print("generating response ...")
    outputs = refined_model.generate(input_ids=input_ids, 
        max_new_tokens=156,
        do_sample=True,
    )
    pred = tokenizer.decode(outputs[0])
    predictions.append(pred)
    break

pred_dataframe = pd.DataFrame(zip(original_instructions, ground_truth_answer, predictions), columns=["instructions", "ground truth answers", "generated answers"])
pred_dataframe.to_csv("../data/generated_answers.csv", sep="\t", index=None)

print("Finished generation =======")'''