In [6]:
# One-time setup: uncomment the next line to install Unsloth into this kernel's environment.
# %pip install unsloth

In [1]:
import json
from datasets import Dataset

# Load the raw examples. JSON is UTF-8 by specification, so the encoding is
# stated explicitly instead of relying on the platform default (which differs
# on Windows). The file is only held open for the duration of the read.
with open("people_data.json", 'r', encoding="utf-8") as f:
    data = json.load(f)

# Render every record into a single training string: the user's prompt,
# followed by the expected response serialized as JSON, terminated by the
# end-of-text marker. Each record must provide 'prompt' and 'response' keys.
# NOTE(review): this hand-written template has no `<|end|>` turn markers; the
# inference cell builds its prompt with `tokenizer.apply_chat_template`, which
# may format turns differently -- confirm the two formats agree.
tuning_examples = [
    f"<|user|>\n{example['prompt']}\n<|assistant|>\n{json.dumps(example['response'])}<|endoftext|>"
    for example in data
]

# Single-column dataset; the trainer reads the 'text' column.
dataset = Dataset.from_dict({'text': tuning_examples})

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from unsloth import FastLanguageModel

# Load the pre-quantized 4-bit Phi-3 mini instruct checkpoint together with
# its tokenizer. `dtype=None` leaves the choice of compute dtype to Unsloth.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Phi-3-mini-4k-instruct-bnb-4bit",
    max_seq_length=2048,  # keep in sync with the trainer's max_seq_length
    dtype=None,
    load_in_4bit=True,
)

In [None]:
# Wrap the base model with LoRA adapters so that only small low-rank matrices
# are trained instead of the full weights.
# NOTE: this rebinds `model`; re-run the model-loading cell first if this cell
# ever needs to be executed again.

# LoRA rank: a smaller rank means fewer trainable parameters, less memory and
# faster training, at the cost of adapter capacity.
lora_rank = 64

# Layers that receive adapters:
#   q_proj / k_proj / v_proj  - attention query, key and value projections
#   o_proj                    - attention output projection
#   gate_proj / up_proj       - gating and up-projection of the MLP block
#   down_proj                 - down-projection of the MLP block
lora_target_modules = [
    'q_proj', 'k_proj', 'v_proj', 'o_proj',
    'gate_proj', 'up_proj', 'down_proj',
]

model = FastLanguageModel.get_peft_model(
    model,
    r=lora_rank,
    target_modules=lora_target_modules,
    lora_alpha=lora_rank * 2,  # LoRA scaling factor, chosen here as twice the rank
    lora_dropout=0,  # no dropout on the adapter layers
    bias='none',  # bias terms are not trained
    use_gradient_checkpointing='unsloth',  # selects Unsloth's gradient-checkpointing mode
)

In [None]:
from trl import SFTTrainer, SFTConfig

# Build the supervised fine-tuning trainer over the rendered 'text' column.
trainer = SFTTrainer(
    model = model,
    train_dataset = dataset,
    tokenizer = tokenizer,
    dataset_text_field = 'text', # column of `dataset` holding the training strings (keyword was misspelled `dataset_text_filed`)
    max_seq_length = 2048, # same limit the model was loaded with
    args = SFTConfig(
        per_device_train_batch_size = 2, # sequences per device per forward/backward pass
        gradient_accumulation_steps = 4, # accumulate 4 passes per optimizer step -> effective batch of 2 * 4 = 8 per device
        warmup_steps = 10, # learning-rate warmup over the first 10 optimizer steps
        max_steps = 60, # hard cap on optimizer steps; a positive value takes precedence over num_train_epochs
        num_train_epochs = 3, # only takes effect if max_steps is removed (or set to -1)
        logging_steps = 1, # log training metrics every step
        output_dir = 'outputs', # where checkpoints and trainer state are written
        optim = 'adamw_8bit' # 8-bit AdamW optimizer to reduce optimizer-state memory
    )
)

In [None]:
# Run the fine-tuning loop with the trainer built in the previous cell.
# Left as the cell's last expression so the returned training summary is displayed.
trainer.train()

In [None]:
# Put the fine-tuned model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

# Single-turn conversation used as a smoke test for the fine-tuned model.
messages = [
    {'role': 'user', 'content': "Mike is a 30 year old programmer. He loves hiking."},
]

# Render the conversation with the tokenizer's chat template, append the
# assistant generation prompt, and move the token ids to the GPU.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors='pt',
).to('cuda')

# Sampled decoding (temperature + nucleus sampling), so output varies per run.
outputs = model.generate(
    input_ids=inputs,
    max_new_tokens=512,
    use_cache=True,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
)

# `outputs` holds one sequence; it is decoded without stripping special tokens.
response = tokenizer.batch_decode(outputs)[0]

print(response)

In [None]:
# Export the fine-tuned model and tokenizer via Unsloth's GGUF saver, using the
# 'q4_k_m' quantization method.
# NOTE(review): `maximum_memory_usage=0.3` presumably caps the fraction of
# memory used during export -- confirm against the Unsloth docs.
# NOTE(review): the output name 'fineturned_model' looks like a typo for
# 'finetuned_model'; left unchanged because downstream tooling may already
# reference this path.
model.save_pretrained_gguf('fineturned_model', tokenizer, quantization_method='q4_k_m', maximum_memory_usage=0.3)