<a href="https://colab.research.google.com/github/weyesnocommit/cw/blob/main/Open_Character_AI_Best_Quality.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [41]:
!pip install -q -U transformers datasets accelerate peft trl bitsandbytes

In [42]:
import os
os.environ["HF_TOKEN"] = "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"

In [43]:
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from trl import SFTTrainer, SFTConfig

Change The Dataset To Select Your Specific Model:
- Star Wars Jedi: "TuringsSolutions/Jedi500"
- Al Bundy: "TuringsSolutions/AlBundy500PlusPFAF400"
- Creative Writer Bot 5000: "TuringsSolutions/CreativeWriterBot5000"
- Batman AI: "TuringsSolutions/BatmanAI"
- Comedian Bot 5000: "TuringsSolutions/ComedianBot5000"

In [44]:
 # Model
base_model = "./content/results/checkpoint-21"
#Fine-tune model name
new_model = "phi2/OpenCharacterAI"
#Load the Dataset from hugging face
dataset = load_dataset("TuringsSolutions/AlBundy500",split="train")
#Tokenizer

tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=True)
tokenizer.pad_token=tokenizer.eos_token
tokenizer.padding_side="right"

def preprocess_function(examples):
    # Adjust the data to match what the model expects
    return tokenizer(examples["Response"], truncation=True, padding="max_length", max_length=400)

# Apply preprocessing to your datasets
train_dataset = dataset.map(preprocess_function, batched=True)
eval_dataset = dataset.map(preprocess_function, batched=True)

README.md:   0%|          | 0.00/77.0 [00:00<?, ?B/s]

Al Bundy 500 - Sheet1.csv:   0%|          | 0.00/117k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/503 [00:00<?, ? examples/s]

OSError: Incorrect path_or_model_id: './content/results/checkpoint-21'. Please provide either the path to a local folder or the repo_id of a model on the Hub.

In [None]:
dataset

In [None]:
print(dataset.to_pandas().iloc[0][['Prompt', 'Response']])

In [None]:
pip install einops

**If you get an Accelerate error here, just restart the Runtime and re-run all the cells.**

In [None]:
#Configration of QLoRA
#Quantization Configuration
#To reduce the VRAM usage we will load the model in 4 bit precision and we will do quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    #Quant type
    #We will use the "nf4" format this was introduced in the QLoRA paper
    bnb_4bit_quant_type="nf4",
    #As the model weights are stored using 4 bits and when we want to compute its only going to use 16 bits so we have more accuracy
    bnb_4bit_compute_dtype=torch.float16,
    #Quantization parameters are quantized
    bnb_4bit_use_double_quant=False,
)

# LoRA configuration
peft_config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules= ["Wqkv", "fc1", "fc2" ] # ["Wqkv", "out_proj", "fc1", "fc2" ], - 41M params
    # modules_to_save=["embed_tokens","lm_head"]
)

# Load base moodel
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    # use_flash_attention_2=True, # Phi does not support yet.
    trust_remote_code=True,
    flash_attn=True,
    flash_rotary=True,
    fused_dense=True,
    low_cpu_mem_usage=True,
    device_map={"": 0},
    revision="refs/pr/23",
)


model.config.use_cache = False
model.config.pretraining_tp = 1

# Cast the layernorm in fp32, make output embedding layer require grads, add the upcasting of the lmhead to fp32
#prepare_model_for_kbit_training---> This function basically helps to built the best model possible
model = prepare_model_for_kbit_training(model,use_gradient_checkpointing=True)

In [None]:
# Set training arguments
!tar -czvf XDDDDDDDDd /content/results/checkpoint-21

training_arguments = TrainingArguments(
        output_dir="./results",
        num_train_epochs=3,#3,5 good for the Llama 2 Model
        per_device_train_batch_size=2,# Number of batches that we are going to take for every step
        gradient_accumulation_steps=32,
        evaluation_strategy="steps",#Not helpful because we donot want to evaluate the model we just want to train it
        eval_steps=2000,
        logging_steps=25,
        optim="paged_adamw_8bit",#Adam Optimizer we will be using but a version that is paged and in 8 bits, so it will lose less memory
        learning_rate=2e-4,
        lr_scheduler_type="cosine",
        warmup_steps=10,
        warmup_ratio=0.05,
        report_to="tensorboard",
        weight_decay=0.01,
        max_steps=-1, # if maximum steps=2, it will stop after two steps
)

# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,#No separate evaluation dataset, i am using the same dataset
    peft_config=peft_config,
    #dataset_text_field="Response",
    #max_seq_length=400,# In dataset creation we put a threshold 2k for context length (input token limit) but we dont have enough VRAM unfortunately it will take a lot of VRAM to put everything into memory so we are just gonna stop at 512
    tokenizer=tokenizer,
    args=training_arguments,
)

# Train model
trainer.train()

# Save trained model
trainer.model.save_pretrained(new_model)

In [None]:
# Run text generation pipeline with our model
#Input Prompt

#prompt = "What is a large language model?"
#Wrap the prompt using the right chat template
instruction = """Can you tell me about the time you scored 4 touchdowns in a single game?"""
#Using Pipeline from the hugging face
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=300)
result = pipe(instruction)
#Trim the response, remove instruction manually
print(result[0]['generated_text'][len(instruction):])