In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from datasets import load_dataset
from trl import SFTTrainer
from jinja2 import Template
import yaml
import os
# Expose only the GPU with index 1 to this process. The variable is only
# honoured if it is set before CUDA is first initialised, so keep this at the
# top of the notebook.
os.environ.update({"CUDA_VISIBLE_DEVICES": "1"})


In [None]:
# Run configuration. The trailing `# @param {...}` markers are Colab form
# annotations; keep each assignment on a single line so the form keeps rendering.

# Hub id of the base checkpoint (passed to `from_pretrained` below) and the
# name of the fine-tuned model (used as the trainer's output directory).
MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"  # @param {type:"string"}
NEW_MODEL_NAME = "opus-samantha-phi-3-mini-4k"  # @param {type:"string"}
# Hub dataset id and split to fine-tune on.
DATASET_NAME = "macadeliccc/opus_samantha"  # @param {type:"string"}
SPLIT = "train"  # @param {type:"string"}
# Maximum sequence length handed to the SFT trainer.
MAX_SEQ_LENGTH = 2048  # @param {type:"integer"}
num_train_epochs = 1  # @param {type:"integer"}
# NOTE(review): `license` shadows the interactive `license` builtin, and neither
# `license` nor `username` is used in the cells visible here -- presumably they
# feed a model card / upload step later in the notebook; confirm.
license = "apache-2.0"  # @param {type:"string"}
username = "macadeliccc"  # @param {type:"string"}
# Optimisation hyper-parameters.
learning_rate = 1.41e-5  # @param {type:"number"}
per_device_train_batch_size = 1  # @param {type:"integer"}
gradient_accumulation_steps = 1  # @param {type:"integer"}

# Load the base model and its tokenizer from the Hub.
# NOTE(review): trust_remote_code=True runs Python code shipped inside the
# MODEL_ID repository -- only point MODEL_ID at repositories you trust.
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# Load the dataset through the configured constants (previously the repo id and
# split were hard-coded here, so changing DATASET_NAME / SPLIT had no effect).
dataset = load_dataset(DATASET_NAME, split=SPLIT)

In [None]:
# End-of-sequence marker appended to every rendered conversation.
# This must be the token *string* (`eos_token`), not the integer
# `eos_token_id`: the value is interpolated into text, so the id would be
# written out as literal digits instead of the EOS token.
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    """Render every conversation of a batch into one training string.

    Args:
        examples: Batch mapping with a "conversations" entry. Each conversation
            is a sequence of turns, and each turn is a mapping with a "from"
            key (one of "system", "human", "gpt") and a "value" key holding
            the message text.

    Returns:
        A dict ``{"text": [...]}`` with one rendered string per conversation,
        in input order.

    Raises:
        KeyError: If a turn lacks "from"/"value" or uses an unknown role.
    """
    # Text written before / after each turn, keyed by the turn's role.
    # NOTE(review): the suffixes are all empty and the prefixes are plain
    # words -- possibly placeholders for chat-template tokens; confirm.
    role_prefix = {"system": "system\n", "human": "\nuser\n", "gpt": "\nassistant\n"}
    role_suffix = {"system": "", "human": "", "gpt": ""}

    rendered = []
    for conversation in examples["conversations"]:
        pieces = []
        for message in conversation:
            role = message["from"]
            # Layout per turn: prefix, one space, message text, newline, suffix.
            pieces.append(f"{role_prefix[role]} {message['value']}\n{role_suffix[role]}")
        conversation_text = "".join(pieces)
        # EOS_TOKEN is appended once per conversation rather than per turn;
        # per the original author's note, conversations are multi-turn and
        # always end with a gpt message.
        rendered.append(f"{conversation_text}{EOS_TOKEN}")
    return {"text": rendered}

# Add the rendered "text" column, processing the dataset in batches.
dataset = dataset.map(function=formatting_prompts_func, batched=True)
# Print one rendered conversation (row 8) as a quick visual sanity check.
print(dataset[8]["text"])

In [None]:
# Trainer hyper-parameters. Every tunable value comes from the configuration
# cell; `learning_rate` was previously hard-coded to 2e-5 here, which silently
# ignored the configured value.
args = TrainingArguments(
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=True,
    learning_rate=learning_rate,
    lr_scheduler_type="cosine",
    max_steps=-1,  # -1: run length is governed by num_train_epochs
    num_train_epochs=num_train_epochs,
    save_strategy="no",  # no intermediate checkpoints are written
    logging_steps=1,
    output_dir=NEW_MODEL_NAME,
    optim="paged_adamw_32bit",
    bf16=True,
)

In [None]:
# Supervised fine-tuning on the pre-rendered "text" column.
# `formatting_func` is deliberately not passed: the dataset has already been
# mapped through `formatting_prompts_func`, and that function returns a dict
# (`{"text": [...]}`) rather than the list of strings a `formatting_func` is
# expected to return, so supplying it alongside `dataset_text_field` was
# redundant and contradictory.
trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=MAX_SEQ_LENGTH,
)

trainer.train()