In [None]:
!pip install -q -U transformers datasets bitsandbytes trl peft evaluate accelerate

In [2]:
from datasets import load_dataset , DatasetDict
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from trl import setup_chat_format, SFTTrainer , SFTConfig
from peft import LoraConfig
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
import torch
import gc

In [3]:
# LoRA adapter settings; handed to SFTTrainer through its `peft_config` argument.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules="all-linear",  # attach adapters to every linear layer
    r=8,                          # adapter rank
    lora_alpha=128,
    lora_dropout=0.05,
    bias="none",                  # bias terms are not trained
)

In [4]:
# 4-bit NF4 quantization with double quantization; matmuls are computed in bfloat16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [None]:
# Base checkpoint that will be fine-tuned.
model_id = "ytu-ce-cosmos/Turkish-Llama-8b-DPO-v0.1"

# Tokenizer for the same checkpoint; pad on the right.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.padding_side = 'right'

# Load the base model quantized (see `quantization_config`) and let
# `device_map="auto"` decide where the weights are placed.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

In [None]:
# Fetch the Turkish finance conversation dataset from the Hugging Face Hub.
data = load_dataset(path="yusufbaykaoglu/turkish-finance-dataset")

In [7]:
# Train on at most the first MAX_TRAIN_ROWS examples. The bound is clamped to
# the split size so `select` does not raise if the dataset has fewer rows.
MAX_TRAIN_ROWS = 9000
subset_size = min(MAX_TRAIN_ROWS, len(data['train']))

train_data = data['train'].select(range(subset_size))
data_30 = DatasetDict({'train': train_data})
# Rebuild a plain `Dataset` from the selected rows via a pandas DataFrame.
# NOTE(review): this round-trip looks redundant (`train_data` is already a
# Dataset) — kept as-is to preserve the resulting schema; confirm before removing.
dataset = Dataset.from_pandas(pd.DataFrame(data_30['train']))

In [None]:
# System prompt prepended to every conversation.
system_message = "Sen bir finans uzmanısın. Görevin, uluslararası satışlarda elde edilen gelirin yüzdesine göre finansla ilgili sorulara doğru ve anlayışlı cevaplar vermek."


def create_conversation(row):
    """Turn one dataset row into a chat-style ``{"messages": [...]}`` record.

    The conversation always starts with the module-level ``system_message``.
    ``row['translated_messages']`` is then interpreted as follows:

    * a string is added as a single user turn;
    * a list is walked in order: dict items that carry both ``role`` and
      ``content`` keys are appended unchanged, string items are wrapped as
      user turns, and anything else is skipped;
    * any other type contributes nothing.

    Args:
        row: Mapping with a ``translated_messages`` entry.

    Returns:
        dict with a single ``"messages"`` key holding the list of turns.
    """
    payload = row['translated_messages']
    conversation = [{"role": "system", "content": system_message}]

    if isinstance(payload, str):
        conversation.append({"role": "user", "content": payload})
    elif isinstance(payload, list):
        for turn in payload:
            if isinstance(turn, str):
                conversation.append({"role": "user", "content": turn})
            elif isinstance(turn, dict) and 'role' in turn and 'content' in turn:
                conversation.append(turn)

    return {"messages": conversation}

# Hold out 5% of the rows. A fixed seed keeps the train/test partition
# identical across kernel restarts so runs are comparable.
dataset = dataset.train_test_split(test_size=0.05, seed=42)

# Replace the original columns with a single chat-formatted "messages" column.
dataset = dataset.map(
    create_conversation,
    batched=False,
    remove_columns=dataset["train"].column_names,
)

# Show one formatted training example as a sanity check.
print("Dataset: ", dataset["train"][0]["messages"])

In [9]:
# Turn TF32 off for both cuDNN and CUDA matmul.
torch.backends.cudnn.allow_tf32 = False
torch.backends.cuda.matmul.allow_tf32 = False

In [10]:
# Fall back to the EOS token for padding when the tokenizer defines no pad
# token. (The stray bare `tokenizer` expression was removed: it was not the
# cell's last expression, so it displayed nothing.)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Release cached Python / CUDA memory before building the trainer.
gc.collect()
torch.cuda.empty_cache()

# The base model and the 4-bit compute dtype are both bfloat16, so train in
# bf16 mixed precision when the GPU supports it natively (compute capability
# >= 8); otherwise keep the original fp16 mixed precision.
use_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8

args = TrainingArguments(
    output_dir="cosmos-chat-finance",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,   # effective batch of 32 sequences per device
    gradient_checkpointing=True,
    optim="adamw_torch_fused",
    logging_steps=25,
    save_strategy="epoch",
    learning_rate=2e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    # The plain "constant" schedule has no warmup phase, so `warmup_ratio`
    # above was silently ignored; this variant actually applies it.
    lr_scheduler_type="constant_with_warmup",
)

# NOTE(review): 1028 looks like it may have been meant as 1024 — left unchanged; confirm.
max_seq_length=1028

# NOTE(review): passing `max_seq_length`, `tokenizer`, `packing` and
# `dataset_kwargs` directly to SFTTrainer depends on the installed TRL version
# (newer releases expect them on SFTConfig) — confirm against the pinned version.
trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    peft_config=peft_config,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    packing=False,
    dataset_kwargs={
        "add_special_tokens": True,
        "append_concat_token": False,
    },
)

# Fine-tune, then save the trained model to `output_dir`.
trainer.train()
trainer.save_model()

### Usage

In [None]:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM , AutoTokenizer

import torch  # imported here so the usage section also runs in a fresh kernel

# Repository ids, defined once instead of being repeated as string literals.
model_id = "ytu-ce-cosmos/Turkish-Llama-8b-DPO-v0.1"
adapter_id = "yusufbaykaoglu/turkish-finance-chat"

config = PeftConfig.from_pretrained(adapter_id)

# Load the base weights in bfloat16 (the same dtype used when the base model
# was loaded for training) rather than the default float32, which needs twice
# the memory for this 8B model before it is moved to the GPU.
base_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)

# Attach the fine-tuned LoRA adapter on top of the base model.
model = PeftModel.from_pretrained(base_model, adapter_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
import torch

input_text = "Faiz politikası hangi durumlarda değişir?"

# The adapter was fine-tuned on chat-formatted conversations that always start
# with this system prompt, so the inference prompt is built the same way
# instead of feeding the raw question text. Keep this string in sync with the
# one used to build the training data.
system_message = "Sen bir finans uzmanısın. Görevin, uluslararası satışlarda elde edilen gelirin yüzdesine göre finansla ilgili sorulara doğru ve anlayışlı cevaplar vermek."
chat = [
    {"role": "system", "content": system_message},
    {"role": "user", "content": input_text},
]

model.to("cuda")
model.eval()

# Render the conversation with the tokenizer's chat template and append the
# assistant header so the model answers rather than continuing the user turn.
inputs = tokenizer.apply_chat_template(
    chat,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
)
inputs = {key: value.to("cuda") for key, value in inputs.items()}

with torch.no_grad():
    # `max_new_tokens` bounds only the answer; `max_length` also counted the
    # prompt tokens and could leave little or no room for generation.
    outputs = model.generate(**inputs, max_new_tokens=128)

# Decode only the newly generated tokens, without special tokens.
prompt_length = inputs["input_ids"].shape[1]
generated_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
print(generated_text)