In [1]:
import os

In [2]:
%%capture
!pip install unsloth
# Also get the latest nightly Unsloth!
!pip install --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [None]:
from unsloth import FastLanguageModel
import torch
# Ask PyTorch to release any cached, currently unused GPU memory before the
# model is loaded in the next cell.
torch.cuda.empty_cache()

In [None]:
# Model-loading configuration shared with the trainer cell further down.
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
# dtype is pinned to torch.float16 in the call below (Float16 suits Tesla T4 / V100;
# Bfloat16 is meant for Ampere+). Passing None instead would request auto detection.
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# This list is informational only: the checkpoint actually loaded is the
# `model_name` passed to `from_pretrained` below.
fourbit_models = [
    "unsloth/mistral-7b-v0.3-bnb-4bit",      # New Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",           # Llama-3 15 trillion tokens model 2x faster!
    "unsloth/llama-3-8b-Instruct-bnb-4bit",
    "unsloth/llama-3-70b-bnb-4bit",
    # The trailing comma matters: without it Python concatenates this literal
    # with the next one into a single, non-existent model id.
    "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
    "unsloth/Phi-3-mini-4k-instruct",        # Phi-3 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",             # Gemma 2.2x faster!
] # More models at https://huggingface.co/unsloth

# Load the base model and its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-Instruct-bnb-4bit", #"unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = torch.float16,
    load_in_4bit = load_in_4bit,
    # token = "", # HF Token
)

In [None]:
from transformers import TextStreamer

# Baseline check: ask the model one question before any fine-tuning so the
# answer can be compared with the post-training run.
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Single-turn conversation; edit the user message to try a different question.
messages = [
    {
        "role": "user",
        "content": "Askerliğe Elverişli Değildir raporum var. Bedelli askerlikten faydalanabilir miyim? ",
    },
]

# Render the conversation with the tokenizer's chat template, append the
# generation prompt, and move the resulting tensor to the GPU.
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt = True, return_tensors = "pt"
).to("cuda")

# Stream the generated text to the cell output; the prompt itself is not echoed.
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(
    input_ids,
    streamer = text_streamer,
    max_new_tokens = 256,
    pad_token_id = tokenizer.eos_token_id,
)

In [None]:
# Names of the projection modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the loaded model with LoRA adapters; `model` is rebound to the wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,                                  # LoRA rank: any number > 0 (suggested 8, 16, 32, 64, 128)
    target_modules = lora_target_modules,
    lora_alpha = 16,
    lora_dropout = 0,                        # Any value is supported, but 0 is the optimized path
    bias = "none",                           # Any value is supported, but "none" is the optimized path
    # "unsloth" uses 30% less VRAM and fits 2x larger batch sizes;
    # use True or "unsloth" for very long context.
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,                      # Rank-stabilized LoRA is available but disabled
    loftq_config = None,                     # LoftQ is available but not configured
)

In [None]:
# Load the training data from an Excel workbook on the Colab filesystem.

import pandas as pd

# Point this at your own workbook if it is stored somewhere else.
excel_path = '/content/kamu_veri.xlsx'
df = pd.read_excel(excel_path)

# Preview the first rows to confirm the sheet was parsed as expected.
print(df.head())

In [None]:
# Convert the pandas DataFrame `df` into a Hugging Face `datasets.Dataset`.

from datasets import Dataset

# `df` is the DataFrame produced by the Excel-loading cell.
mydataset = Dataset.from_pandas(df)

# Bare expression so the notebook shows the `mydataset` summary as the cell output.
mydataset

In [None]:
# Convert the dataset to ShareGPT-style conversations with Unsloth's helper.
from unsloth import to_sharegpt

# `merged_prompt` is the prompt template; `{soru}` refers to a dataset column
# (presumably the question column - confirm against the spreadsheet headers).
# `output_column_name` names the column holding the expected reply ("cevap").
mydataset = to_sharegpt(
    mydataset,
    merged_prompt="[[\nYour input is:\n{soru}]]",
    output_column_name="cevap",
    conversation_extension=3,  # Select more to handle longer conversations
)

In [None]:
# Inspect which columns the dataset has after the to_sharegpt conversion.
print(mydataset.column_names)

In [None]:
# Pass the ShareGPT-style dataset through Unsloth's standardization helper;
# `mydataset` is rebound to the result.
from unsloth import standardize_sharegpt

mydataset = standardize_sharegpt(mydataset)

In [None]:
# Bare expression: display the column names after standardization as the cell output.
mydataset.column_names

In [None]:
# Print the dataset summary to check it before the chat template is applied.
print(mydataset)

In [None]:
from unsloth import apply_chat_template

# Prompt layout used to render every training example. `{INPUT}` and `{OUTPUT}`
# are the placeholders filled in by `apply_chat_template`.
chat_template = """Below are some instructions that describe some tasks.
Write responses that appropriately complete each request.

### Instruction:
{INPUT}

### Response:
{OUTPUT}"""

# Render the dataset with the template above; `mydataset` is rebound to the result.
mydataset = apply_chat_template(
    mydataset,
    tokenizer = tokenizer,
    chat_template = chat_template,
    # default_system_message = "You are a helpful assistant", << [OPTIONAL]
)

In [None]:
# Inspect the columns once the chat template has been applied.
print(mydataset.column_names)

In [None]:
# Preview the first rows of the templated dataset as a pandas DataFrame.
print(mydataset.to_pandas().head())

In [None]:
# Hold out part of the data so validation loss can be tracked during training.
from datasets import DatasetDict

# 80% train / 20% validation; the fixed seed keeps the split reproducible.
dataset_split = mydataset.train_test_split(test_size=0.2, seed=3407)
train_dataset, val_dataset = dataset_split['train'], dataset_split['test']

# Report the size of each partition.
print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
import torch
import os

# Schedule values live here once so that the trainer arguments and the status
# messages printed before training can never disagree.
PER_DEVICE_TRAIN_BATCH_SIZE = 1
GRADIENT_ACCUMULATION_STEPS = 8   # Raised from 4 to 8 (effective batch size = 8)
MAX_STEPS = 250                   # Raised from 120 to 250 (longer training)
EVAL_STEPS = 10                   # Evaluate every 10 steps
SAVE_STEPS = 50

# Free cached GPU memory
torch.cuda.empty_cache()

# Environment variables intended to work around Triton issues.
# NOTE(review): CUDA_LAUNCH_BLOCKING and PYTORCH_CUDA_ALLOC_CONF are normally
# read when CUDA / the allocator is initialised; the model is already on the
# GPU at this point, so they may have no effect here - confirm, and move them
# to the first cell if they are needed.
os.environ["TRITON_DISABLE_LINE_INFO"] = "1"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

# Turn off PyTorch optimizations (because of the Triton error)
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False
torch._dynamo.config.suppress_errors = True

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = val_dataset,  # Validation split, used for eval_loss
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 1,  # Lowered from 2 to 1 for Colab
    packing = False,
    args = TrainingArguments(
        # BATCH SIZE SETTINGS (for the Colab GPU limit)
        per_device_train_batch_size = PER_DEVICE_TRAIN_BATCH_SIZE,
        per_device_eval_batch_size = 1,  # Added for validation
        gradient_accumulation_steps = GRADIENT_ACCUMULATION_STEPS,

        # STEP SETTINGS (aiming for a loss better than 1.5)
        warmup_steps = 20,        # Raised from 5 to 20
        max_steps = MAX_STEPS,
        # num_train_epochs = 1,   # Disabled because max_steps is used

        # LEARNING RATE (for more stable learning)
        learning_rate = 1e-4,     # Lowered from 2e-4 to 1e-4
        lr_scheduler_type = "cosine",  # Switched from linear to cosine

        # PRECISION (to save memory); fp16 matches the float16 dtype the model was loaded with
        fp16 = True,
        bf16 = False,

        # LOGGING AND MONITORING
        logging_steps = 10,       # Raised from 1 to 10 (less log spam)
        logging_dir = "./logs",

        # OPTIMIZER (to save memory)
        optim = "adamw_8bit",
        weight_decay = 0.01,

        # EVALUATION (to watch for overfitting)
        eval_strategy = "steps",  # Explicitly set evaluation strategy
        eval_steps = EVAL_STEPS,
        save_strategy = "steps",
        # With load_best_model_at_end, save_steps must be a round multiple of eval_steps.
        save_steps = SAVE_STEPS,
        save_total_limit = 3,     # Keep only the last 3 checkpoints
        load_best_model_at_end = True,  # Load best model based on validation loss
        metric_for_best_model = "eval_loss",  # Use validation loss as metric
        greater_is_better = False,  # Lower loss is better

        # MEMORY MANAGEMENT (for Colab)
        dataloader_num_workers = 0,     # Multiprocessing disabled
        dataloader_pin_memory = False,  # Saves memory
        torch_compile = False,          # Disabled because of the Triton error

        # GRADIENT CLIPPING (for stable training)
        max_grad_norm = 1.0,

        # OUTPUT
        output_dir = "outputs",
        report_to = "tensorboard",  # To view the loss curve
        seed = 3407,
    ),
)

# Start training. The status messages are derived from the constants above;
# previously the last one announced an evaluation interval of 25 steps while
# the trainer was configured with eval_steps = 10.
print("🚀 Eğitim başlıyor...")
print(f"📊 Etkili batch size: {PER_DEVICE_TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS} (per_device_batch_size × gradient_accumulation_steps)")
print(f"🎯 Hedef: Loss 1.5'ten daha iyi sonuç")
print(f"⏱️ Maksimum adım: {MAX_STEPS} (önceki: 120)")
print(f"📈 Validation tracking: Her {EVAL_STEPS} adımda bir eval_loss hesaplanacak")

trainer_stats = trainer.train()

In [None]:
from transformers import TextStreamer

# Ask the fine-tuned model the same kind of question as before training so the
# two answers can be compared.
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Single-turn conversation; edit the user message to try a different question.
messages = [
    {
        "role": "user",
        "content": "Askerliğe Elverişli Değildir raporum var. Bedelli askerlikten faydalanabilir miyim?",
    },
]

# Render the conversation with the tokenizer's chat template, append the
# generation prompt, and move the resulting tensor to the GPU.
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt = True, return_tensors = "pt"
).to("cuda")

# Stream the generated text to the cell output; the prompt itself is not echoed.
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(
    input_ids,
    streamer = text_streamer,
    max_new_tokens = 256,
    pad_token_id = tokenizer.eos_token_id,
)

In [None]:
# Resolve the Hugging Face token without hard-coding it in the notebook.
# Lookup order: the HF_TOKEN environment variable first, then Colab secrets.
# HF_TOKEN ends up as a non-empty string, or None when no token is available.
import os

# An unset or empty environment variable counts as "no token".
HF_TOKEN = os.environ.get("HF_TOKEN", "")

if not HF_TOKEN:
    try:
        # Fall back to the Colab secrets store (only available inside Google Colab).
        from google.colab import userdata
        HF_TOKEN = userdata.get('HF_TOKEN')
    except Exception as e:
        # Deliberately best-effort: report the problem and continue without a
        # token so the rest of the notebook can still be run.
        print(f"Error retrieving HF_TOKEN from secrets or environment variables: {e}")
        print("Please ensure the HF_TOKEN is set as a secret in Google Colab.")
        HF_TOKEN = None

# Report the outcome without ever printing the token itself.
if HF_TOKEN:
    print("HF_TOKEN loaded successfully (from secrets or environment variables).")
else:
    print("HF_TOKEN not found. Please ensure it is set as a secret or an environment variable")

In [None]:
# Local saving: write the model and the tokenizer into the same directory.
lora_output_dir = "kamu_v3_instruct_lora_llama-3-8b"
model.save_pretrained(lora_output_dir)
tokenizer.save_pretrained(lora_output_dir)

In [None]:
# Online saving: upload the model and the tokenizer to the same Hub repository.
# The account part of the id is a placeholder ("kullanıcıadı" is Turkish for
# "username"); replace it with your own Hugging Face account before running.
lora_repo_id = "kullanıcıadı/k_v3-instruct-lora-Llama-3.8"
model.push_to_hub(lora_repo_id, token = HF_TOKEN)
tokenizer.push_to_hub(lora_repo_id, token = HF_TOKEN)

In [None]:
# Target for the GGUF export. `hf_repo_name` is a placeholder ("kullanıcıadı"
# is Turkish for "username"): change it to your own Hugging Face account.
# Remember to go to https://huggingface.co/settings/tokens for a token!
hf_repo_name = "kullanıcıadı"
hf_model_name = "kamu-v3.1-llama-3-8b-gguf"
hf_addr = hf_repo_name + "/" + hf_model_name

# Every export below is switched on or off by editing the literal in its `if`.
# As written, only the q4_k_m push to the Hub is enabled.

# Save to 8bit Q8_0
if False:
    model.save_pretrained_gguf(hf_model_name, tokenizer)
if False:
    model.push_to_hub_gguf(hf_addr, tokenizer, token = HF_TOKEN)

# Save to 16bit GGUF
if False:
    model.save_pretrained_gguf(hf_model_name, tokenizer, quantization_method = "f16")
if False:
    model.push_to_hub_gguf(hf_addr, tokenizer, quantization_method = "f16", token = HF_TOKEN)

# Save to q4_k_m GGUF
if False:
    model.save_pretrained_gguf(hf_model_name, tokenizer, quantization_method = "q4_k_m")
if True:
    model.push_to_hub_gguf(hf_addr, tokenizer, quantization_method = "q4_k_m", token = HF_TOKEN)

# Save to multiple GGUF options - much faster if you want multiple!
if False:
    model.push_to_hub_gguf(
        hf_addr,
        tokenizer,
        quantization_method = ["q4_k_m", "q8_0", "q5_k_m"],
        token = HF_TOKEN,
    )