In [None]:
# NOTE(review): no visible cell imports tf-keras directly; presumably it is a
# compatibility requirement of another library used here — confirm it is still needed.
%pip install tf-keras

In [None]:
%pip install -U pip
%pip install -U bitsandbytes
%pip install -U torch torchvision torchaudio https://download.pytorch.org/whl/cu121
%pip install -U transformers peft datasets accelerate triton


In [None]:
from huggingface_hub import login
# Authenticate against the Hugging Face Hub. No token is passed here.
# NOTE(review): presumably required because the checkpoint loaded later is
# access-restricted — confirm.
login()
# Non-interactive alternative. Never commit a real token to the notebook;
# prefer reading it from an environment variable or a secrets store.
#login(token="hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")

In [None]:
from itertools import islice

# Preview up to the first 10 records of the raw JSONL file.
# islice stops at end-of-file, so a file with fewer than 10 lines no longer
# prints trailing blank lines (readline() returns "" once the file is exhausted).
with open("./dataset.jsonl", "r", encoding="utf-8") as f:
    for line in islice(f, 10):
        print(line.strip())

In [None]:
from datasets import load_dataset, Features, Value

# Both columns are declared as strings explicitly instead of relying on
# type inference from the JSON records.
features = Features({column: Value('string') for column in ('input', 'output')})

# The JSONL file is exposed as the "train" split.
dataset = load_dataset("json", data_files={"train": "./dataset.jsonl"}, features=features)

# Show the first training record as a quick sanity check.
print(dataset["train"][0])


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# One checkpoint identifier is shared by the model and its tokenizer so the
# two always come from the same source.
model_name = "google/gemma-2b-it"

model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [None]:
def preprocess(example):
    """Attach a prompt/response training string to a single record.

    Reads ``example["input"]`` and ``example["output"]``, stores the formatted
    text under ``example["text"]`` (the record is modified in place) and
    returns that same record.
    """
    template = "### Input:\n{}\n### Response:\n{}"
    example["text"] = template.format(example["input"], example["output"])
    return example

# Run preprocess over the dataset so each record gains the formatted "text" field.
dataset = dataset.map(preprocess)

def tokenize_function(examples, max_length=128):
    """Tokenize a batch of formatted texts and build causal-LM labels.

    Args:
        examples: A batch mapping whose ``"text"`` entry is a list of strings
            (the function is meant to be used with ``map(..., batched=True)``).
        max_length: Sequence length every example is truncated/padded to.
            Defaults to 128, the value previously hard-coded.

    Returns:
        The tokenizer output with an added ``"labels"`` entry. Labels equal
        ``input_ids`` at real-token positions and -100 at padding positions.
    """
    model_inputs = tokenizer(
        examples["text"],
        truncation=True,
        max_length=max_length,
        padding="max_length"
    )
    # Labels are an unshifted copy of input_ids; the causal-LM model performs
    # the one-token shift itself when computing the loss. Padding positions
    # (attention_mask == 0) are set to -100 so the loss ignores them instead
    # of training the model to predict pad tokens. Using the attention mask
    # keeps this correct for both left- and right-padding tokenizers.
    model_inputs["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(model_inputs["input_ids"], model_inputs["attention_mask"])
    ]
    return model_inputs

# batched=True passes tokenize_function a batch of records per call rather than one record at a time.
tokenized_dataset = dataset.map(tokenize_function, batched=True)



In [None]:
from peft import LoraConfig, get_peft_model

# LoRA hyperparameters for a causal-LM task, one setting per line.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
)

# `model` is rebound to the wrapped model returned by get_peft_model.
# NOTE(review): re-running this cell feeds the already-wrapped model back in;
# reload the base model first if the cell has to be executed again.
model = get_peft_model(model, lora_config)


In [None]:
from transformers import TrainingArguments

# One epoch with a per-device batch size of 1; saving follows the "epoch"
# strategy, fp16 is switched on and experiment reporting is set to "none".
training_args = TrainingArguments(
    output_dir="/content/gemma-2b-finetuned",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    learning_rate=2e-4,
    fp16=True,
    save_strategy="epoch",
    report_to="none",
)


In [None]:
from transformers import Trainer

# Bring together the model, the training configuration and the tokenized
# "train" split. No evaluation dataset is supplied.
trainer = Trainer(
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    model=model,
)

# Run the training loop; as the last expression of the cell, its return
# value is displayed as the cell output.
trainer.train()


In [None]:
# Save the fine-tuned model and the tokenizer into the directory the training
# run was configured with. The previous literal "./content/gemma-2b-finetuned"
# was relative to the working directory and so did not match the absolute
# "/content/gemma-2b-finetuned" output_dir used for training.
save_dir = training_args.output_dir
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
