In [None]:
!pip install -U bitsandbytes accelerate transformers peft datasets


In [2]:
import gc

import torch

# Start from a clean slate: run a garbage-collection pass first, then ask
# PyTorch to release the CUDA memory held by its caching allocator.
gc.collect()
torch.cuda.empty_cache()


In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

# Hub identifier of the base checkpoint loaded below.
model_id = "codellama/CodeLlama-7b-Instruct-hf"

# 4-bit quantization settings for QLoRA: NF4 quant type, double quantization,
# bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the tokenizer and reuse its EOS token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

# Load the model with the quantization config; device placement is delegated
# to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bnb_config,
)


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/646 [00:00<?, ?B/s]

2025-09-18 17:34:00.954656: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1758216841.278742      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1758216841.379711      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [4]:
from peft import LoraConfig, get_peft_model

# LoRA adapter hyperparameters for a causal-LM task:
# rank 16, alpha 32, dropout 0.05, no bias parameters.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Attach the adapter to the model. This rebinds `model`, so re-running the
# cell wraps the already-wrapped model again.
model = get_peft_model(model, lora_config)


In [5]:
from datasets import load_dataset

# Read every JSON file in the input folder into a single Dataset.
dataset = load_dataset(
    "json",
    split="train",
    data_files="/kaggle/input/eia-vietnamese-to-python-code/Data/*.json",
)

# Split into 90% train / 10% test; the fixed seed makes the split repeatable.
dataset = dataset.train_test_split(seed=42, test_size=0.1)

print(dataset)

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['instruction', 'response'],
        num_rows: 1179
    })
    test: Dataset({
        features: ['instruction', 'response'],
        num_rows: 132
    })
})


In [6]:
from datasets import load_dataset


def preprocess(example):
    """Turn one instruction/response record into tokenized training features.

    The record's ``instruction`` and ``response`` fields are rendered into a
    single ``### Instruction: ... ### Response: ...`` string, which is then
    tokenized with truncation at 512 tokens. ``labels`` is set to a copy of
    ``input_ids``.

    Args:
        example: Mapping with ``"instruction"`` and ``"response"`` entries.

    Returns:
        The tokenizer output for the rendered text, with an added ``"labels"``
        entry.

    NOTE(review): no end-of-sequence marker is appended to the text here —
    confirm whether the tokenizer adds one, otherwise training examples carry
    no explicit stop signal.
    """
    instruction = example["instruction"]
    response = example["response"]
    prompt = f"### Instruction:\n{instruction}\n### Response:\n{response}"

    # A single example is tokenized per call, so "longest" padding has only
    # this one sequence to compare against.
    encoded = tokenizer(prompt, truncation=True, max_length=512, padding="longest")

    # Independent copy so that labels and input_ids are separate list objects.
    encoded["labels"] = list(encoded["input_ids"])
    return encoded

# Run preprocess over every example of the split dataset (train and test).
tokenized_dataset = dataset.map(preprocess)


Map:   0%|          | 0/1179 [00:00<?, ? examples/s]

Map:   0%|          | 0/132 [00:00<?, ? examples/s]

In [7]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
import os
# Hyperparameters for the fine-tuning run.
# Experiment-tracking integrations (e.g. Weights & Biases) are switched off via
# report_to="none"; the WANDB_DISABLED environment variable used previously is
# deprecated and triggers a warning from transformers.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,   # effective batch of 8 examples per optimizer step
    learning_rate=1e-4,
    num_train_epochs=10,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    logging_steps=10,
    save_strategy="epoch",           # one checkpoint at the end of every epoch
    bf16=True,                       # use bf16 if the GPU supports it
    optim="paged_adamw_32bit",
    report_to="none",
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [8]:
# Build the Trainer on the tokenized train split only (no eval dataset is
# passed) with a causal-LM collator (mlm=False).
# NOTE(review): the tokenizer's pad token was set to its EOS token when it was
# loaded — confirm how this collator treats pad ids when it builds labels.
trainer = Trainer(
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    train_dataset=tokenized_dataset["train"],
    args=training_args,
    model=model,
)

# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()


Step,Training Loss
10,1.6798
20,1.6718
30,1.5901
40,1.4515
50,1.2867
60,0.9654
70,0.7849
80,0.6528
90,0.5718
100,0.4838


TrainOutput(global_step=1480, training_loss=0.21177673972136266, metrics={'train_runtime': 24646.3431, 'train_samples_per_second': 0.478, 'train_steps_per_second': 0.06, 'total_flos': 4.63551747969024e+16, 'train_loss': 0.21177673972136266, 'epoch': 10.0})

In [9]:
# Write the trained model and the tokenizer side by side into ./exmodel.
trainer.save_model(output_dir="./exmodel")
tokenizer.save_pretrained(save_directory="./exmodel")

('./exmodel/tokenizer_config.json',
 './exmodel/special_tokens_map.json',
 './exmodel/chat_template.jinja',
 './exmodel/tokenizer.model',
 './exmodel/added_tokens.json',
 './exmodel/tokenizer.json')

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig


# Load the fine-tuned model from the directory that trainer.save_model() and
# tokenizer.save_pretrained() wrote earlier in this notebook.
# The path used previously (results/checkpoint-65) does not correspond to this
# run: checkpoints are saved once per epoch and the run logged 1480 steps over
# 10 epochs (148 steps per epoch), so no step-65 checkpoint is written.
model_id = "./exmodel"

# 4-bit NF4 quantization for inference with float16 compute; fp32 CPU offload
# is enabled so that modules that do not fit on the GPU can be placed on CPU.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
    llm_int8_enable_fp32_cpu_offload=True
)

max_memory = {0: "12GiB", "cpu": "48GiB"}  # T4 only has 16GB; leave a 4GB buffer


model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    max_memory=max_memory
)

# The tokenizer files were saved into the same directory as the model.
tokenizer = AutoTokenizer.from_pretrained(model_id)


In [None]:
def generate_code(prompt, max_new_tokens=200):
    """Generate a completion for ``prompt`` and return only the new text.

    Decoding is deterministic: sampling is disabled and beam search with two
    beams is used.

    Args:
        prompt: Fully formatted prompt string to feed to the model.
        max_new_tokens: Upper bound on the number of tokens to generate.

    Returns:
        The decoded continuation, with special tokens removed. The prompt
        itself is not included in the returned string.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        num_beams=2,
    )
    # The generated sequence starts with the prompt tokens; slice them off so
    # callers receive the answer instead of an echo of the prompt template.
    prompt_length = inputs["input_ids"].shape[-1]
    completion_ids = outputs[0][prompt_length:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True)


In [None]:
import gradio as gr

def chat(instruction):
    """Wrap a user instruction in the prompt template and generate a reply.

    The template is the same one ``preprocess`` uses for the training
    examples: a single newline separates the instruction from the
    ``### Response:`` header. An extra blank line there would give the model a
    prompt layout it was not fine-tuned on.

    Args:
        instruction: Free-text instruction typed by the user.

    Returns:
        Whatever ``generate_code`` returns for the formatted prompt.
    """
    prompt = f"### Instruction:\n{instruction}\n### Response:\n"
    return generate_code(prompt)

# Single-textbox web UI: the typed instruction goes to chat() and the returned
# string is shown as plain text.
demo = gr.Interface(
    title="Excel AI Assistant",
    fn=chat,
    inputs=gr.Textbox(placeholder="Nhập lệnh: Tính tổng cột A...", lines=3),
    outputs="text",
)

# Launch with share=True so Gradio also creates a public share link.
demo.launch(share=True)