In [1]:
!pip install -q -U bitsandbytes
!pip install -q -U datasets
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q -U loralib
!pip install -q -U einops

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.7/69.7 MB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for peft (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for accelerate (pyproject.toml) ... [?25l[?25hdon

In [78]:
import os
import torch
# from constant import (
#     MODEL_NAME, R, LORA_ALPHA, TARGET_MODULES, LORA_DROPOUT, BIAS, TASK_TYPE, 
#     MAX_NEW_TOKENS, TEMPERATURE, TOP_P, NUM_RETURN_SEQUENCES
# )
from transformers import (
    AutoConfig, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
)
from peft import (
    LoraConfig, PeftConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
)

# bitsandbytes quantization settings: 4-bit loading, "nf4" quant type,
# double quantization enabled, bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Generation settings; get_generate_config() copies each of these onto the
# model's generation config.
MAX_NEW_TOKENS = 200  # maximum number of newly generated tokens
TEMPERATURE = 0.7  # controls randomness in generation
TOP_P = 0.7  # nucleus sampling parameter
NUM_RETURN_SEQUENCES = 1  # how many generated sequences to return


def get_tokenizer(model_name=None):
    """Load a tokenizer and make it pad with its end-of-sequence token.

    Args:
        model_name: Checkpoint name or path passed to
            ``AutoTokenizer.from_pretrained``. When omitted, the module-level
            ``MODEL_NAME`` is used, which keeps zero-argument calls working
            exactly as before.

    Returns:
        The loaded tokenizer, with ``pad_token`` set to its ``eos_token``.

    Raises:
        NameError: If ``model_name`` is omitted and ``MODEL_NAME`` has not
            been defined.
    """
    if model_name is None:
        # NOTE(review): MODEL_NAME is not assigned in the visible cells (its
        # import from `constant` is commented out), so pass `model_name`
        # explicitly or define MODEL_NAME before calling without arguments.
        model_name = MODEL_NAME
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Reuse the end-of-sequence token for padding.
    tokenizer.pad_token = tokenizer.eos_token
    return tokenizer

def print_trainable_parameters(model):
    """Print how many of ``model``'s parameters are trainable.

    Iterates ``model.named_parameters()``, sums ``numel()`` over every
    parameter and over those with ``requires_grad`` set, and prints both
    totals together with the trainable percentage.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs whose parameters provide ``numel()``
            and ``requires_grad``.

    Returns:
        None. The summary is written to standard output.
    """
    trainable_params = 0
    all_param = 0

    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count

    # A model without parameters would otherwise raise ZeroDivisionError.
    percent = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainables%: {percent}"
    )

def get_generate_config(tokenizer, model):
    """Apply the notebook's generation settings to ``model.generation_config``.

    The model's existing generation config object is modified in place and
    then returned; no copy is made.

    Args:
        tokenizer: Object exposing ``eos_token_id``; that id is used for both
            the pad token id and the end-of-sequence token id.
        model: Object exposing a mutable ``generation_config`` attribute.

    Returns:
        The same ``model.generation_config`` object, after the updates.
    """
    # NOTE(review): `do_sample` is left untouched here; whether temperature and
    # top_p actually take effect presumably depends on that flag — confirm.
    config = model.generation_config
    eos_id = tokenizer.eos_token_id
    overrides = {
        "max_new_tokens": MAX_NEW_TOKENS,
        "temperature": TEMPERATURE,
        "top_p": TOP_P,
        "num_return_sequences": NUM_RETURN_SEQUENCES,
        "pad_token_id": eos_id,
        "eos_token_id": eos_id,
    }
    for field, value in overrides.items():
        setattr(config, field, value)
    return config

# Placeholder entry point: this cell only defines configuration values and
# helper functions, so there is nothing to execute here.
if __name__ == "__main__":
    pass

In [82]:
import torch
import re
# from config_model import (
#     get_tokenizer,
#     get_generate_config,
#     bnb_config,
#     lora_config
# )
# from constant import PROMTP_ANS_FORMAT, FINETUNED_MODEL
# from config_model import bnb_config
from transformers import (
    AutoConfig, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
)
from peft import (
    LoraConfig, PeftConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
)

# Chat-style prompt template with two positional `{}` slots: the first is
# filled with the system text, the second with the user message. The template
# ends right after the assistant header so the model continues from there.
PROMTP_ANS_FORMAT = (
    "\n"
    "<|im_start|>system\n"
    "{}\n"
    "<|im_end|>\n"
    "<|im_start|>user\n"
    "{}\n"
    "<|im_end|>\n"
    "<|im_start|>assistant\n"
)

# bitsandbytes quantization settings: 4-bit loading, "nf4" quant type,
# double quantization enabled, bfloat16 as the compute dtype. Same arguments
# as the bnb_config built in the earlier cell; passed as `quantization_config`
# when the base model is loaded further down.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

def make_ans_prompt(question, choices=None):
    """Build the full chat prompt for a question, optionally with choices.

    Args:
        question: The question text.
        choices: Optional text listing the answer choices. When given, it is
            appended to the question under a "### Các lựa chọn" heading and
            the system text stored under key ``"2"`` of ``IN_CONTEXT_PROMPT``
            is used; otherwise key ``"1"`` is used and nothing is appended.

    Returns:
        ``PROMTP_ANS_FORMAT`` filled with the selected system text and the
        whitespace-stripped user message.
    """
    if choices is None:
        system_key, choices_block = "1", ""
    else:
        system_key = "2"
        # The trailing newline and indentation are removed by strip() below.
        choices_block = f"### Các lựa chọn\n{choices}\n        "

    user_message = (question + "\n" + choices_block).strip()
    return PROMTP_ANS_FORMAT.format(IN_CONTEXT_PROMPT[system_key], user_message)

def remove_duplicate_sentences(text):
    """Drop repeated sentences from ``text``, keeping each first occurrence.

    Sentences are the pieces obtained by splitting on whitespace that follows
    ``.``, ``!`` or ``?``. Comparison is exact string equality, and the kept
    sentences are re-joined with single spaces.

    Args:
        text: The text to de-duplicate.

    Returns:
        The text with every later copy of an already-seen sentence removed.
    """
    # Split into sentences at whitespace preceded by sentence punctuation.
    pieces = re.split(r'(?<=[.!?])\s+', text)
    # dict keys are unique and preserve insertion order, so only the first
    # appearance of each sentence survives, in its original position.
    return " ".join(dict.fromkeys(pieces))


def inference(tokenizer, model, question, choices, generation_config, device="cpu"):
    """Generate an answer for ``question`` and return it as text.

    The prompt is built with ``make_ans_prompt``, tokenized, moved to
    ``device`` and passed to ``model.generate``. Only the tokens produced
    after the prompt are decoded, and repeated sentences in that answer are
    removed with ``remove_duplicate_sentences``.

    Args:
        tokenizer: Callable tokenizer returning an encoding with ``input_ids``
            and ``attention_mask`` that supports ``.to(device)``; must also
            provide ``decode``.
        model: Model exposing ``generate``.
        question: The question text.
        choices: Optional answer-choices text, or ``None``.
        generation_config: Generation config forwarded to ``model.generate``.
        device: Device the encoded prompt is moved to before generation.

    Returns:
        The decoded answer text with duplicate sentences removed.
    """
    prompt = make_ans_prompt(question, choices)
    encoding = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.inference_mode():
        outputs = model.generate(
            input_ids=encoding.input_ids,
            attention_mask=encoding.attention_mask,
            generation_config=generation_config,
        )

    # Assumes a decoder-only model whose generate() output starts with the
    # prompt tokens: skip them so only the continuation is decoded. This avoids
    # searching the decoded text for a literal "<|im_start|> assistant" marker,
    # which raised IndexError whenever the tokenizer did not render the marker
    # exactly that way, and keeps de-duplication from dropping answer sentences
    # that merely repeat the prompt.
    prompt_length = encoding.input_ids.shape[-1]
    generated_ids = outputs[0][prompt_length:]
    answer = tokenizer.decode(generated_ids, skip_special_tokens=True)
    return remove_duplicate_sentences(answer)

def make_inference():
    """Ask for a question on stdin, run the model, and print the answer.

    Reads the question and an optional choices line with ``input()``, then
    calls ``inference`` using the notebook-level ``tokenizer``, ``model``,
    ``generation_config`` and ``device`` and prints the result.

    Returns:
        None.
    """
    # USER INPUT
    question = input("Input your question: ").strip()
    # An empty (or whitespace-only) reply means no choices were supplied.
    choices = input("Choices (Optional): ").strip() or None

    print("Generating Answer...")
    print(inference(tokenizer, model, question, choices, generation_config, device=device))

if __name__ == "__main__":
    # Loads the quantized base model, its tokenizer and the PEFT adapter into
    # notebook-level names (model, tokenizer, generation_config, device) that
    # make_inference() reads.
    # NOTE(review): FINETUNED_MODEL is not assigned in the visible cells (its
    # import from `constant` is commented out) — define it before running.
    
    # LOAD MODEL
    print("Loading Model")
    # Adapter config; it supplies the base checkpoint name used below.
    config = PeftConfig.from_pretrained(FINETUNED_MODEL)

    # Base model, loaded with the bitsandbytes quantization config.
    # NOTE(review): trust_remote_code=True presumably allows code shipped with
    # the checkpoint to run at load time — confirm the base model is trusted.
    model = AutoModelForCausalLM.from_pretrained(
        config.base_model_name_or_path,
        return_dict=True,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True
    )

    # Tokenizer of the base checkpoint; pads with its end-of-sequence token.
    tokenizer=AutoTokenizer.from_pretrained(config.base_model_name_or_path)
    tokenizer.pad_token = tokenizer.eos_token

    # Wrap the base model with the fine-tuned adapter weights.
    model = PeftModel.from_pretrained(model, FINETUNED_MODEL)

    # get_generate_config is defined in an earlier cell; it updates and returns
    # the model's generation config.
    generation_config = get_generate_config(tokenizer, model)

    # Device the encoded prompt is moved to inside inference().
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    

Loading Model


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [83]:
# Prompt for a question (and optional choices), then print the model's answer.
make_inference()

Input your question:  Natalia đã bán kẹp tóc cho 48 người bạn của cô ấy vào tháng 4, và sau đó cô ấy đã bán nửa số lượng kẹp tóc đó vào tháng 5. Natalia đã bán tổng cộng bao nhiêu kẹp tóc trong tháng 4 và tháng 5?
Choices (Optional):  


Generating Answer...

Natalia đã bán được 48 / 2 = 24 cái kẹp tóc trong tháng 5. Tổng cộng, cô ấy đã bán được 48 + 24 = 72 cái kẹp tóc. Natalia đã bán được 72 cái kẹp tóc trong tháng 4 và tháng 5. N
