In [None]:
!pip install accelerate peft bitsandbytes transformers trl

Collecting accelerate
  Downloading accelerate-0.32.1-py3-none-any.whl (314 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.1/314.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.11.1-py3-none-any.whl (251 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Collecting trl
  Downloading trl-0.9.6-py3-none-any.whl (245 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.8/245.8 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
Collecting datasets (from trl)
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m16.0 M

In [None]:
!git clone https://github.com/tcstrength/lawyer-assist.git

fatal: destination path 'lawyer-assist' already exists and is not an empty directory.


In [None]:
import os
import json
import torch
import pandas as pd
from pathlib import Path
from datasets import load_dataset, Dataset
from peft import LoraConfig, AutoPeftModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from transformers import TextStreamer
from trl import SFTTrainer

No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'


In [None]:
# Load the tokenizer published under the TinyLlama chat model id.
# NOTE(review): get_model_and_tokenizer() performs the same two steps and its
# result later rebinds `tokenizer`, so this cell looks redundant — confirm
# before removing it.
tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

In [None]:
# --- Run configuration -------------------------------------------------------
# Model id that is loaded for fine-tuning and again for the adapter merge.
# Alternative starting point, kept for reference:
# BASE_MODEL = "tcstrength/tinyllama-lawyer-assist-v0"
BASE_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Version tag embedded in the local output directory name.
OUTPUT_VERSION = "1.0"

# Directory name handed to TrainingArguments(output_dir=...).
OUTPUT_MODEL = "tinyllama-fine-tuned-v" + OUTPUT_VERSION

# Hub repository name (without the owner prefix).
# NOTE(review): no cell visible here reads this constant — confirm it is still needed.
HUGGING_FACE_MODEL = "tinyllama-lawyer-assist-v0"

In [None]:
def get_model_and_tokenizer(model_name):
    """Load a 4-bit quantized causal LM together with its tokenizer.

    Args:
        model_name: Model id or path handed to both ``from_pretrained`` calls.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    # The tokenizer is loaded first; EOS doubles as the padding token.
    tok = AutoTokenizer.from_pretrained(model_name)
    tok.pad_token = tok.eos_token

    # 4-bit NF4 weights, double quantization, float16 compute dtype.
    quant_kwargs = {
        "load_in_4bit": True,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_compute_dtype": "float16",
        "bnb_4bit_use_double_quant": True,
    }
    quantized_lm = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=BitsAndBytesConfig(**quant_kwargs),
        device_map="auto",
    )

    # Config overrides applied before the model is handed to the trainer.
    lm_config = quantized_lm.config
    lm_config.use_cache = False
    lm_config.pretraining_tp = 1

    return quantized_lm, tok

In [None]:
def load_dataset(data_dir="/content/lawyer-assist/data/qna"):
    """Build the chat-formatted training set from the Q&A JSON files.

    Every ``*.json`` file in ``data_dir`` is expected to hold a top-level
    ``"data"`` list of records with ``"question"`` and ``"answer"`` fields.
    Each record is rendered into a single ``text`` string::

        <|user|>\\n{question}</s>\\n<|assistant|>\\n{answer}</s>

    Rows where the question, the answer or the resulting text is missing are
    dropped.

    NOTE: this definition rebinds the name ``load_dataset`` that the import
    cell brings in from ``datasets``; the name is kept so existing callers of
    this helper continue to work.

    Args:
        data_dir: Directory containing the Q&A JSON files.

    Returns:
        datasets.Dataset: Built from the ``question``, ``answer`` and ``text``
        columns of the filtered frame.

    Raises:
        FileNotFoundError: If no records are found under ``data_dir``.
    """
    records = []
    # sorted() keeps the row order independent of directory listing order.
    for json_path in sorted(Path(data_dir).glob("*.json")):
        # Context manager closes the handle; explicit UTF-8 because the
        # corpus is Vietnamese text and must not depend on the locale default.
        with open(json_path, encoding="utf-8") as fh:
            records.extend(json.load(fh)["data"])

    if not records:
        raise FileNotFoundError(f"No Q&A records found under {str(data_dir)!r}")

    qa = pd.DataFrame(records)[["question", "answer"]]
    # A missing question or answer propagates to a missing `text`, so the
    # filter below removes the row.
    qa = qa.assign(
        text=(
            "<|user|>\n"
            + qa["question"]
            + "</s>\n"
            + "<|assistant|>\n"
            + qa["answer"]
            + "</s>"
        )
    )
    qa = qa[qa.notna().all(axis=1)]
    qa.info()
    return Dataset.from_pandas(qa)
dataset = load_dataset()

<class 'pandas.core.frame.DataFrame'>
Index: 2149 entries, 0 to 2154
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   question  2149 non-null   object
 1   answer    2149 non-null   object
 2   text      2149 non-null   object
dtypes: object(3)
memory usage: 67.2+ KB


In [None]:
# Inspect the first record to eyeball the chat-formatted `text` field.
dataset[0]

{'question': 'Mức phạt tiền đối với hành vi niêm yết tỷ giá mua, bán ngoại tệ nhưng hình thức, nội dung niêm yết tỷ giá không rõ ràng là bao nhiêu?',
 'answer': 'Mức phạt tiền đối với hành vi niêm yết tỷ giá mua, bán ngoại tệ nhưng hình thức, nội dung niêm yết tỷ giá không rõ ràng từ 20.000.000 đồng đến 40.000.000 đồng. Điều này được quy định tại Điểm b Khoản 1 Điều 24 Nghị định 96/2014/NĐ-CP.',
 'text': '<|user|>\nMức phạt tiền đối với hành vi niêm yết tỷ giá mua, bán ngoại tệ nhưng hình thức, nội dung niêm yết tỷ giá không rõ ràng là bao nhiêu?</s>\n<|assistant|>\nMức phạt tiền đối với hành vi niêm yết tỷ giá mua, bán ngoại tệ nhưng hình thức, nội dung niêm yết tỷ giá không rõ ràng từ 20.000.000 đồng đến 40.000.000 đồng. Điều này được quy định tại Điểm b Khoản 1 Điều 24 Nghị định 96/2014/NĐ-CP.</s>',
 '__index_level_0__': 0}

In [None]:
# Load the quantized base model and rebind `tokenizer` to the one returned
# alongside it.
# NOTE(review): the OSError recorded under this cell names
# 'tcstrength/tinyllama-lawyer-assist-v0', which is not the current BASE_MODEL
# value — looks like stale output from an earlier run; re-run to confirm.
model, tokenizer = get_model_and_tokenizer(BASE_MODEL)



OSError: Can't load tokenizer for 'tcstrength/tinyllama-lawyer-assist-v0'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'tcstrength/tinyllama-lawyer-assist-v0' is the correct path to a directory containing all relevant files for a LlamaTokenizerFast tokenizer.

In [None]:
# Ask PyTorch to release its cached, currently unused GPU memory before the
# trainer is set up.
torch.cuda.empty_cache()

In [None]:
# LoRA adapter settings for the causal-LM fine-tune.
# No target modules are listed here, so the library's own default selection
# applies — TODO confirm which layers that covers for this model.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    # Adapter rank and scaling factor (alpha is set equal to the rank).
    r=8,
    lora_alpha=8,
    # Regularisation; bias terms are not trained.
    lora_dropout=0.05,
    bias="none",
)

In [None]:
# Optional guard against overwriting an earlier run (disabled):
# if Path(OUTPUT_MODEL).exists():
#     raise RuntimeError("Model already exists, please update model version.")

training_arguments = TrainingArguments(
    # Checkpoints and logs are written under this directory.
    output_dir=OUTPUT_MODEL,
    # 3 sequences per step x 4 accumulated steps = 12 sequences per optimizer
    # update on each device.
    per_device_train_batch_size=3,
    gradient_accumulation_steps=4,
    optim="paged_adamw_32bit",
    # Was 1e-2, which is far above the ~1e-4..2e-4 range normally used for
    # AdamW LoRA fine-tuning and risks divergence.
    # NOTE(review): adjust if 1e-2 was a deliberate choice.
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    # One checkpoint per epoch, training loss logged every 24 steps.
    save_strategy="epoch",
    logging_steps=24,
    num_train_epochs=3,
    fp16=True,
    # push_to_hub=True
)

In [None]:
# Supervised fine-tuning trainer wiring the quantized model, the LoRA config
# and the chat-formatted dataset together.
trainer = SFTTrainer(
    # What to train and how.
    model=model,
    tokenizer=tokenizer,
    peft_config=peft_config,
    args=training_arguments,
    # What to train on: the pre-rendered `text` column, one example per
    # sequence (no packing), with a 1024 sequence-length setting.
    train_dataset=dataset,
    dataset_text_field="text",
    packing=False,
    max_seq_length=1024,
)

In [None]:
# Run the fine-tuning loop. With save_strategy="epoch" the checkpoints are
# written under the OUTPUT_MODEL directory.
trainer.train()

In [None]:
from peft import AutoPeftModelForCausalLM, PeftModel
from transformers import AutoModelForCausalLM
import torch
import os

# Reload the base model in float16 without a quantization config, so the LoRA
# weights can be merged into it. `trust_remote_code` is not passed: the
# training cell loads the same BASE_MODEL without it, so it is not required.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Locate the newest checkpoint under the training output directory instead of
# relying on a hard-coded "/content/.../checkpoint-250" path, which goes stale
# whenever OUTPUT_VERSION or the training schedule changes.
# NOTE(review): if a specific intermediate checkpoint is wanted, set
# `model_path` explicitly.
checkpoint_dirs = sorted(
    (
        ckpt
        for ckpt in Path(OUTPUT_MODEL).glob("checkpoint-*")
        if ckpt.name.rsplit("-", 1)[-1].isdigit()
    ),
    key=lambda ckpt: int(ckpt.name.rsplit("-", 1)[-1]),
)
if not checkpoint_dirs:
    raise FileNotFoundError(
        f"No checkpoint-* directory found under {OUTPUT_MODEL!r}; run training first."
    )
model_path = str(checkpoint_dirs[-1])

# Attach the trained adapter, then fold it into the base weights. After this
# `model` is a plain (non-PEFT) model again.
peft_model = PeftModel.from_pretrained(model, model_path, device_map="auto")
model = peft_model.merge_and_unload()

In [None]:
from google.colab import userdata
# The Hub write token is read from the Colab secret store, never hard-coded.
token = userdata.get('HF_TOKEN_WRITE')

# Build the repository id from the config constant rather than repeating the
# literal; this evaluates to "tcstrength/tinyllama-lawyer-assist-v0".
HUB_REPO_ID = f"tcstrength/{HUGGING_FACE_MODEL}"

# Uploading the model weights is off by default, matching the previous state
# of this cell where the call was commented out. Set to True to publish them.
PUSH_MODEL_WEIGHTS = False
if PUSH_MODEL_WEIGHTS:
    model.push_to_hub(repo_id=HUB_REPO_ID, token=token)

# Publish the tokenizer files; the returned CommitInfo is shown as cell output.
tokenizer.push_to_hub(
    repo_id=HUB_REPO_ID,
    token=token
)

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/tcstrength/tinyllama-lawyer-assist-v0/commit/15d9d83702b5e0e395572999da99826a785a9ccd', commit_message='Upload tokenizer', commit_description='', oid='15d9d83702b5e0e395572999da99826a785a9ccd', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
from transformers import GenerationConfig
from time import perf_counter

def formatted_prompt(question)-> str:
    """Wrap a question in the chat markup used for generation.

    The result is the user turn followed by an opened assistant turn:
    ``<|user|>\\n{question}</s>\\n<|assistant|>``.

    NOTE(review): the training text built in ``load_dataset`` puts a newline
    after ``<|assistant|>``; this prompt does not — confirm that is intended.

    Args:
        question: The user question to embed.

    Returns:
        The prompt string.
    """
    template = "<|user|>\n{}</s>\n<|assistant|>"
    return template.format(question)

def generate_response(user_input, max_new_tokens=1024):
    """Stream the model's answer to ``user_input`` to standard output.

    Uses the notebook-level ``model`` and ``tokenizer``. The text is printed
    through a ``TextStreamer`` as it is generated; nothing is returned.

    Args:
        user_input: The question to ask.
        max_new_tokens: Upper bound on generated tokens (default 1024, the
            value that was previously hard-coded).
    """
    prompt = formatted_prompt(user_input)
    generation_config = GenerationConfig(
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.eos_token_id,
    )
    streamer = TextStreamer(tokenizer)

    # Tokenize once (the earlier duplicate tokenization was discarded) and
    # place the tensors on the model's own device instead of a hard-coded
    # 'cuda', so the call also works when the model is not on the GPU.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    model.generate(**inputs, streamer=streamer, generation_config=generation_config)

In [None]:
# Print the reference question/answer for one sample, then stream the model's
# own answer to the same question for a side-by-side check.
index = 0
sample = dataset[index]
print(sample["question"])
print(sample["answer"])
generate_response(user_input=sample["question"])