In [1]:
import os
import torch
import warnings
import pandas as pd
from datetime import datetime
from datasets import load_dataset

from transformers import AutoModelForCausalLM, LlamaTokenizer, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, PeftModel # PEFT = Parameter-Efficient Fine-Tuning
from trl import SFTTrainer # trl = Transformer Reinforcement Learning

warnings.filterwarnings("ignore")

# base model: https://github.com/ymcui/Chinese-LLaMA-Alpaca-2
model_name = "/home/wirl/ytc/chinese-alpaca-2-7b_自己下載的"

tokenizer = LlamaTokenizer.from_pretrained(model_name, local_files_only=True, legacy=True)

# 7B 沒辦法不 quantize 跑
# 8bit 的無法 fine tune
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.float16,
# )
# model = AutoModelForCausalLM.from_pretrained(model_name, local_files_only=True, quantization_config=bnb_config, device_map="auto")

model = AutoModelForCausalLM.from_pretrained(model_name, local_files_only=True, load_in_4bit=True, device_map="auto", pretraining_tp=1)
model.config.use_cache = False

if torch.cuda.is_available():
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/wirl/anaconda3/envs/s_paper/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cpu.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /home/wirl/anaconda3/envs/s_paper/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cpu.so...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
You are using the legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Using GPU: NVIDIA GeForce RTX 3080 Ti


In [2]:
def get_response(model, prompt_template, sentence_text, remove_input=True):
    device = "cuda:0"
    full_prompt = prompt_template.format(sentence_text)

    inputs = tokenizer(full_prompt, return_tensors="pt").to(device)

    # outputs = model.generate(**inputs, max_new_tokens=len(sentence_text))
    outputs = model.generate(**inputs, max_new_tokens=len(sentence_text), do_sample=True, top_k=30, top_p=0.95, num_return_sequences=1) # for translation, https://huggingface.co/docs/transformers/tasks/translation#inference

    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

    if remove_input:
        # 從 generate 出來的 output 中刪除 input text 的部分
        cleaned_output = decoded_output.replace(full_prompt, "")
        return cleaned_output
    else:
        return decoded_output

In [3]:
# 先看一下還沒 tune 之前的效果，不能翻譯
sentence = "我国古代品评书画艺术的三个等级，即神品、妙品、能品。"
sentence1 = "台中后里位在台湾南北的交会点，隐藏着许多全国知名的景点。"
sentence2 = "我干什么不干你事。"
sentence3 = "我发现太后的头发很干燥。"
sentence4 = "芋头发芽了。"
sentence5 = "再坐在电脑前面 我头发都没了T_T。"
sentence6 = "他觉得丑时人生的通常都比较丑。"

translate_prompt_template = """
### Instruction:
翻譯成繁體中文: {}
### Response:
"""
device = "cuda:0"

print(get_response(model, translate_prompt_template, sentence))
print('---')
print(get_response(model, translate_prompt_template, sentence1))
print('---')
print(get_response(model, translate_prompt_template, sentence2))
print('---')
print(get_response(model, translate_prompt_template, sentence3))
print('---')
print(get_response(model, translate_prompt_template, sentence4))
print('---')
print(get_response(model, translate_prompt_template, sentence5))
print('---')
print(get_response(model, translate_prompt_template, sentence6))
print('---')

神品、妙品、能品是中国古代品评书画艺术的重要等级。在古代，品评书画的等级非常重要
---
台中后里位在台湾南北的交会点，隐藏着许多全国知名的景点。
---
你干嘛不把这些事情留给我来做
---
太后的头发很干燥，我应该做些什么？

---
芋头发芽了
---
为什么头发？电脑会吸走了我的头发？@@
### Instruction: 
---
他觉得丑时人生的通常都比较丑。 他觉得丑时
---


In [4]:
# stanford_alpaca 的 prompt template
# ref: https://zhuanlan.zhihu.com/p/647149346
PROMPT_DICT = {
    "prompt_input": (
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{}\n\n### Input:\n{}\n\n### Response:{}"
    ),
    "prompt_no_input": (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{}\n\n### Response:{}"
    ),
}

def format_instruction_prompt(example):
    output_texts = []

    for i in range(len(example['instruction'])):
        if example["input"]:
            text = PROMPT_DICT['prompt_input'].format(example["instruction"][i], example["input"][i], example["output"][i])
        else:
            text = PROMPT_DICT['prompt_no_input'].format(example["instruction"][i], example["output"][i])
        output_texts.append(text)

    return output_texts

In [5]:
# dataset = load_dataset("csv", data_files='instruction_datasets/chinese_cultural_history.csv', split='train')
train_dataset = load_dataset("csv", data_files='instruction_datasets/ministry_of_education_revised_dictionary.csv', split='train')

print("---Train dataset---")
print(train_dataset)

Using custom data configuration default-179e62d222f253db
Found cached dataset csv (/home/wirl/.cache/huggingface/datasets/csv/default-179e62d222f253db/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


---Train dataset---
Dataset({
    features: ['category', 'instruction', 'input', 'output'],
    num_rows: 365461
})


In [6]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [7]:
# from peft import prepare_model_for_kbit_training

save_lora = True

"""--- Training arguments ---"""
output_dir = "./results"

per_device_train_batch_size = 4
gradient_accumulation_steps = 4
optim = "paged_adamw_32bit"
save_steps = 100
logging_steps = 10
learning_rate = 2e-4
max_grad_norm = 0.3
max_steps = 100
warmup_ratio = 0.03
lr_scheduler_type = "constant"

training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    fp16=True,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
    lr_scheduler_type=lr_scheduler_type,
    report_to="none" # don't want to use wandb
)

"""--- LoRA config ---"""
lora_alpha = 16
lora_dropout = 0.1
lora_r = 64

lora_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM"
)

print_trainable_parameters(model)


"""--- SFT Trainer (Supervised Fine-tuning Trainer) ---"""
max_seq_length = 512

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    formatting_func=format_instruction_prompt,
    peft_config=lora_config,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
)

for name, module in trainer.model.named_modules():
    if "norm" in name:
        # module = module.to(torch.float32)
        module = module.to(torch.float16)

trainer.train()

if save_lora:
    now = datetime.now()
    # dt_string = now.strftime("%Y_%m_%d-%H_%M_%S")
    lora_adapter_name = "chinese_alpaca2_lora"

    # save adapter only
    output_path = os.path.join(output_dir, lora_adapter_name)

    trainer.model.save_pretrained(output_path)
    tokenizer.save_pretrained(output_path)

trainable params: 453251072 || all params: 3691253760 || trainable%: 12.2790548000688


Loading cached processed dataset at /home/wirl/.cache/huggingface/datasets/csv/default-179e62d222f253db/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-5a2bbf1ca41b6a7c.arrow


Step,Training Loss
10,2.1741
20,1.8353
30,1.5754
40,1.2023
50,0.8152
60,1.4615
70,1.4436
80,1.4156
90,1.1558
100,0.684


In [8]:
# 剛 tune 完所以要可以翻譯
sentence = "我国古代品评书画艺术的三个等级，即神品、妙品、能品。"
sentence1 = "台中后里位在台湾南北的交会点，隐藏着许多全国知名的景点。"
sentence2 = "我干什么不干你事。"
sentence3 = "我发现太后的头发很干燥。"
sentence4 = "芋头发芽了。"
sentence5 = "再坐在电脑前面 我头发都没了T_T。"
sentence6 = "他觉得丑时人生的通常都比较丑。"

translate_prompt_template = """
### Instruction:
翻譯成繁體中文: {}
### Response:
"""
device = "cuda:0"

print(get_response(model, translate_prompt_template, sentence))
print('---')
print(get_response(model, translate_prompt_template, sentence1))
print('---')
print(get_response(model, translate_prompt_template, sentence2))
print('---')
print(get_response(model, translate_prompt_template, sentence3))
print('---')
print(get_response(model, translate_prompt_template, sentence4))
print('---')
print(get_response(model, translate_prompt_template, sentence5))
print('---')
print(get_response(model, translate_prompt_template, sentence6))
print('---')

我國古代品評書畫藝術的三個等級，即神品、妙品、能品。
##
---
台中後里位在台灣南北的交會點，隱藏著許多全國知名的景點。

### Inst
---
為什麼不處理那些不是你事兒
---
我發現太后的頭髮很乾燥。

---
芋头发芽了。
---
再坐在電腦前面。我頭髮都沒了T_T。
###
---
他認為醜時的人生的通常都很醜。
### Instruction
---


In [9]:
now = datetime.now()
dt_string = now.strftime("%Y_%m_%d-%H_%M_%S")

# lora_adapter_name = f"chinese_alpaca2_{dt_string}"
# lora_adapter_name = "chinese_alpaca2_2023_09_10-13_42_45"
# lora_adapter_name = "chinese_alpaca2_2023_09_18-15_40_56"

### Merge and reload model

In [10]:
lora_adapter_path = f"/home/wirl/ytc/要寫的論文研究/code/results/{lora_adapter_name}"
save_tuned_model_path = "/home/wirl/ytc/要寫的論文研究/code/results/chinese-alpaca-2-7b_merged_tuned_model/"

llama2_base_model = AutoModelForCausalLM.from_pretrained(model_name, 
                        device_map={"": "cpu"}, 
                        torch_dtype=torch.float16)

model = PeftModel.from_pretrained(llama2_base_model, 
                        lora_adapter_path, 
                        torch_dtype=torch.float16, 
                        device_map={"": "cpu"})

del llama2_base_model

merged_model = model.merge_and_unload()

merged_model.save_pretrained(save_tuned_model_path)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
del merged_model
del model

In [12]:
# reload model from merged model path
model = AutoModelForCausalLM.from_pretrained(
    save_tuned_model_path,
    device_map="auto",
    load_in_4bit=True
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [22]:
def get_response(model, prompt_template, sentence_text, remove_input=True):
    device = "cuda:0"
    full_prompt = prompt_template.format(sentence_text)

    inputs = tokenizer(full_prompt, return_tensors="pt").to(device)

    outputs = model.generate(**inputs, max_new_tokens=len(sentence_text))
    # outputs = model.generate(**inputs, max_new_tokens=len(sentence_text), do_sample=True, top_k=30, top_p=0.95, num_return_sequences=1) # for translation, https://huggingface.co/docs/transformers/tasks/translation#inference

    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

    if remove_input:
        # 從 generate 出來的 output 中刪除 input text 的部分
        cleaned_output = decoded_output.replace(full_prompt, "")
        return cleaned_output
    else:
        return decoded_output

In [23]:
# load 完之後希望是可以翻譯的
sentence = "我国古代品评书画艺术的三个等级，即神品、妙品、能品。"
sentence1 = "台中后里位在台湾南北的交会点，隐藏着许多全国知名的景点。"
sentence2 = "我干什么不干你事。"
sentence3 = "我发现太后的头发很干燥。"
sentence4 = "芋头发芽了。"
sentence5 = "再坐在电脑前面 我头发都没了T_T。"
sentence6 = "他觉得丑时人生的通常都比较丑。"

translate_prompt_template = """
### Instruction:
翻譯成繁體中文: {}
### Response:
"""
device = "cuda:0"

print(get_response(model, translate_prompt_template, sentence))
print('---')
print(get_response(model, translate_prompt_template, sentence1))
print('---')
print(get_response(model, translate_prompt_template, sentence2))
print('---')
print(get_response(model, translate_prompt_template, sentence3))
print('---')
print(get_response(model, translate_prompt_template, sentence4))
print('---')
print(get_response(model, translate_prompt_template, sentence5))
print('---')
print(get_response(model, translate_prompt_template, sentence6))
print('---')

我國古代品評書畫藝術的三個等級，即神品、妙品、能品。
##
---
台中后里位在台湾南北的交会点，隐藏着许多全国知名的景点。
### Input:
翻
---
幹什麼不幹你的事。
##
---
我發現太后的頭髮很乾燥。

---
芋头发芽了。

---
再坐在電腦前面我頭髮都沒了T_T。
### Input
---
他覺得醜時人生的通常比較醜。
### Inst
---


In [14]:
sentence = "AI是什麼"
prompt = f'''
### Instruction:
{sentence}
### Response:
'''

print("*** Generate:")

input_ids = tokenizer(prompt, return_tensors='pt').input_ids.cuda()
output = model.generate(inputs=input_ids, do_sample=True, top_k=30, top_p=0.95, max_new_tokens=20)
print(tokenizer.decode(output[0]))

*** Generate:
<s>
### Instruction:
AI是什麼
### Response:
**人工智能（Artificial Intelligence，简称AI）** 指的是计算机系统具有人类智能


In [15]:
sentence = "你好 你是誰"
prompt = f"""
### Instruction:
{sentence}
### Response:
"""
device = "cuda:0"

inputs = tokenizer(prompt, return_tensors="pt").to(device)
outputs = model.generate(
    **inputs,
    do_sample=True, top_k=30, top_p=0.95,
    max_new_tokens=20
)

decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

# 從 generate 出來的 output 中刪除 input text 的部分
cleaned_output = decoded_output.replace(prompt, "").strip()
print(cleaned_output)

### Comment:不,我是一個人類。佷謝問. ��


In [16]:
# TODO: 怎麼拿到測試資料 (參見 onenote 筆記)
test_dataset = load_dataset('parquet', data_files='corpus/_正體(繁體)/zetavgcoct_en-zh-tw-translations-twp-300k.parquet', split='train')

print("---Test dataset---")
print(test_dataset)

Using custom data configuration default-ff1fb7e1820c5a4d
Found cached dataset parquet (/home/wirl/.cache/huggingface/datasets/parquet/default-ff1fb7e1820c5a4d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


---Test dataset---
Dataset({
    features: ['en', 'ch'],
    num_rows: 310916
})


In [31]:
# convert output to pandas dataframe
test_dataset.set_format(type='pandas', columns=['en', 'ch'])

test_df = test_dataset[:]
test_df

Unnamed: 0,en,ch
0,While the China Times Group has decided to dro...,而在中時報系以不堪虧損為由捨棄晚報的同時，另方面卻持續入股中天電視台，並有意在未來收購中視，...
1,The ten years after the war were a golden age ...,終戰後的十餘年間，可說是歌仔戲的黃金時代，人才輩出，除了活躍於戲院舞台的「內台戲」外，還有「...
2,"Civilian art, which is characterized by its no...",「國民美術」以非學院派美術基調的發展過程，巧妙地與 1998 年國內社區大學興起的「解放知識...
3,"""Civilian Art is like a harvest festival in wh...",汐止社大主任潘英海表示：「國民美術像集體參與的美術豐年祭，劉秀美在社區成立畫會，建構集體記憶...
4,"For example, Zheng Jionghui's grandfather ran ...",例如日治時期祖父在金瓜石經營「鈔利搗礦場」的鄭炯輝，將當時用水車淘洗金砂的過程，一一用圖畫記...
...,...,...
310911,Who can resist capitalism?,誰來對抗資本主義？
310912,"I was terribly scared! """,在西畫荒漠中披荊斬棘
310913,"I'm scared! """,我怕！」
310914,How subjective and arrogant that is!,多麼主觀傲慢！


In [32]:
import opencc

t2s_converter = opencc.OpenCC('t2s.json')

"""traditional chinese to simplified chinese"""
def convert_TC_to_SC(zhtw_text: str, converter=None):
    if not converter:
        converter = opencc.OpenCC('t2s.json')
    return converter.convert(zhtw_text)

ModuleNotFoundError: No module named 'opencc'

In [34]:
!pip3 install opencc

[0m[31mERROR: Could not find a version that satisfies the requirement opencc (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for opencc[0m[31m
[0m

In [17]:
# TODO: 怎麼讓他有 history
# TODO: 到底什麼是 QLoRA，怎麼實作 => load model in 4bit and merge with lora weights
