In [1]:
import torch
import warnings
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM

warnings.filterwarnings("ignore")

PATH = "/home/wirl/ytc/TAIDE-LX-7B-Chat"

# ref: https://huggingface.co/blog/4bit-transformers-bitsandbytes
tokenizer = AutoTokenizer.from_pretrained(PATH, local_files_only=True)
model = AutoModelForCausalLM.from_pretrained(PATH, local_files_only=True, load_in_4bit=True, device_map="auto")

eos_token_id = tokenizer.eos_token_id

if torch.cuda.is_available():
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/wirl/anaconda3/lib/python3.11/site-packages/bitsandbytes/libbitsandbytes_cuda118.so
CUDA SETUP: CUDA runtime path found: /home/wirl/anaconda3/lib/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 118
CUDA SETUP: Loading binary /home/wirl/anaconda3/lib/python3.11/site-packages/bitsandbytes/libbitsandbytes_cuda118.so...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of LlamaForCausalLM were not initialized from the model checkpoint at /home/wirl/ytc/TAIDE-LX-7B-Chat and are newly initialized: ['model.layers.26.self_attn.rotary_emb.inv_freq', 'model.layers.1.self_attn.rotary_emb.inv_freq', 'model.layers.3.self_attn.rotary_emb.inv_freq', 'model.layers.12.self_attn.rotary_emb.inv_freq', 'model.layers.9.self_attn.rotary_emb.inv_freq', 'model.layers.5.self_attn.rotary_emb.inv_freq', 'model.layers.15.self_attn.rotary_emb.inv_freq', 'model.layers.10.self_attn.rotary_emb.inv_freq', 'model.layers.13.self_attn.rotary_emb.inv_freq', 'model.layers.14.self_attn.rotary_emb.inv_freq', 'model.layers.4.self_attn.rotary_emb.inv_freq', 'model.layers.17.self_attn.rotary_emb.inv_freq', 'model.layers.29.self_attn.rotary_emb.inv_freq', 'model.layers.2.self_attn.rotary_emb.inv_freq', 'model.layers.11.self_attn.rotary_emb.inv_freq', 'model.layers.19.self_attn.rotary_emb.inv_freq', 'model.layers.21.self_attn.rotary_emb.inv_freq', 'model.layers.7.self_attn.rota

Using GPU: NVIDIA GeForce RTX 3080 Ti


In [2]:
# translate_prompt_template = """
# ### Instruction:
# 翻譯成繁體中文: {}
# ### Response:
# """
translate_prompt_template = """
翻譯成繁體中文: {}
輸出:
"""

def get_response(tokenizer, model, prompt_template, sentence_text, remove_input=True):
    device = "cuda:0"
    full_prompt = prompt_template.format(sentence_text)

    # temperature, top_p, and top_k are only active when do_sample=True
    # if you set Top-k to 10, the LLM will only consider the 10 most probable next words. 
    # This will result in more fluent text, but it will also reduce the diversity of the text. 
    # TOP_K = 30
    TOP_K = 50
    # TOP_K = 70
    # If you set Top-p to 0.9, the LLM will only generate words that have a probability of at least 0.9. 
    # This will result in more diverse text, but it could also result in less fluent text.
    TOP_P = 0.95
    # TOP_P = 1.0

    # TEMP = 1.0 # don't specify

    inputs = tokenizer(full_prompt, return_tensors="pt").to(device)

    # outputs = model.generate(**inputs, max_new_tokens=len(sentence_text))
    outputs = model.generate(
        **inputs, 
        max_new_tokens=len(sentence_text), 
        do_sample=True, 
        # temperature=TEMP,
        top_k=TOP_K, 
        top_p=TOP_P, 
        num_return_sequences=1,
    ) # for translation, https://huggingface.co/docs/transformers/tasks/translation#inference

    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

    if remove_input:
        # 從 generate 出來的 output 中刪除 input text 的部分
        cleaned_output = decoded_output.replace(full_prompt, "")
        return cleaned_output
    else:
        return decoded_output

org_sentence = "网路的使用，拉近了大众和灾区之间的距离，更让偏远地区的农民有了直接面对消费者的管道。"

model_response = get_response(tokenizer, model, translate_prompt_template, org_sentence).strip() # LLM
print(model_response)

譯文: 網路的使用，拉近了大眾和災區的距離，也讓偏遠地區的農民有直接面對消費者的管道。


In [3]:
# prompt_template = """
# ### Instruction:
# 請分析這句話的立場並給予正面/中立/負面的三種分數: 「{}」
# ### Response:
# 情感分析:
# """
# prompt_template = "{}"
prompt_template = "請分析這句話的立場並給予正面/中立/負面的三種分數: 「{}」 情感分析:"

def get_response(tokenizer, model, prompt_template, sentence_text, remove_input=True):
    device = "cuda:0"
    full_prompt = prompt_template.format(sentence_text)

    # temperature, top_p, and top_k are only active when do_sample=True
    # if you set Top-k to 10, the LLM will only consider the 10 most probable next words. 
    # This will result in more fluent text, but it will also reduce the diversity of the text. 
    # TOP_K = 30
    TOP_K = 50
    # TOP_K = 70
    # If you set Top-p to 0.9, the LLM will only generate words that have a probability of at least 0.9. 
    # This will result in more diverse text, but it could also result in less fluent text.
    TOP_P = 0.95
    # TOP_P = 1.0

    # TEMP = 1.0 # don't specify

    inputs = tokenizer(full_prompt, return_tensors="pt").to(device)

    # outputs = model.generate(**inputs, max_new_tokens=len(sentence_text))
    outputs = model.generate(
        **inputs, 
        max_new_tokens=len(sentence_text), 
        do_sample=True, 
        # temperature=TEMP,
        top_k=TOP_K, 
        top_p=TOP_P, 
        num_return_sequences=1,
    ) # for translation, https://huggingface.co/docs/transformers/tasks/translation#inference

    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

    if remove_input:
        # 從 generate 出來的 output 中刪除 input text 的部分
        cleaned_output = decoded_output.replace(full_prompt, "")
        return cleaned_output
    else:
        return decoded_output

org_sentence = "笑死 林智堅怎麼可能沒有抄襲"
# org_sentence = "你好啊~"

model_response = get_response(tokenizer, model, translate_prompt_template, org_sentence).strip() # LLM
print(model_response)

笑死 林智堅怎麼可能沒有抄襲。
