When running "python examples/generate_lora_web.py --base_model knowlm-13b-zhixi" for the web-based interactive test, entering an instruction and input and clicking Submit always produces an Error in the output, and the Python terminal exits immediately. A screenshot of the run is shown below: #142
Comments
Hello, please make sure you are using the latest code from the repository. If the error persists, please provide the Gradio version you are using; the Gradio version I use is
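As a generic illustration, not taken from this thread: one way to print the locally installed Gradio version for comparison.
# Generic sketch: report the installed Gradio version.
import gradio
print(gradio.__version__)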
Could you help me with this?
Hello, I have tested this locally several times and still see no problem. From your description, the console exits directly without printing any error; without an error message it is hard to track down the issue for you. Here are some possible suggestions:
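As a generic illustration only, and not one of the suggestions above: Python's built-in faulthandler can surface a traceback when the interpreter exits silently, for example because of a segfault inside a native library.
# Generic debugging sketch, not taken from this thread: enable faulthandler early in the script
# so that a fatal crash prints a traceback before the process dies.
import faulthandler
faulthandler.enable()
# Alternatively, enable it from the command line when launching the web demo:
#   python -X faulthandler examples/generate_lora_web.py --base_model knowlm-13b-zhixi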
Hello, based on your feedback I found a related issue; it may be a network problem, and you can refer to the suggestions there :)
Hello, regarding the related AUTOMATIC1111/stable-diffusion-webui#9074 you provided
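Since the maintainer suspects a network problem, here is a generic, hypothetical workaround that is sometimes suggested for proxy-related Gradio errors; it is not quoted from issue #9074 or from this thread.
# Hypothetical sketch: clear HTTP(S) proxy variables for the current process before launching
# the Gradio app, in case a local proxy interferes with its internal requests.
import os
for var in ("http_proxy", "https_proxy", "all_proxy", "HTTP_PROXY", "HTTPS_PROXY", "ALL_PROXY"):
    os.environ.pop(var, None)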
Hello, could you help me with this?
Hello, here is my code; it is identical to the code in the repository:
import os
import sys
import fire
import gradio as gr
import torch
import transformers
from peft import PeftModel
from transformers import GenerationConfig, AutoModelForCausalLM, AutoTokenizer
from multi_gpu_inference import get_tokenizer_and_model
from typing import List
from callbacks import Iteratorize, Stream
from prompter import Prompter
from utils import Web
if torch.cuda.is_available():
device = "cuda"
else:
device = "cpu"
try:
if torch.backends.mps.is_available():
device = "mps"
except: # noqa: E722
pass
# model = None
# tokenizer = None
# web_config = None
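# get_kwargs returns the value of each requested argument name from `kwargs`, with None for any missing key.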
def get_kwargs(kwargs, *args):
return [kwargs.get(arg, None) for arg in args]
def main(
load_8bit: bool = True,
load_4bit: bool = False,
base_model: str = None,
model_tag: str = None,
# lora_weights: str = "zjunlp/CaMA-13B-LoRA",
# prompt_template: str = "finetune/lora/knowlm/templates/alpaca.json", # The prompt template to use, will default to alpaca.
server_name: str = "0.0.0.0", # Allows to listen on all interfaces by providing '0.0.0.0'
share_gradio: bool = False,
multi_gpu: bool = False,
allocate: List[int] = None
):
# global model
# global tokenizer
# global web_config
base_model = base_model or os.environ.get("BASE_MODEL", "")
assert (
base_model
), "Please specify a --base_model, e.g. --base_model='huggyllama/llama-7b'"
tokenizer = AutoTokenizer.from_pretrained(base_model)
if device == "cuda":
if multi_gpu:
model, tokenizer = get_tokenizer_and_model(base_model=base_model, dtype="float16", allocate=allocate)
else:
if load_8bit or load_4bit:
device_map = {"":0}
else:
device_map = {"": device}
if load_4bit:
load_8bit = False
model = AutoModelForCausalLM.from_pretrained(
base_model,
load_in_4bit=load_4bit,
load_in_8bit=load_8bit,
torch_dtype=torch.float16,
device_map=device_map,
)
# model = PeftModel.from_pretrained(
# model,
# lora_weights,
# torch_dtype=torch.float16,
# )
elif device == "mps":
model = AutoModelForCausalLM.from_pretrained(
base_model,
device_map={"": device},
torch_dtype=torch.float16,
)
# model = PeftModel.from_pretrained(
# model,
# lora_weights,
# device_map={"": device},
# torch_dtype=torch.float16,
# )
elif device == 'cpu':
model = AutoModelForCausalLM.from_pretrained(
base_model,
torch_dtype=torch.float32,
)
else:
model = AutoModelForCausalLM.from_pretrained(
base_model, device_map={"": device}, low_cpu_mem_usage=True
)
# model = PeftModel.from_pretrained(
# model,
# lora_weights,
# device_map={"": device},
# )
if model_tag is None:
if 'oneke' in base_model.lower():
model_tag = 'oneke'
elif 'knowlm' in base_model.lower():
model_tag = 'zhixi'
else:
assert False, "Please specify the `model_tag`!"
assert model_tag in Web.__SUPPORT_MODEL__
web_config = Web.get_ui(model_tag)
prompter = Prompter(model_name=model_tag)
# unwind broken decapoda-research config
# model.config.pad_token_id = tokenizer.pad_token_id = 0 # pad
# model.config.bos_token_id = tokenizer.pad_token_id = 1
# model.config.eos_token_id = tokenizer.pad_token_id = 2
if not load_8bit and device != "cpu":
model.half() # seems to fix bugs for some users.
model.eval()
if torch.__version__ >= "2" and sys.platform != "win32":
model = torch.compile(model)
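# evaluate() receives the Gradio component values positionally, in the order given by web_config['var_name'].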
def evaluate(
# instruction,
# input=None,
# temperature=0.4,
# top_p=0.75,
# top_k=40,
# num_beams=2,
# max_new_tokens=512,
# repetition_penalty=1.3,
# stream_output=False,
*args
# **kwargs,
):
kwargs = {}
assert len(web_config['var_name']) == len(args), f"{len(web_config['var_name'])} == {len(args)}"
for key, value in zip(web_config['var_name'], args):
kwargs[key] = value
print(kwargs)
generation_var = [
'temperature',
'top_p',
'top_k',
'num_beams',
'max_new_tokens',
'repetition_penalty',
'stream_output'
]
input_var = [
'instruction',
'input',
'schema',
'system_prompt',
]
all_var = input_var + generation_var
instruction, input, schema, system_prompt, \
temperature, top_p, top_k, num_beams, max_new_tokens, repetition_penalty, stream_output = get_kwargs(kwargs, *all_var)
generation_var_args = {}
input_var_args = {}
for cur_var in [(generation_var, generation_var_args), (input_var, input_var_args)]:
_cur_var_list, _cur_arg = cur_var
for var_name in _cur_var_list:
if var_name in kwargs:
_cur_arg[var_name] = kwargs.pop(var_name)
prompt = prompter.generate_prompt(**input_var_args)
inputs = tokenizer(prompt, return_tensors="pt")
input_ids = inputs["input_ids"].to(device)
generation_config = GenerationConfig(
**generation_var_args
# temperature=temperature,
# top_p=top_p,
# top_k=top_k,
# num_beams=num_beams,
# repetition_penalty=repetition_penalty,
# **kwargs,
)
generate_params = {
"input_ids": input_ids,
"generation_config": generation_config,
"return_dict_in_generate": True,
"output_scores": True,
"max_new_tokens": max_new_tokens,
}
if stream_output:
# Stream the reply 1 token at a time.
# This is based on the trick of using 'stopping_criteria' to create an iterator,
# from https://github.com/oobabooga/text-generation-webui/blob/ad37f396fc8bcbab90e11ecf17c56c97bfbd4a9c/modules/text_generation.py#L216-L243.
def generate_with_callback(callback=None, **kwargs):
kwargs.setdefault(
"stopping_criteria", transformers.StoppingCriteriaList()
)
kwargs["stopping_criteria"].append(
Stream(callback_func=callback)
)
with torch.no_grad():
model.generate(**kwargs)
def generate_with_streaming(**kwargs):
return Iteratorize(
generate_with_callback, kwargs, callback=None
)
with generate_with_streaming(**generate_params) as generator:
for output in generator:
# new_tokens = len(output) - len(input_ids[0])
decoded_output = tokenizer.decode(output[1:]) # # `1:` means that we need to filter bos token
if output[-1] in [tokenizer.eos_token_id]:
break
yield prompter.get_response(decoded_output)
return # early return for stream_output
# Without streaming
with torch.no_grad():
generation_output = model.generate(
input_ids=input_ids,
generation_config=generation_config,
return_dict_in_generate=True,
output_scores=True,
max_new_tokens=max_new_tokens,
eos_token_id=tokenizer.eos_token_id
)
s = generation_output.sequences[0]
output = tokenizer.decode(s[1:]) # `1:` means that we need to filter bos token
yield prompter.get_response(output)
gr.Interface(
fn=evaluate,
inputs=web_config['components'],
# inputs=[
# gr.components.Textbox(
# lines=2,
# label="Instruction",
# placeholder="<请在此输入你的问题>",
# ),
# gr.components.Textbox(
# lines=2,
# label="Input",
# placeholder="<可选参数>",
# ),
# gr.components.Slider(
# minimum=0, maximum=1, value=0.4, label="Temperature"
# ),
# gr.components.Slider(
# minimum=0, maximum=1, value=0.75, label="Top p"
# ),
# gr.components.Slider(
# minimum=0, maximum=100, step=1, value=40, label="Top k"
# ),
# gr.components.Slider(
# minimum=1, maximum=4, step=1, value=2, label="Beams"
# ),
# gr.components.Slider(
# minimum=1, maximum=2000, step=1, value=512, label="Max tokens"
# ),
# gr.components.Slider(
# minimum=1, maximum=2, step=0.1, value=1.3, label="Repetition Penalty"
# ),
# gr.components.Checkbox(label="Stream output"),
# ],
outputs=[
gr.Textbox(
lines=5,
label="Output",
)
],
title=web_config['title'],
description=web_config['description'],
).queue().launch(server_name="0.0.0.0", share=share_gradio)
if __name__ == "__main__":
fire.Fire(main)
"""
# multi-gpu
CUDA_VISIBLE_DEVICES=0,1,2,3 python examples/generate_lora_web.py --base_model zjunlp/knowlm-13b-zhixi --multi_gpu --allocate [5,10,8,10]
# single-gpu
CUDA_VISIBLE_DEVICES=0,1,2,3 python examples/generate_lora_web.py --base_model zjunlp/knowlm-13b-zhixi
# testing oneke
CUDA_VISIBLE_DEVICES=0,1 python examples/generate_lora_web.py --base_model zjunlp/oneke --multi_gpu --allocate [16,16]
# testing zhixi
CUDA_VISIBLE_DEVICES=0,1 python examples/generate_lora_web.py --base_model zjunlp/zhixi --multi_gpu --allocate [16,16]
""" 关于第二个问题,应该和torch无关,因为您执行 |
You could try changing line 285's
Hello, here is a minimal piece of code; please check whether it runs correctly:
import gradio as gr
import time
def generate(inputs: str):
return inputs + f" {time.ctime()}"
def main():
gr.Interface(
fn=generate,
inputs=[
gr.components.Textbox(
lines=2,
label="Instruction",
placeholder="<请在此输入你的问题>",
),
],
outputs=[
gr.Textbox(
lines=5,
label="Output",
)
]
).queue().launch()
if __name__ == '__main__':
main()
Has your problem been resolved?
It still hasn't been resolved. Sorry to bother you.
Can you help me?
My problem still hasn't been solved. Hello?
Please try this code:
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# @torch.no_grad()
def generate_response(instruction, text="", temperature=1.0, top_p=0.9, top_k=50, num_beams=1, max_new_tokens=50, repetition_penalty=1.0):
with torch.no_grad():
if text != "":
input_text = f"Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{text}\n\n### Response:\n"
else:
input_text = f"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:\n"
input_ids = tokenizer.encode(input_text, return_tensors='pt').to('cuda')
output_ids = model.generate(
input_ids,
max_length=input_ids.shape[1] + max_new_tokens,
temperature=temperature,
top_k=top_k,
top_p=top_p,
num_beams=num_beams,
repetition_penalty=repetition_penalty,
)
output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
return output_text[len(input_text):]
if __name__ == '__main__':
model_name = "zjunlp/knowlm-13b-zhixi"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
model_name, torch_dtype=torch.bfloat16, device_map="auto",
load_in_8bit=True
)
interface = gr.Interface(
fn=generate_response,
inputs=[
gr.Textbox(label="Instruction", placeholder="Enter instruction here...", lines=2, value="""从给定的文本中提取出可能的实体和实体类型,可选的实体类型为['地点', '人名'],以(实体,实体类型)的格式回答。"""),
gr.Textbox(label="Optional Text", placeholder="Enter optional text here...", lines=2, optional=True, value="""John昨天在纽约的咖啡馆见到了他的朋友Merry。他们一起喝咖啡聊天,计划着下周去加利福尼亚(California)旅行。他们决定一起租车并预订酒店。他们先计划在下周一去圣弗朗西斯科参观旧金山大桥,下周三去洛杉矶拜访Merry的父亲威廉。"""),
gr.Slider(label="Temperature", minimum=0.1, maximum=2.0, value=1.0, step=0.1),
gr.Slider(label="Top p", minimum=0.0, maximum=1.0, value=0.9, step=0.01),
gr.Slider(label="Top k", minimum=0, maximum=100, value=50, step=1),
gr.Slider(label="Number of Beams", minimum=1, maximum=10, value=1),
gr.Slider(label="Max New Tokens", minimum=1, maximum=512, value=50),
gr.Slider(label="Repetition Penalty", minimum=0.1, maximum=1.6, value=1.0, step=0.1)
],
outputs="text",
title="Zhixi",
description="<center>https://github.com/zjunlp/knowlm</center>"
)
interface.launch()
May I ask whether your problem has been solved yet?
Hello, when I run this code the result is as follows:
===================================BUG REPORT===================================
It looks like a bitsandbytes version problem; the version I am using is
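As a generic illustration, not taken from this thread: one way to check which bitsandbytes release is installed locally before comparing or upgrading it.
# Generic sketch: report the installed bitsandbytes version.
from importlib.metadata import version
print(version("bitsandbytes"))
# If an upgrade is what is needed, it would look like:
#   pip install -U bitsandbytes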
OK, thank you.
Do you have any other questions?