In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [2]:
import torch
import json
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForCausalLM,
    GenerationConfig
)
from peft import PeftModel
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model_path = 'cache/models--meta-llama--Meta-Llama-3-8B/snapshots/62bd457b6fe961a42a631306577e622c83876cb6'
# lora_path = 'lora/llama3-8B-iepile-data2text-continue'
model_path = 'cache/models--baichuan-inc--Baichuan2-7B-Chat/snapshots/ea66ced17780ca3db39bc9f8aa601d8463db3da5'
lora_path = 'lora/baichuan7B-data2text-continue'
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

  from .autonotebook import tqdm as notebook_tqdm


[2024-09-10 12:12:17,509] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [3]:
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    config=config,
    device_map="cuda",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
model = PeftModel.from_pretrained(
    model,
    lora_path,
)
model.eval()

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): BaichuanForCausalLM(
      (model): BaichuanModel(
        (embed_tokens): Embedding(125696, 4096, padding_idx=0)
        (layers): ModuleList(
          (0-31): 32 x DecoderLayer(
            (self_attn): Attention(
              (W_pack): Linear(
                in_features=4096, out_features=12288, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=12288, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (o_proj): Linear(
                in_features=4096, out_features=4096, bias=False
       

In [4]:
import gc

del model
gc.collect()
torch.cuda.empty_cache()

In [4]:
file_path = 'data2text/data2text-test-all-v1.json'
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

In [14]:
def inference(data, output_format=True):
    system_prompt = '<<SYS>>\n 你是一个乐于助人的助手。\n<</SYS>>\n\n'
    input_data = {
        "instruction": "你是一个命名实体识别专家。请从input中抽取符合schema描述的实体，如果实体类型不存在就返回空列表，并输出为可解析的json格式。",
        "schema": data[0]['table']['header'],
        "input": data[0]['text']
    }
    sintruct = json.dumps(input_data, ensure_ascii=False)
    # sintruct = '[INST]' + system_prompt + sintruct + '[/INST]\n'
    # sintruct = '[INST] ' + system_prompt + sintruct + ' [/INST]\n### 期望的输出为：\n'
    # sintruct = system_prompt + sintruct + "\n<|endoftext|>"
    sintruct = '<reserved_106>' + system_prompt + sintruct + '<reserved_107>'
    # sintruct = '<|begin_of_text|>' + system_prompt + sintruct
    # sintruct = {{ system_prompt }} + '<|eot_id|>' + '<|start_header_id|>' + sintruct '<|end_header_id|>'

    input_ids = tokenizer.encode(
        sintruct,
        return_tensors="pt",
    ).to(device)
    input_length = input_ids.size(1)

    # 记录推理开始时间

    if output_format:
        with torch.no_grad():
            generation_output = model.generate(
                input_ids=input_ids,
                generation_config=GenerationConfig(
                    max_length=2048,
                    max_new_tokens=512,
                    return_dict_in_generate=True,
                    # do_sample=True,
                    # temperature=1
                ),
                pad_token_id=tokenizer.eos_token_id)
    else:
        with torch.no_grad():
            generation_output = model.generate(
                input_ids=input_ids,
                generation_config=GenerationConfig(
                    max_length=2048,
                    max_new_tokens=512,
                    return_dict_in_generate=True,
                    do_sample=True,
                    top_p=0.95,
                    top_k=0,
                    # temperature=1
                ),
                pad_token_id=tokenizer.eos_token_id)

    generation_output = generation_output.sequences[0]
    generation_output = generation_output[input_length:]
    output = tokenizer.decode(generation_output, skip_special_tokens=True)
    return output

In [7]:
t = inference(data[0], False)
print(t)

{"时间": ["去年五月"], "人物": [], "国家": ["台湾"], "装备名称": ["笔记本电脑"], "装备类型": [], "装备数量": ["1"], "组织机构": ["海军"], "地点": ["台湾"], "任务": [], "行动": [], "事件": []}


In [8]:
def is_valid_json(text):
    try:
        json.loads(text)
        return True
    except ValueError:
        return False

In [9]:
def jiance(number):
    while True:
        t = inference(data[number], False)
        if is_valid_json(t):
            print(t)
            break  # 输出为有效 JSON，跳出循环
        else:
            print(f"!!!输出 {t} 不符合 JSON 格式，重新生成...")

In [15]:
jiance(408)

!!!输出 {"时间": ["10月13日"], "人物": [], "国家": ["美国"], "装备名称": ["小鹰」号, `独立`号], "装备类型": ["航空母舰"], "装备数量": ["2"], "组织机构": [], "地点": ["北海道小樽市的小樽港"], "任务": [], "行动": [], "事件": []} 不符合 JSON 格式，重新生成...
!!!输出 {"时间": ["10月13日"], "人物": [], "国家": ["美国"], "装备名称": ["小鹰」号, «独立」号], "装备类型": ["航空母舰"], "装备数量": ["1"], "组织机构": [], "地点": ["北海道小樽市的小樽港"], "任务": [], "行动": [], "事件": []} 不符合 JSON 格式，重新生成...
{"时间": ["10月13日"], "人物": [], "国家": ["美国"], "装备名称": ["小鹰」号, «独立」号"], "装备类型": ["航空母舰"], "装备数量": ["2"], "组织机构": [], "地点": ["北海道小樽市的小樽港"], "任务": [], "行动": [], "事件": []}


In [10]:
for i in [408, 632, 1178, 1367, 1796, 2137]:
    jiance(i)

{"时间": ["10月13日"], "人物": [], "国家": ["美国"], "装备名称": ["小鹰」号, «独立»号"], "装备类型": ["航空母舰"], "装备数量": ["1"], "组织机构": [], "地点": ["北海道小樽市的小樽港"], "任务": [], "行动": [], "事件": []}
{"时间": ["3月14日", "2014年至2017年"], "人物": [], "国家": [], "装备名称": ["格里戈洛维奇上将号", "11356型护卫舰"], "装备类型": ["护卫舰"], "装备数量": ["1", "6"], "组织机构": ["黑海舰队"], "地点": ["黑海"], "任务": [], "行动": [], "事件": []}
{"时间": ["2014年底之前"], "人物": [], "国家": ["美国"], "主体名称": ["美国陆军", "洛克希德・马丁公司"], "主体类型": ["军队", "公司"], "主体数量": ["2"], "组织机构": [],"地点": [], "行动": [], "事件": []}
{"时间": ["2000年"], "人物": [], "国家": [], "主体名称": ["依维柯・菲亚特-奥托梅来拉联合企业", "克劳斯-玛菲・威格曼公司"], "主体类型": ["企业", "企业"], "主体数量": ["1", "1"], "组织机构": ["意大利国防部"], "地点": [], "行动": [], "事件": []}
{"时间": ["8月12日"], "人物": [], "国家": ["马来西亚"], "主体名称": ["勒基号、卡斯图里号、潜艇"], "主体类型": ["护卫舰、护卫舰、潜艇"], "主体数量": ["2、2、1"], "组织机构": [], "地点": ["南海"], "行动": [], "事件": []}
{"国别": [], "军种": ["陆军"], "装备数量": [], "装备": ["PLZ-45自行榴弹炮"], "地点": ["华东某训练基地"], "行动": [], "开始时间": ["7月22日"], "任务状态": ["进行中"]}


## baichuan有问题的序号
1. 408
2. 632
3. 1178
4. 1367
5. 1796
6. 2137

## llama有问题的序号
1. 299
2. 2133
3. 2137

In [36]:
try:
    a = json.loads(t)
except json.JSONDecodeError as e:
    print("error")

In [7]:
outputs = []

In [None]:
for index, single_data in enumerate(data):
    try:
        inf_output = inference(single_data)
        inf_output = json.loads(inf_output)
        outputs.append(inf_output)
        # print(inf_output)
    except json.JSONDecodeError as e:
        # 如果遇到无效的JSON字符串，打印错误信息并继续处理其他数据
        print(f"无效的JSON位于序号 {index}: {inf_output}，错误信息: {e}")
        while True:

In [31]:
test_file_path = 'data2text/test_useless_llama3.json'

In [32]:
with open(test_file_path, 'w', encoding='utf-8') as file:
    json.dump(outputs, file, ensure_ascii=False, indent=4)

In [29]:
with open('data2text/test_useless.json', 'r', encoding='utf-8') as file:
    data_baichuan = json.load(file)

dicts = []
for sentence in data_baichuan:
    try:
        parsed_dict = json.loads(sentence)
        dicts.append(parsed_dict)
    except json.JSONDecodeError as e:
        raise ValueError(f"JSON decode error: {e} for item: {sentence}")

JSONDecodeError: Expecting value: line 1 column 1 (char 0)