# 表格内容生成文本实验

## 设定环境信息

In [4]:
import os
import torch
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForCausalLM
)

os.environ["http_proxy"] = "http://10.6.0.17:8888"
os.environ["https_proxy"] = "http://10.6.0.17:8888"

os.environ["CUDA_VISIBLE_DEVICES"] = "3"

print(f"Using GPU is CUDA:{os.environ['CUDA_VISIBLE_DEVICES']}")

for i in range(torch.cuda.device_count()):
    info = torch.cuda.get_device_properties(i)
    print(f"CUDA:{i} {info.name}, {info.total_memory / 1024 ** 2}MB")

Using GPU is CUDA:3
CUDA:0 NVIDIA GeForce RTX 3090, 24252.6875MB


In [22]:
row = {
    "国别": "中国",
    "军种": "海军",
    "单位": "第一军区第二海军基地驱逐舰第三支队",
    "装备": "052C型驱逐舰，1艘；052D驱逐舰，1艘；054A护卫舰，1艘",
    "人数": "365",
    "指挥员": "舰长 张胜利，政委 许强国",
    "地点": "夏威夷附近海域",
    "行动": "环太平洋-2024演习",
    "开始时间": "上午12时00分00秒",
    "任务状态": "持续进行",
    "任务类型": "联合演习",
    "备注": "西安舰；合肥舰；郴州舰"
}

instruct = "使用以下内容生成报告摘要：" + "，".join(list(map(lambda k: row[k], row)))
print(instruct)

组合以下内容：中国，海军，第一军区第二海军基地驱逐舰第三支队，052C型驱逐舰，1艘；052D驱逐舰，1艘；054A护卫舰，1艘，365，舰长 张胜利，政委 许强国，夏威夷附近海域，环太平洋-2024演习，上午12时00分00秒，持续进行，联合演习，西安舰；合肥舰；郴州舰


## 加载模型

In [None]:
### 使用 Baichuan2-7B-Chat

In [None]:
model_path = "cache/models--baichuan-inc--Baichuan2-7B-Chat/snapshots/ea66ced17780ca3db39bc9f8aa601d8463db3da5"

config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    config=config,
    device_map="cuda",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

model.eval()

In [None]:
### 使用 T5_pegasus

In [25]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
tokenizer = AutoTokenizer.from_pretrained("lambdarw/t5_pegasus_ch_ans")
model = AutoModelForCausalLM.from_pretrained("lambdarw/t5_pegasus_ch_ans")

input_ids = tokenizer("生成一篇关于冬天的文章", return_tensors="pt").input_ids
outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

tokenizer_config.json:   0%|          | 0.00/437 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/766 [00:00<?, ?B/s]

You are using a model of type mt5 to instantiate a model of type t5. This is not supported for all configurations of models and can yield errors.


pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

本项目用海类型为“海域用海”,用海方式为“海


## 测试文本生成

In [27]:
input_ids = tokenizer(instruct, return_tensors="pt").input_ids
outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

本项目用海类型为“海域用海”,用海方式为“海


In [None]:
inputs = tokenizer(instruct, return_tensors='pt')
inputs = inputs.to('cuda:0')
pred = model.generate(**inputs, max_new_tokens=64, repetition_penalty=1.1)
print(tokenizer.decode(pred.cpu()[0], skip_special_tokens=True))