In [38]:
import os, time, re, random, glob, json, jieba, copy
import numpy as np
import pandas as pd
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    default_data_collator,
    TextGenerationPipeline
)

In [2]:
# device="cuda:0"
device="cpu"

In [3]:
CLEAN_TEXT_PATTERN = re.compile(r"[\r\n]")

def clean_text(text):
    return CLEAN_TEXT_PATTERN.sub("", text)

# SFT

In [3]:
import sentencepiece
model_file = "/Users/zeyesun/Documents/Data/models/pangu-350M/vocab.model"
sp = sentencepiece.SentencePieceProcessor()
sp.Load(model_file=model_file)
for i in range(10):
    print(sp.id_to_piece(i))

True

In [20]:
# model_name_or_path = "D:\\Data\\models\\pangu_2_6B"
# model_name_or_path = "D:\\Data\\models\\pangu-350M"
# model_name_or_path = "/Users/zeyesun/Documents/Data/models/pangu-350M"
model_name_or_path = "/Users/zeyesun/Documents/Data/models/pangu-2.6B"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
tokenizer.add_special_tokens({
    'unk_token': "<unk>", 
    'eos_token': "<eot>", 
    'pad_token': "<pad>", 
    "sep_token": "<sep>"
})

Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.


0

### Data Processing

In [92]:
# weibo_summary_comments_json
t = time.time()
fi = "/Users/zeyesun/Documents/Data/raw/weibo_summary_comments_json.json"
fo = "/Users/zeyesun/Documents/Data/chatgpt/processed/weibo_summary_comments.jsonl"
ct = 0
with open(fo, "w", encoding="utf-8") as w:
    with open(fi, "r", encoding="utf-8") as r:
        while True:
            line = r.readline()
            if not line:
                break
            
            item = json.loads(line.strip("\n"))
            article = item['article'].replace(" ", "")
            abstract = item['abstract'].replace(" ", "")
            prompt = f"新闻内容：{article}{tokenizer.sep_token}摘要：{abstract}{tokenizer.sep_token}评论："
            answers = [
                {
                    "answer": k.replace(" ", ""), 
                    "score": int(v)
                } for (k, v) in sorted(item['comments'], key=lambda x: (int(x[1]), len(x[0])), reverse=True)
            ]
            w.write(json.dumps({"prompt": prompt, "answers": answers}, ensure_ascii=False)+'\n')
            ct += 1
print(f"length: {ct}, time taken: {time.time()-t} s")

length: 894732, time taken: 68.99229574203491 s


In [97]:
# couplets
t1 = time.time()
fi = "/Users/zeyesun/Documents/Data/raw/couplets.txt"
fo = "/Users/zeyesun/Documents/Data/chatgpt/processed/couplets.jsonl"
l2 = []
nexts = dict()
with open(fi, "r", encoding="utf-8") as r:
    while True:
        line = r.readline()
        if not line:
            break
        line = line.strip("\n")
        idx = len(line) // 2
        prompt = line[:idx]
        answer = line[idx+1:]
        answers = [{"answer": answer, "score": 1}]
        l2.append({"prompt": f"上联：{prompt}{tokenizer.sep_token}下联：", "answers": answers})
        length = len(answer)
        if length not in nexts:
            nexts[length] = list()
        nexts[length].append(answer)
t2 = time.time()
print(f"length: {len(l2)}, # different lengths: {len(nexts)}, time taken: {t2-t1} s")
with open(fo, "w", encoding="utf-8") as w:
    for i, l in enumerate(l2):
        answer = l['answers'][0]
        length = len(answer['answer'])
        nexts_tmp = copy.deepcopy(nexts[length])
        nexts_tmp.remove(answer['answer'])
        false_answers_1 = [{"answer": fa, "score": 0} for fa in random.sample(nexts_tmp, 2)]
        keys = set(nexts.keys())
        keys.remove(length)
        false_answers_2 = [{"answer": random.choice(nexts[key]), "score": -1} for key in random.sample(keys, 2)]
        answers = [answer] + false_answers_1 + false_answers_2
        answers = sorted(answers, key=lambda x: x['score'], reverse=True)
        w.write(json.dumps({"prompt": l['prompt'], "answers": answers}, ensure_ascii=False)+'\n')
        if i % 1000 == 0:
            print(f"{i} samples processed, time taken: {time.time()-t2} s")
print(f"length: {len(l2)}, time taken: {time.time()-t2} s")

length: 774491, # different lengths: 32, time taken: 3.362332820892334 s


In [101]:
# zhidao
t = time.time()
fp = "/Users/zeyesun/Documents/Data/raw/zhidao/*.csv"
fo = "/Users/zeyesun/Documents/Data/chatgpt/processed/zhidao.jsonl"
ct = 0
with open(fo, "w", encoding="utf-8") as w:
    for fi in glob.glob(fp):
        ct = 0
        df = pd.read_csv(fi).sort_values(by=["title", "is_best"], ascending=False)
        prev_title = None
        prev_prompt = None
        for _, val in df.iterrows():
            if isinstance(val['question'], str) and val['question'] != val['title']:
                prompt = f"问题：{val['title']}{tokenizer.sep_token}内容：{val['question']}{tokenizer.sep_token}回答："
            else:
                prompt = f"问题：{val['title']}{tokenizer.sep_token}回答："
            if prev_title is not None and prev_title == val['title']:
                answers.append({"answer": val['reply'], "score": val['is_best']})
            else:
                if prev_title is not None:
#                     l3.append({"prompt": prev_prompt, "answers": copy.deepcopy(answers)})
                    w.write(json.dumps({"prompt": prev_prompt, "answers": answers}, ensure_ascii=False)+'\n')
                answers = [{"answer": val['reply'], "score": val['is_best']}]
            prev_prompt = prompt
            prev_title = val['title']
            ct += 1
#         l3.append({"prompt": prev_prompt, "answers": copy.deepcopy(answers)})
        w.write(json.dumps({"prompt": prev_prompt, "answers": answers}, ensure_ascii=False)+'\n')
        print(f"finished processing {os.path.basename(fi)}")
print(f"length: {ct}, time taken: {time.time()-t} s")

finished processing financezhidao_filter.csv
finished processing liantongzhidao_filter.csv
finished processing touzizhidao_filter.csv
finished processing nonghangzhidao_filter.csv
finished processing baoxianzhidao_filter.csv
finished processing anhuidianxinzhidao_filter.csv
finished processing lawzhidao_filter.csv
length: 36368, time taken: 127.75226378440857 s


In [102]:
f = "/Users/zeyesun/Documents/Data/raw/yf_amazon/categories.csv"
dfc = pd.read_csv(f)
f = "/Users/zeyesun/Documents/Data/raw/yf_amazon/ratings.csv"
dfr = pd.read_csv(f)
f = "/Users/zeyesun/Documents/Data/raw/yf_amazon/products.csv"
dfp = pd.read_csv(f)

In [111]:
dfc['category'].unique().tolist()

['商务皮鞋',
 '小说',
 '戏剧/综艺',
 '十字绣',
 '针织衫',
 '天然琥珀/珍珠/玉石',
 '瑜伽包',
 '急救护理用品',
 '男士单肩包',
 '包装设备',
 '其它户外用品',
 '其他运动器材',
 '施华洛世奇/其他水晶',
 '足球鞋',
 '凉拖',
 '防盗/报警/监控器材及系统',
 '方便/速食/真空即食',
 '电子词典/文曲星',
 '水上用品',
 '手镯',
 '高尔夫球相关',
 '连身裤',
 '程序/软件开发',
 '暖贴/热水袋',
 '笔记本/平板配件',
 '沐浴用品',
 '男士防晒',
 '宠物相关',
 '户外其他',
 '卡通/动漫',
 '数据传输',
 '摄影摄像',
 '电子计步器',
 '明星纪念品/主题系列收藏',
 '睡衣/睡袍/家居服',
 '手巾/口水巾/三角巾',
 '鞋盒',
 '牛初乳',
 '其它用品',
 '奶粉',
 '门窗',
 '鱼类及其用品',
 '功能眼镜',
 '计算机/网络',
 '沙发类',
 '亲子/家教',
 '摩托车相关',
 '民族拉弦乐器',
 '室内休闲运动',
 '牌/章/卡/票',
 '婴幼宝贝服饰',
 '单鞋',
 '家用理疗仪',
 '挂饰/壁饰',
 '手机耳机',
 '染发膏',
 '猫/狗梳理美容',
 '其他',
 '男士斜挎包/胸包',
 '镜头',
 '牙刷',
 '电动/遥控玩具',
 '空气净化/氧吧',
 '婴儿安全用品',
 '舞台影音',
 '贺卡/明信片',
 '其他家居饰品',
 '食品/保健',
 '速溶咖啡/咖啡豆/粉',
 '电工配件',
 '板鞋',
 '足浴器',
 '家庭/家居',
 '泳镜/泳帽/其它泳具',
 '数码贴膜',
 '洗洁精/油污清洁',
 '拖把',
 '睡袋',
 '地毯/地垫',
 '运动包',
 '台球相关',
 '鱿鱼/鱼干/海味',
 '锅刷/除油刷',
 '婴童/青少年家具',
 '游戏代练',
 '其他饰品相关',
 '历史',
 '电火锅/电蒸锅',
 '打底/保暖内衣',
 '电脑包',
 '运动球衣',
 '肉松/鱼松',
 '丝袜/连裤袜/打底裤',
 '女性器具',
 '女士单肩包',
 '胸部护理',
 '民族弹拨乐器',
 '洗液',
 '烫衣板

In [113]:
# i = 0
# for cat_id in dfp.iloc[i]['catIds'].split(","):
#     print(dfc[dfc['catId']==int(cat_id)]['category'])
dfr.head(5)

Unnamed: 0,userId,productId,rating,timestamp,title,comment
0,15905.0,452609,5.0,1380988800,很喜欢,"很好很强大,纸张超赞不是一般画册所能比拟的,图片很好,特点基本都表现了出来。物种很全"
1,94522.0,452609,5.0,1333123200,精品!,"买过很多本DK的百科全书类图书,这本书是我买过的最经典的一本!很厚的一本书,装帧很漂亮!内容..."
2,317087.0,452609,5.0,1346688000,不错!!!!!!!!!!,"没想到送货的时候正好是下雨天,而且书没有塑料包装。 但是送货人员还是很小心的,虽然外面的纸箱..."
3,1329103.0,452609,5.0,1386518400,很棒的一本书!,
4,502593.0,452609,5.0,1347638400,很值的一本书,"是本很值得收藏的书。 本身DK的eyewitness系列就很值得收藏,这套书都秉承着科普、图..."


In [114]:
dfp.head(34)

Unnamed: 0,productId,name,catIds
0,0,CSSMs Biology: Control in Cells and in Organis...,832476923
1,1,Treasure Island,83211011086
2,2,Collins Primary Dictionaries – Collins Junior ...,832476739
3,3,Partners in Crime,832476923
4,4,The Hobbit,83211011086
5,5,Caps for Sale Book and CD,832476923
6,6,The Gulag Archipelago Volume 3: An Experiment ...,832476923
7,7,150 Best Bathroom Ideas,832476923
8,8,The Forgotten Warrior,832476923
9,9,Free Fall,83211011086


In [106]:
cat_id

'923'

In [58]:
# yf_amazon
t1 = time.time()
f = "/Users/zeyesun/Documents/Data/raw/yf_amazon"
l2 = []
nexts = dict()
with open(f, "r", encoding="utf-8") as r:
    while True:
        line = r.readline()
        if not line:
            break
        line = line.strip("\n")
        idx = len(line) // 2
        prompt = line[:idx]
        answer = line[idx+1:]
        answers = [(answer, 1)]
#         answers = [(k.replace(" ", ""), int(v)) for (k, v) in sorted(item['comments'], key=lambda x: (int(x[1]), len(x[0])), reverse=True)]
        l2.append({"prompt": f"上联：{prompt}{tokenizer.sep_token}下联：", "answers": answers})
        length = len(answer)
        if length not in nexts:
            nexts[length] = list()
        nexts[length].append(answer)
t2 = time.time()
print(f"length: {len(l2)}, # different lengths: {len(nexts)}, time taken: {t2-t1} s")
for i, l in enumerate(l2):
    answer = l['answers'][0]
    length = len(answer[0])
    nexts_tmp = copy.deepcopy(nexts[length])
    nexts_tmp.remove(answer[0])
    false_answers_1 = [(fa, 0) for fa in random.sample(nexts_tmp, 2)]
    keys = set(nexts.keys())
    keys.remove(length)
    false_answers_2 = [(random.choice(nexts[key]), -1) for key in random.sample(keys, 2)]
    answers = [answer] + false_answers_1 + false_answers_2
    l['answers'] = sorted(answers, key=lambda x: x[1], reverse=True)
    if i % 1000 == 0:
        print(f"{i} samples processed, time taken: {time.time()-t2} s")
print(f"length: {len(l2)}, time taken: {time.time()-t2} s")

length: 774491, # different lengths: 32, time taken: 4.948148012161255 s


In [84]:
df.tail(10)

Unnamed: 0,title,question,reply,is_best
671618,"""8.#00,00&#59#44;""欧元是多少人民币",,按今日汇率#200欧元=60#44人民币,0
726683,"""8###7""是招行什么号",,您好，烦请进一步说明您的问题，以便我们帮您核查，若是指储蓄卡，目前我行较新发行的储蓄卡并没有...,1
413028,"""2017年信用卡申请哪家好",,若申请招行信用卡，当地有招行，18-60周岁，若可以提供身份证，工作证明和收入证明资料，可以...,1
565634,重*贷款哪家值得信赖？,有人知道吗,惠贷网很正规,1
565635,重*贷款哪家值得信赖？,有人知道吗,若您所在城市有招行，可通过招行网点尝试申请贷款，各贷款项目所需条件及申请材料有所不同，请您在...,0
565514,请问重庆哪里有无抵押贷款？,求详细的解答,银行贷款需要的条件：（1）年满18年周岁的具有完全民事行为能力、城镇居民常住户口或合法有效的...,1
565515,请问重庆哪里有无抵押贷款？,求详细的解答,宜信普惠的信用贷款，就是无抵押无担保的,0
565516,请问重庆哪里有无抵押贷款？,求详细的解答,贷款方法银行和其他公司都可以，主要看你的资质,0
528003,介绍几家保本型的网络理财产品？,能详细说下嘛,您好，若通过招行购买，请您登陆我行主页，点击理财产品(网页中间），查询一下，哪款理财产品比较...,1
528004,介绍几家保本型的网络理财产品？,能详细说下嘛,最近利息又降了,0


In [64]:
df.sort_values(by=["title", "is_best"], ascending=False, inplace=True)

In [68]:
df[~df['question'].isna()].iloc[100:150]

Unnamed: 0,title,question,reply,is_best
3034,魅蓝note5如何修改微信号第二次,魅蓝note5如何修改微信号第二次,您好，微信号只有一次设置的机会，您成功设置好微信号后是没有二次修改的机会的，如果需要修改微信...,1
55126,魅蓝note5如何使用双卡,魅蓝note5如何使用双卡,该机支持双卡双待，只需要您同时安装两张手机卡，然后即可正常使用双卡了。,1
102246,魅蓝note5在哪设置来电显示啊,魅蓝note5在哪设置来电显示啊,你好来电显示是运营商的一个业务，手机卡都有来显业务的，开通来显业务的手机卡在接到电话是手机上...,1
92221,魅蓝note5可以放几张SIM,魅蓝note5可以放几张SIM,该机是全网通版本双卡机型。支持使用两张手机卡。,1
119933,魅蓝note5只有听筒有外放,手机只有上方的可以放出声音，下方无声音，请问这是坏了吗？,"建议您在手机的声音选项下设置一下媒体音量。如果开启媒体音量后,还是没有声音,则可能是手机的扬...",1
119934,魅蓝note5只有听筒有外放,手机只有上方的可以放出声音，下方无声音，请问这是坏了吗？,建议您在手机的声音选项下设置一下媒体音量。如果开启媒体音量后，还是没有声音，则可能是手机的扬...,0
79843,魅蓝note5删除的缓存怎么恢复,魅蓝note5删除的缓存怎么恢复,已经删除的文件或者是缓存是无法恢复的，建议将重要的信息数据文件提起备份，防止误删或者是其他状...,1
79844,魅蓝note5删除的缓存怎么恢复,魅蓝note5删除的缓存怎么恢复,您好，已经删除的缓存是无法进行恢复的,0
36013,魅蓝note5不支持其他耳机接入吗,我插入苹果耳机，听音乐只有曲子，歌词微弱，看电影只有配乐对白几乎听不到，咋回事？,1、魅蓝note5的耳机孔会有点紧，需要用户全部插进去才可以正常使用的。2、非原装耳机出现的...,1
36014,魅蓝note5不支持其他耳机接入吗,我插入苹果耳机，听音乐只有曲子，歌词微弱，看电影只有配乐对白几乎听不到，咋回事？,该机支持所有3.5mm插口的耳机的您的这个情况，请检查手机的声音是设置，应该是声道设置问题。,0


In [6]:
# f = "/Users/zeyesun/Documents/Data/raw/baike_qa2019/baike_qa_train.json"
f = "D:\\Data\\raw\\baike_qa_train.json"
items = []
lens_prompt = []
lens_label = []
with open(f, "r", encoding="utf-8") as r:
    while True:
        line = r.readline()
        if not line:
            break
        item = json.loads(line.strip("\n"))
        prompt = clean_text(item['title'] if len(item['title']) > len(item['desc']) else item['desc'])
        label = clean_text(item['answer'])
        items.append(item)
        lens_prompt.append(len(prompt))
        lens_label.append(len(label))
print(len(items))
print(np.percentile(lens_prompt, np.arange(90, 101)))
print(np.percentile(lens_label, np.arange(90, 101)))

1425170


### prediction

In [5]:
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True, use_cache=False)
model.resize_token_embeddings(len(tokenizer.sp))
model.config.end_token_id = tokenizer.eos_token_id
model.config.pad_token_id = tokenizer.pad_token_id
# model.config.max_length_prompt = 200
model.to(device)
# print(model.device)

Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.


GPTPanguForCausalLM(
  (transformer): GPTPanguModel(
    (wte): Embedding(40000, 2560)
    (wpe): Embedding(1024, 2560)
    (wqe): Embedding(1024, 2560)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPTPanguBlock(
        (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (attn): GPTPanguAttention(
          (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (c_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (mlp): GPTPanguMLP(
          (c_fc): Linear(in_features=2560, out_features=10240, bias=True)
          (c_proj): Linear(in_features=10240, out_featur

In [19]:
# checkpoint_files = "D:\\Data\\output\\sft\\pangu-350M\\checkpoint-12000\\pytorch_model.bin"
# checkpoint_files = "/Users/zeyesun/Documents/Data/output/sft/pangu-350M/checkpoint-12000/pytorch_model.bin"
checkpoint_files = "/Users/zeyesun/Documents/Data/output/sft/pangu-2.6B/checkpoint-9000/pytorch_model*.bin"
checkpoints = glob.glob(checkpoint_files)
st = dict()
for checkpoint in checkpoints:
    st.update(torch.load(checkpoint, map_location="cpu"))
model.load_state_dict(st)

<All keys matched successfully>

In [20]:
text_generator = TextGenerationPipeline(model, tokenizer, device=device)

The model 'GPTPanguForCausalLM' is not supported for . Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'GPT2LMHeadModel', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegatronBertForCausalLM', 'MvpForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PLBartForCausalLM', 'ProphetNetForCausalLM', 'QDQBertLMHeadModel', 'ReformerModelWithLMHead', 'RemBertForCausalLM', 'RobertaForCausalLM', 'RoFormerForCausalLM', 'Speech2Text2ForCausalLM', 'TransfoXLLMHeadModel', 'TrOCRForCausalLM', 'XGLMForCausalLM', 'XLMWithLMHeadModel', 'XLMProphetNetForCausalLM', 'XLMRobertaForCausalLM', 'XLMRobertaX

In [8]:
# f = "/Users/zeyesun/Documents/Data/raw/baike_qa2019/baike_qa_train.json"
f = "D:\\Data\\raw\\baike_qa_train.json"
i = 0
prompts = []
prompts_processed = []
labels = []
with open(f, "r", encoding="utf-8") as r:
    while True:
        line = r.readline()
        if not line:
            break
        item = json.loads(line.strip("\n"))
        prompt = clean_text(item['title'] if len(item['title']) > len(item['desc']) else item['desc'])
        label = clean_text(item['answer'])
        prompt_processed = prompt + tokenizer.sep_token + "模型回答:"
        prompts.append(prompt)
        prompts_processed.append(prompt_processed)
        labels.append(label)
        i += 1
        if i > 1000:
            break

In [14]:
num_return_sequences = 5
i = 10
j = 20
t1 = time.time()
results = text_generator(prompts_processed[i:j], max_length=200, num_return_sequences=num_return_sequences,
                         do_sample=True, top_k=50, temperature=10.0)
print(f"Finished prediction, time taken: {time.time()-t1}")

for prompt, res, label in zip(prompts[i:j], results[:(j-i)], labels[i:j]):
    print(f"prompt: {prompt}\nlabel: {label}")
    for k in range(num_return_sequences):
        model_answer = res[k]['generated_text'].split("模型回答:")[1].replace("<eot>", "").replace("<pad>", "")
        print(f"model answer-{k}: {model_answer}")
    print("\n\n")

prompt: 有6区无尽之海众神之子工会的吗?你们会怎么了?看不见人了,解散了 
label: 我听原众神成员说 众神的副会长 卷了工会财产跑路了(几千W呢)  所以导致工会解散 哎 真是人心陷恶   
model answer-0: 我也是无尽之海的,现在工会还没解散,是不可能解散了,不过我想你应该有很多个工会吧,应该有很多人在等着你,如果你有足够的耐心的话,可以去工会看看有多少人在等着你,我有个朋友就在等着我,我说的是在无尽之海,你知道的,要是没人的话,你就去工会看看,反正现在工会也没解散,你可以去公会看看,我也是在无尽之海的,在无尽之海也有很多人在等着你,要是你能坚持的话,我想你会更好的!结舌
model answer-1: 现在工会也解散了,是吧··タタタ瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄瞄
model answer-2: 你可以去看看万盛万盛万盛万盛万盛万盛万盛万盛万盛万盛万盛万盛万盛万盛万盛万盛万盛NdF万盛NdF
model answer-3: 不解散了,我是6区无尽之海的,我想知道,如果解散的话,工会的人和工会的人一起走,会怎么样?方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒方寒
model answer-4: 有,因为工会的人太多了,服务器都快封了。タ榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫榫



prompt: 老师好，

In [21]:
prompt = "内有吊车，牛腿高度怎么定？根据什么来定？生产的工艺要求？还是要考虑到其他因素？"
prompt_processed = prompt + tokenizer.sep_token + "模型回答:"
num_return_sequences=5
res = text_generator(prompt_processed, max_length=100, num_return_sequences=num_return_sequences,
                         do_sample=True, top_k=50, temperature=10.0)
print(f"prompt: {prompt}")
for i in range(num_return_sequences):
    model_answer = res[i]['generated_text'].split("模型回答:")[1].replace("<eot>", "").replace("<pad>", "")
    # print(res)
    print(f"model answer-{i}: {model_answer}")

Building prefix dict from the default dictionary ...
I0224 13:57:47.086069 140704479725184 __init__.py:113] Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/wf/12stcn_s6zq56j9h3fnkv5lm0000gn/T/jieba.cache
I0224 13:57:47.104202 140704479725184 __init__.py:133] Loading model from cache /var/folders/wf/12stcn_s6zq56j9h3fnkv5lm0000gn/T/jieba.cache
Loading model cost 0.565 seconds.
I0224 13:57:47.664287 140704479725184 __init__.py:165] Loading model cost 0.565 seconds.
Prefix dict has been built successfully.
I0224 13:57:47.666184 140704479725184 __init__.py:166] Prefix dict has been built successfully.


prompt: 内有吊车，牛腿高度怎么定？根据什么来定？生产的工艺要求？还是要考虑到其他因素？
model answer-0: 工艺性强了定在1.2的高。生产量要能到2。6车就行不生产也有1.6高度牛只等。一般要求1.0就能很明确可以计算:工艺高到多少比较科学(不能超过3的要考虑3吨以下汽车要超过1.0等要求)
model answer-1: 这是可以确定重量(1米60一米40多啊牛腿不是固定啊呵呵)当然这有很大差别呢主要工艺性决定,如要求速度很高就要设较佳速度来加工...呵呵了。
model answer-2: 牛吊(头向下弯曲至脚尖方向的过程均指同一吊轮形式而分开表示用轮、吊、牛三个量块计量不同头辐。下轮吊运时常由1对,8台1式起重机按一用一吊或两种方式排列配置完成为准
model answer-3: 先定一根长100/1*30厘米钢管和高70公厘米无缝钢管和厚4一13之间2组钢管做“架子钢骨柱基础架板基础或柱的下部混凝土(每只立柱混凝土20到1250加500就足够≥1)柱筋2和圈4根
model answer-4: 楼上有人是傻不隆科型吧.这个很不好设定啊如果把车架(如W5F和OYFL6ZXHB车架有不小)的强度用公式A2F1×hP定好车臂高/m在1和100之最小就足够大这样


# Reward Model

In [9]:
model_name_or_path = "D:\\Data\\models\\pangu-350M"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_cache=False, trust_remote_code=True)
tokenizer.add_special_tokens({'unk_token': "<unk>",
                                  'bos_token': "<s>",
                                  'eos_token': "<eot>",
                                  'pad_token': "<pad>",
                                  "sep_token": "<sep>"})

Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.


0

In [10]:
max_length = 1024
text = "你好，你是谁"
# text = "<|startoftext|>" + text + "<|endoftext|>"
res = tokenizer(text, max_length=max_length, truncation="longest_first", 
          return_tensors="pt", add_special_tokens=False)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\SUNZEY~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.501 seconds.
Prefix dict has been built successfully.


In [19]:
res.keys()
# torch.cat((res['input_ids'], res['input_ids']), axis=1)

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])