In [None]:
import os, time, re, random, glob, json, jieba, copy
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoModelForMultipleChoice,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    default_data_collator,
    TextGenerationPipeline
)

# Run on the first CUDA GPU when torch can see one, otherwise fall back to the CPU.
device="cuda:0" if torch.cuda.is_available() else "cpu"
from sys import platform

# `root` is the machine-specific data directory; every later path in this
# notebook (models, checkpoints, datasets) is built from it.
if platform.startswith("linux"):
    # linux (also covers the legacy "linux2" value)
    root = "/mnt/sfevol775196/sunzeye273/Data"
#     root = "/mnt/pa002-28359-vol543625-private/Data"
#     root = "/root/autodl-tmp/Data"
elif platform == "darwin":
    # OS X
    root = "/Users/zeyesun/Documents/Data"
elif platform == "win32":
    # Windows...
    root = "D:\\Data"
else:
    # Fail here with a clear message instead of leaving `root` undefined and
    # hitting a NameError in a later cell.
    raise RuntimeError(
        f"Unsupported platform {platform!r}: add a branch that sets `root` for this machine"
    )

In [None]:
# Matches every carriage-return or line-feed character.
CLEAN_TEXT_PATTERN = re.compile(r"[\r\n]")


def clean_text(text):
    """Return `text` with all carriage returns and line feeds removed."""
    return re.sub(CLEAN_TEXT_PATTERN, "", text)

In [None]:
# Select the pretrained model; its files are expected under <root>/models/<model_name>.
# model_name = "pangu-350M"
# model_name = "glm-350M-chinese"
model_name = "chatglm-6B"
model_name_or_path = os.path.join(root, "models", model_name)

tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    use_cache=False,
    trust_remote_code=True,
)

# Dump the special-token configuration for a quick sanity check.
print(tokenizer.special_tokens_map)
print(tokenizer.all_special_ids)
# The pieces are joined with a single space, exactly as print() would separate
# positional arguments.
# (eop / sop / cls ids are not reported here.)
print(" ".join([
    f"unk: {tokenizer.unk_token_id}\n",
    f"pad: {tokenizer.pad_token_id}\n",
    f"bos: {tokenizer.bos_token_id}\n",
    f"eos: {tokenizer.eos_token_id}\n",
    f"sep: {tokenizer.sep_token_id}\n",
    f"mask: {tokenizer.mask_token_id}\n",
]))

In [None]:
# Paths containing "glm" are loaded through the seq2seq auto class; everything
# else is loaded as a causal LM.
if "glm" not in model_name_or_path:
    model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True, use_cache=False)
    # Resize the embedding matrix to the tokenizer's reported vocabulary size.
    model.resize_token_embeddings(tokenizer.vocab_size)
else:
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path, trust_remote_code=True)
    if "chatglm" in model_name_or_path:
        # ChatGLM weights are cast to half precision before moving to the device.
        model = model.half()
model.to(device)
print(model.device)

# SFT Prediction

In [None]:
# Load fine-tuned SFT weights into `model` from one or more checkpoint shards.
# NOTE(review): the path below is hard-coded to a pangu-350M run while
# `model_name` is configured separately; load_state_dict will reject the
# weights if the two do not match — confirm before running.
checkpoint_files = os.path.join(root, "chatgpt", "output", "sft", "pangu-350M", "checkpoint-57043", "pytorch_model*.bin")
# checkpoint_files = os.path.join(root, "chatgpt", "output", "sft", "pangu-2.6B", "external_checkpoint-9000", "pytorch_model*.bin")

# glob returns files in arbitrary order; sort so shards are merged deterministically.
checkpoints = sorted(glob.glob(checkpoint_files))
if not checkpoints:
    # An empty state dict would otherwise surface as a confusing
    # "missing keys" error from load_state_dict.
    raise FileNotFoundError(f"No checkpoint files match {checkpoint_files!r}")

# Merge all shards into a single state dict on the CPU, then load it.
st = dict()
for checkpoint in checkpoints:
    st.update(torch.load(checkpoint, map_location="cpu"))
model.load_state_dict(st)

In [None]:
# Single-prompt generation smoke test for the loaded model.
max_length = 512              # total token budget: prompt + generated tokens
max_length_generation = 50    # new-token budget; recomputed below for GLM-style models
num_return_sequences = 1
top_p = 0.8
temperature = 0.8
# prompt = "写一篇歌颂祖国的文章"
prompt = '今天晚上我在睡觉.........他想要做那些事..我就大大声骂他"不要吵我睡觉"!!!!!...他就跑出去了...还不接我电话'
# prompt = "上联：东风执笔点龙睛，看幸福指数，天天向上"
# prompt = "中美关系进一步恶化"
# prompt = """阅读文章：《战国无双3》（）是由光荣和ω-force开发的战国无双系列的正统第三续作。本作以三大故事为主轴，分别是以武田信玄等人为主的《关东三国志》，织田信长等人为主的《战国三杰》，石田三成等人为主的《关原的年轻武者》，丰富游戏内的剧情。此部份专门介绍角色，欲知武器情报、奥义字或擅长攻击类型等，请至战国无双系列1.由于乡里大辅先生因故去世，不得不寻找其他声优接手。从猛将传 and Z开始。2.战国无双 编年史的原创男女主角亦有专属声优。此模式是任天堂游戏谜之村雨城改编的新增模式。本作中共有20张战场地图（不含村雨城），后来发行的猛将传再新增3张战场地图。但游戏内战役数量繁多，部分地图会有兼用的状况，战役虚实则是以光荣发行的2本「战国无双3 人物真书」内容为主，以下是相关介绍。（注：前方加☆者为猛将传新增关卡及地图。）合并本篇和猛将传的内容，村雨城模式剔除，战国史模式可直接游玩。主打两大模式「战史演武」&「争霸演武」。系列作品外传作品\n问：《战国无双3》是由哪两个公司合作开发的？"""
# prefix = "下联："
prefix = "答："

if "glm" in model_name_or_path:
    # GLM and ChatGLM share the same prompt layout: the prompt as the first
    # segment, then `prefix` followed by the mask token as the second segment.
    encoded_prompt = tokenizer(prompt, prefix + tokenizer.mask_token)
    prompt_length = len(encoded_prompt['input_ids'])
    # Truncate only the prompt segment so the prefix + mask token always survives.
    inputs = tokenizer(prompt, prefix + tokenizer.mask_token,
                       max_length=min(prompt_length, max_length),
                       truncation="only_first",
                       return_tensors="pt",
                       return_token_type_ids=False)
    # Whatever is left of the total budget goes to generation.
    # NOTE(review): this is 0 when the prompt already fills `max_length` — confirm
    # that generate() behaves acceptably in that case.
    max_length_generation = max_length - inputs['input_ids'].shape[1]
    if "chatglm" in model_name_or_path:
        # The ChatGLM path feeds the tokenizer output to generate() directly.
        # Previously `inputs_glm` was never assigned in this branch, which
        # raised a NameError on a fresh kernel.
        inputs_glm = inputs
    else:
        inputs_glm = tokenizer.build_inputs_for_generation(inputs,
                                                           max_gen_length=max_length_generation,
                                                           padding=True)
    inputs_glm = inputs_glm.to(device)
    outputs = model.generate(**inputs_glm,
                             max_new_tokens=max_length_generation,
                             eos_token_id=tokenizer.eop_token_id,
                             pad_token_id=tokenizer.pad_token_id,
                             do_sample=False,
                             num_return_sequences=num_return_sequences,
                             top_p=top_p,
                             temperature=temperature)
else:
    # Other models: prompt, then the separator token followed by `prefix`,
    # with no extra special tokens added by the tokenizer.
    inputs = tokenizer(prompt, tokenizer.sep_token + prefix,
                       max_length=max_length,
                       return_tensors="pt",
                       truncation="only_first",
                       add_special_tokens=False,
                       return_token_type_ids=False)
    inputs = inputs.to(device)
    outputs = model.generate(**inputs,
                             max_new_tokens=max_length_generation,
                             pad_token_id=tokenizer.pad_token_id,
                             do_sample=False,
                             num_return_sequences=num_return_sequences,
                             top_p=top_p,
                             temperature=temperature)

results = tokenizer.batch_decode(outputs, skip_special_tokens=True)
# results = [result.split(prefix, maxsplit=1)[1] for result in results]
print(results)

In [None]:
# Load prompts and reference answers from a processed JSONL dataset.
# f = os.path.join(root, "raw", "baike_qa_train.json")
# [baike_qa.jsonl, chinese_classical.jsonl, chinese_poetry.jsonl, couplets.jsonl, weibo_summary_comments.jsonl, zhidao.jsonl]
f = os.path.join(root, "chatgpt", "processed", "baike_qa.jsonl")
i = 0  # number of records read
prompts = []
prompts_processed = []
labels = []
with open(f, "r", encoding="utf-8") as r:
    for line in r:
        line = line.strip()
        if not line:
            # Skip blank lines instead of crashing in json.loads.
            continue
        item = json.loads(line)
        # prompt = clean_text(item['title'] if len(item['title']) > len(item['desc']) else item['desc'])
        # prompt_processed = prompt + tokenizer.sep_token + prefix
        # label = clean_text(item['answer'])
        prompt = item['prompt']
        prompt_processed = prompt
        # Only the first listed answer is used as the reference label.
        label = item['answers'][0]['answer']
        prompts.append(prompt)
        prompts_processed.append(prompt_processed)
        labels.append(label)
        i += 1
        # if i > 1000:
        #     break

# Shuffle all three lists with the SAME permutation so that index k still
# refers to one record across `prompts`, `prompts_processed` and `labels`.
# Shuffling only `prompts_processed` broke that alignment.
order = list(range(len(prompts_processed)))
random.shuffle(order)
prompts = [prompts[k] for k in order]
prompts_processed = [prompts_processed[k] for k in order]
labels = [labels[k] for k in order]
print(len(prompts_processed))

In [None]:
# Sample several continuations for one prompt taken from the loaded dataset.
i = 79                        # index into prompts_processed
num_return_sequences = 2
max_length = 512
max_length_generation = 100   # number of new tokens to generate
top_k = 50
top_p = 0.8                   # kept for reference; not passed to generate() below
temperature = 1.0
t1 = time.time()
prompt = prompts_processed[i]

# Tokenize without special tokens and move the tensors to the model's device.
inputs = tokenizer(
    prompt,
    add_special_tokens=False,
    return_token_type_ids=False,
    return_tensors="pt",
).to(device)

outputs = model.generate(
    **inputs,
    max_new_tokens=max_length_generation,
    pad_token_id=tokenizer.pad_token_id,
    do_sample=True,
    num_return_sequences=num_return_sequences,
    # top_p=top_p,
    top_k=top_k,
    temperature=temperature,
)

# Decode each returned sequence and print it on its own line.
results = tokenizer.batch_decode(outputs, skip_special_tokens=True)
for result in results:
    # result.split(prefix, maxsplit=1)[1]
    print(result)


# results = text_generator(prompts_processed[i:j], max_length=200, num_return_sequences=num_return_sequences,
#                          do_sample=True, top_k=50, temperature=10.0)
# print(f"Finished prediction, time taken: {time.time()-t1}")

# for prompt, res, label in zip(prompts[i:j], results[:(j-i)], labels[i:j]):
#     print(f"prompt: {prompt}\nlabel: {label}")
#     for k in range(num_return_sequences):
#         model_answer = res[k]['generated_text'].split(prefix)[1].replace("<eot>", "").replace("<pad>", "")
#         print(f"model answer-{k}: {model_answer}")
#     print("\n\n")

# SFT Training

In [None]:
# prompt = "你是谁"
# prefix = "答:"
# label = "我是***，很高兴为你服务"
prompt = """倍数金额： 1倍，￥1024 元 场次 主---------客队 投注选项- 参考赔率---投注选项胜 平 负---
第一次为基本面投注---第二次为通过处理后投注  1 伯明翰 VS -----维冈 31-----1.93 3.27 3.87 ---  2伯恩利VS---朴茨茅30----- ---3 博尔顿 VS -----狼队
3------1.94 3.25 3.88 ---  4 斯托克 VS ---阿森纳 0------5.03 3.47 1.68 ---  5 门兴 VS -----弗赖堡 31-----1.77 3.倍数金额： 1倍，￥1024 元 场次
主---------客队 投注选项- 参考赔率---投注选项胜 平 负--- 第一次为基本面投注---第二次为通过处理后投注  1 伯明翰 VS -----维冈 31-----1.93 3.27 3.87 ---
2伯恩利VS---朴茨茅30----- ---3 博尔顿 VS -----狼队 3------1.94 3.25 3.88 ---  4 斯托克 VS ---阿森纳 0------5.03 3.47 1.68 ---  5 门兴 VS -----弗赖堡
31-----1.77 3.39 4.43 ---  6 美因兹 VS ---不来梅 10-----3.76 3.34 1.92 ---  7波鸿VS-----纽伦堡30----- ---8 斯图加 VS ---法兰克 31-----1.59 3.62 5.47
---  9 赫塔 VS -----霍芬海 30-----2.49 3.19 2.69 ---  10 勒沃 VS ------科隆 3------1.35 4.44 8.31 ---  11卡塔尼VS----巴里31----- ---12 拉齐奥 VS
--佛罗伦 31-----2.35 3.05 3.01 ---  13 特内里 VS ----皇马 0------9.43 4.95 1.29 ---  14 巴萨 VS ----马拉加 3------1.15 6.78 15.49 --"""
prefix = "回答："
label = "你出的赔率数据太早了，数据随时都会变化，这就是所谓要看临盘的道理，目前的数据没什么参考价值。"
# Build one GLM-style training example from `prompt`, `prefix` and `label`,
# splitting a fixed token budget between the prompt and the label.
max_length = 512  # total token budget shared by prompt and label
# Encode the untruncated prompt (prompt + prefix + mask token) to measure its length.
encoded_prompt = tokenizer(prompt, prefix + tokenizer.mask_token)
prompt_length = len(encoded_prompt['input_ids'])
# Label length in tokens, plus one extra slot for every model except chatglm
# (presumably room for an end-of-sequence token — TODO confirm).
label_length = len(tokenizer.tokenize(label)) + (1 if "chatglm" not in model_name_or_path else 0)
# print(f"prompt length: {prompt_length}, label length: {label_length}")
if prompt_length + label_length > max_length:
    # Over budget: remove one token at a time from whichever side is currently
    # longer until the pair fits.
    num_tokens_to_remove = prompt_length + label_length - max_length
    for _ in range(num_tokens_to_remove):
        if prompt_length > label_length:
            prompt_length -= 1
        else:
            label_length -= 1
else:
    # Under budget: give the label all remaining room.
    label_length = max_length - prompt_length
# Sanity checks on the final budget split.
assert prompt_length > 0
assert label_length > 0
assert prompt_length + label_length <= max_length
# Re-encode with the final prompt budget; only the first segment (the prompt)
# may be truncated, so the prefix + mask token is preserved.
encoded_dict = tokenizer(prompt, prefix + tokenizer.mask_token,
                         max_length=prompt_length, truncation="only_first",
                         return_tensors="pt", return_attention_mask=True)
# NOTE(review): this call is commented out in the chatglm branch of the
# generation cell — confirm the configured tokenizer provides
# build_inputs_for_generation before running this with chatglm.
inputs = tokenizer.build_inputs_for_generation(encoded_dict, targets=label,
                                               max_gen_length=label_length, padding=True)


In [None]:
# Dump the tokenizer's special-token configuration.
print(tokenizer.special_tokens_map)
print(tokenizer.all_special_ids)
# One "<name>: <id>" entry per special token, each ending in a newline; print()
# separates the entries with a single space.
# (eop / sop / cls ids are not reported here.)
print(*(
    f"{name}: {getattr(tokenizer, name + '_token_id')}\n"
    for name in ("unk", "pad", "bos", "eos", "sep", "mask")
))

In [None]:
# Show which token strings the vocabulary ids 20006 and 20012 map to.
for token_id in (20006, 20012):
    print(tokenizer.convert_ids_to_tokens([token_id]))

In [None]:
# Inspect the two encodings produced while building the training example:
# `encoded_prompt` is the untruncated prompt encoding, `encoded_dict` the one
# re-encoded with the final prompt budget as tensors.
print(encoded_prompt)
# Decode the ids back to text to check the prompt / prefix / mask layout.
print(tokenizer.decode(encoded_prompt['input_ids']))
print(encoded_dict)
# encoded_dict was created with return_tensors="pt", hence batch_decode.
print(tokenizer.batch_decode(encoded_dict['input_ids']))

In [None]:
# Report the shape of every entry in `inputs`.
for key in inputs.keys():
    val = inputs[key]
    print(f"{key} shape: {val.shape}")

In [None]:
# Report the shape of every entry in `inputs_glm`.
for key in inputs_glm.keys():
    val = inputs_glm[key]
    print(f"{key} shape: {val.shape}")

In [None]:
# Peek at the token ids in `inputs_glm`.
# NOTE(review): `[:20]` slices the first axis; if input_ids is (batch, seq)
# this selects up to 20 rows rather than the first 20 tokens — confirm intent.
print(inputs_glm['input_ids'][:20])

In [None]:
# Peek at the training labels in `inputs_glm`.
# NOTE(review): in the generation cell `inputs_glm` is built without `targets`,
# so a 'labels' entry may be absent — presumably this was meant for a value
# built with targets (as `inputs` is in the training cell); verify.
print(inputs_glm['labels'][:20])

In [None]:
# Print element [0][9] of the attention mask
# (presumably the mask for position 9 of the first sample — TODO confirm layout).
print(inputs_glm['attention_mask'][0][9])

In [None]:
# Print the first 20 values of position_ids[0] and position_ids[1].
# NOTE(review): assumes position_ids has at least two entries along its first
# axis; what each entry represents is not visible here — confirm.
print(inputs_glm['position_ids'][0][:20])
print(inputs_glm['position_ids'][1][:20])

In [None]:
# List the parameter names in `st`, the state dict assembled from the
# checkpoint files. Uncomment the next line to inspect the live model instead.
# st = model.state_dict()
st.keys()

In [None]:
# Spot-check the dtype of two entries in `st`: the word-embedding weight and
# the first layer's input layer-norm weight.
print(st['transformer.word_embeddings.weight'].dtype)
print(st['transformer.layers.0.input_layernorm.weight'].dtype)

In [None]:
# Group the parameter names in `st` by dtype, then show which dtypes occur.
dtypes = dict()
for key, val in st.items():
    dtypes.setdefault(val.dtype, list()).append(key)
print(dtypes.keys())


In [None]:
# Forward pass on the prepared batch.
# `inputs` from the training cell is built on the CPU while the model was moved
# to `device`; move the batch first to avoid a device-mismatch error on GPU.
inputs = inputs.to(device)
output = model(**inputs)