# 单轮对话人工评估对比

## 切换工作目录

In [1]:
# Move the kernel's working directory up one level; the captured output below
# shows it ends up in /home/Public/DialoGPT. Presumably this is what lets the
# repo-local imports (demo_utils, env, gpt2_training) resolve — confirm.
%cd ..

/home/Public/DialoGPT


## 引用模组

In [2]:
import csv
import json
import logging
import os
import re
import socket
import subprocess as sp
import sys
from functools import partial
from importlib import import_module
from os.path import abspath, dirname, exists, join
from types import SimpleNamespace

import numpy as np
import torch
import torch.nn.functional as F
from pytorch_pretrained_bert import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer
from tqdm.auto import tqdm, trange

from demo_utils import download_model_folder
from env import END_OF_TEXT_TOKEN
from gpt2_training.train_utils import (boolean_string,
                                       fix_state_dict_namespace,
                                       get_eval_list_same_length, load_model)

## 定义全局函数

### Functions for generation, filtering, and sampling

In [3]:

def cut_seq_to_eos(sentence, eos_id, remove_id=[-1]):
    """Return the tokens of ``sentence`` that come before the first ``eos_id``.

    Args:
        sentence: iterable of token ids.
        eos_id: token id that ends the sequence; it is not part of the result.
        remove_id: container of token ids to drop from the result. The default
            list is only read, never mutated.

    Returns:
        A new list holding the kept tokens in their original order.
    """
    kept = []
    for token in sentence:
        # Dropped ids are skipped before the EOS comparison, so an eos_id that
        # is also listed in remove_id never terminates the scan.
        if token in remove_id:
            continue
        if token == eos_id:
            break
        kept.append(token)
    return kept


# FROM HUGGING FACE REPO
def top_filtering(logits, top_k=0, top_p=0.0, threshold=-float('Inf'), filter_value=-float('Inf')):
    """Mask a 1-D logits vector in place using top-k, nucleus (top-p) and threshold filters.

    The three filters run in that order; every rejected position is
    overwritten with ``filter_value``.

    Args:
        logits: 1-D tensor of logits, one entry per vocabulary item. It is
            modified in place.
        top_k: if > 0, keep only the ``top_k`` largest logits (clamped to the
            vocabulary size; entries tied with the k-th largest also survive).
            If <= 0 this filter is skipped.
        top_p: if > 0.0, keep the highest-probability tokens up to and
            including the first one whose cumulative softmax mass exceeds
            ``top_p``. If <= 0.0 this filter is skipped.
        threshold: logits strictly below this value are removed.
        filter_value: value written into removed positions.

    Returns:
        The same ``logits`` tensor object, after filtering.
    """
    # Only a single (un-batched) distribution is supported.
    assert logits.dim() == 1

    top_k = min(top_k, logits.size(-1))
    if top_k > 0:
        # Reject everything strictly below the k-th largest logit.
        kth_largest = torch.topk(logits, top_k)[0][..., -1, None]
        logits.masked_fill_(logits < kth_largest, filter_value)

    if top_p > 0.0:
        ranked_logits, ranked_indices = torch.sort(logits, descending=True)
        cumulative_mass = torch.cumsum(F.softmax(ranked_logits, dim=-1), dim=-1)

        over_budget = cumulative_mass > top_p
        # Shift the mask one slot to the right so the first token that crosses
        # top_p is still kept; the top-ranked token is therefore never removed.
        drop_ranked = torch.cat((over_budget.new_zeros(1), over_budget[:-1]))

        # Map the ranked mask back to vocabulary positions.
        logits[ranked_indices[drop_ranked]] = filter_value

    logits.masked_fill_(logits < threshold, filter_value)
    return logits


def generate_next_token(model, input_ids, position_ids=None, token_type_ids=None, prev=None, temperature=1, top_k=0, top_p=0, past=None):
    """Sample one next token from the model, reusing cached state when given.

    Args:
        model: object exposing ``transformer`` and ``lm_head`` callables.
        input_ids: accepted for interface compatibility; not read here.
        position_ids: forwarded to the transformer only when ``past`` is falsy.
        token_type_ids: forwarded to the transformer only when ``past`` is falsy.
        prev: token ids fed to the transformer on this step.
        temperature: divisor applied to the last-position logits.
        top_k: forwarded to ``top_filtering``.
        top_p: forwarded to ``top_filtering``.
        past: cached transformer state from a previous step, or a falsy value
            on the first step.

    Returns:
        A tuple ``(next_token, next_token_prob, past)``: the sampled token from
        ``torch.multinomial`` with one sample, its probability under the
        filtered distribution, and the state returned by the transformer.
        Only batch element 0 of the model output is used.
    """
    with torch.no_grad():
        if past:
            # Cached state is available: only the newest token is passed in.
            hidden, presents = model.transformer(prev, past=past)
        else:
            hidden, presents = model.transformer(prev, position_ids, token_type_ids, past=past)

        # Logits at the last position of the first batch element.
        last_logits = model.lm_head(hidden)[0, -1, :] / temperature
        filtered = top_filtering(last_logits, top_k=top_k, top_p=top_p)

        probs = F.softmax(filtered.unsqueeze(0), dim=-1)
        next_token = torch.multinomial(probs, num_samples=1)
        return next_token, probs[0][next_token], presents


def generate_sequence(model, input_ids, position_ids=None, token_type_ids=None, temperature=1, top_k=0, top_p=0, length=20, past=None, device='cuda', eos_id=None):
    """Sample up to ``length`` tokens following ``input_ids``.

    Args:
        model: forwarded to ``generate_next_token``.
        input_ids: prompt token ids; used as the first step's input.
        position_ids: forwarded to ``generate_next_token`` on every step.
        token_type_ids: forwarded to ``generate_next_token`` on every step.
        temperature: forwarded to ``generate_next_token``.
        top_k: forwarded to ``generate_next_token``.
        top_p: forwarded to ``generate_next_token``.
        length: maximum number of tokens to sample.
        past: initial cached state, updated after each step.
        device: accepted for interface compatibility; not read here.
        eos_id: if not None, sampling stops as soon as the first row's sampled
            token equals this id; that token is not appended.

    Returns:
        A tensor with ``input_ids.size(0)`` rows containing only the sampled
        tokens (the prompt is not included).
    """
    generated = input_ids.new_zeros([input_ids.size(0), 0])
    prev = input_ids
    for _ in range(length):
        prev, _token_prob, past = generate_next_token(
            model, input_ids, position_ids,
            token_type_ids, prev, temperature, top_k, top_p, past
        )
        # Only the first row is inspected for the end-of-sequence token.
        if eos_id is not None and prev[0].cpu()[0] == eos_id:
            break
        generated = torch.cat((generated, prev), dim=1)
    return generated


### Functions for loading the tokenizer and model

In [4]:
def get_tokenizer(args):
    """Instantiate the tokenizer named by ``args.tokenizer_class``.

    Args:
        args: namespace with ``tokenizer_class``, a ``'<module>:<class>'``
            string (surrounding whitespace is ignored), and
            ``tokenizer_model``, the value passed to the class's
            ``from_pretrained``.

    Returns:
        Whatever ``<class>.from_pretrained(args.tokenizer_model)`` returns.

    Raises:
        ValueError: if the spec does not contain exactly one ``':'``.
    """
    module_name, class_name = args.tokenizer_class.strip().split(':')
    tokenizer_cls = getattr(import_module(module_name), class_name)
    return tokenizer_cls.from_pretrained(args.tokenizer_model)


def load_tokenizer_and_model(args):
    """Seed the RNGs, then load the tokenizer and the GPT-2 LM head model.

    Side effects: sets ``args.device`` and ``args.n_gpu``, seeds NumPy and
    torch with ``args.seed``, and prints progress messages.

    Args:
        args: namespace providing ``seed``, the fields read by
            ``get_tokenizer``, ``model_name_or_path`` (directory containing
            ``config.json``) and ``load_checkpoint`` (passed to ``load_model``).

    Returns:
        A tuple ``(tokenizer, model, device)`` with the model moved to
        ``device`` and switched to eval mode.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.device = device
    args.n_gpu = torch.cuda.device_count()

    for seed_rng in (np.random.seed, torch.random.manual_seed, torch.cuda.manual_seed):
        seed_rng(args.seed)

    # Tokenizer first; the EOS id is printed only as a sanity check.
    print('load tokenizer ...')
    tokenizer = get_tokenizer(args)
    eos_id = tokenizer.encode(END_OF_TEXT_TOKEN)[-1]
    print('EOS: {:,d}'.format(eos_id))

    # Build the model from its config, then let load_model apply the checkpoint.
    print('load the GPT-2 model ...')
    config_path = os.path.join(args.model_name_or_path, 'config.json')
    blank_model = GPT2LMHeadModel(GPT2Config.from_json_file(config_path))
    model = load_model(blank_model, args.load_checkpoint, args, verbose=True)
    print('load the GPT-2 model ok.')

    model.to(device)
    model.eval()

    return tokenizer, model, device


### Function for single-turn dialog evaluation

In [5]:
def generate_reply(tokenizer, model, device, history, args):
    """Generate one reply to a dialog history and append it to that history.

    Args:
        tokenizer: object with ``encode`` and ``decode`` methods.
        model: forwarded to ``generate_sequence``.
        device: device on which the context tensor is created.
        history: a single utterance (str) or a list of utterances (str).
            A list argument is mutated in place: the reply is appended to it.
        args: namespace providing ``generation_length``, ``temperature``,
            ``top_k`` and ``top_p``.

    Returns:
        The history list with the decoded reply as its last element.
    """
    if isinstance(history, str):
        history = [history]
    assert all(isinstance(i, str) for i in history)

    eos_id = tokenizer.encode(END_OF_TEXT_TOKEN)[-1]

    # Every utterance is followed by the EOS id, including the last one.
    flat_ids = [
        token_id
        for utterance in history
        for token_id in tokenizer.encode(utterance) + [eos_id]
    ]
    context = torch.tensor(flat_ids, device=device, dtype=torch.long).unsqueeze(0)
    positions = torch.arange(0, context.size(-1), dtype=torch.long, device=context.device)

    generated = generate_sequence(
        model, context, position_ids=positions,
        length=args.generation_length, temperature=args.temperature,
        top_k=args.top_k, top_p=args.top_p,
        eos_id=eos_id
    )

    # Only the first row of the generated batch is decoded.
    reply = tokenizer.decode(generated[0].tolist())

    history.append(reply)
    return history


## 参数默认值

In [6]:
# Default settings shared by the loading and generation helpers in this notebook.
# The None entries are filled in by the run-parameters cell.
args = SimpleNamespace(
    seed=42,  # seeds NumPy and torch in load_tokenizer_and_model
    tokenizer_class=None,  # '<module>:<class>' spec resolved by get_tokenizer
    tokenizer_model=None,  # passed to the tokenizer class's from_pretrained
    model_name_or_path=None,  # directory from which config.json is read
    load_checkpoint=None,  # checkpoint path handed to load_model
    fp16=True,  # not read in this notebook; presumably consumed by load_model — TODO confirm
    max_seq_length=1024,  # not read by any code visible in this notebook
    generation_length=256,  # maximum number of tokens sampled per reply
    temperature=0.7,  # divisor applied to the logits before sampling
    top_k=0,  # 0 disables top-k filtering in top_filtering
    top_p=0.9  # nucleus (top-p) threshold used by top_filtering
)

## 运行参数

In [7]:
# Model directory (config.json is read from here) and the checkpoint passed to load_model.
args.model_name_or_path = '/home/Public/DialoGPT/output_model/345m-hmwebmix-bpe-32k-v2/pre-train-345m-hmwebmix-bpe-32k-v2'
args.load_checkpoint = '/home/Public/DialoGPT/output_model/345m-hmwebmix-bpe-32k-v2/GP2-pretrain-step-70000.pkl'

# Tokenizer class spec ('<module>:<class>') and the path given to its from_pretrained;
# the path is the same directory as args.model_name_or_path.
args.tokenizer_class = 'tokenizers.tokenization_cn:GPT2BPETokenizer_CN'
args.tokenizer_model = '/home/Public/DialoGPT/output_model/345m-hmwebmix-bpe-32k-v2/pre-train-345m-hmwebmix-bpe-32k-v2'

In [8]:
# Load everything once; these three globals are reused by the evaluation loop.
tokenizer, model, device = load_tokenizer_and_model(args)

load tokenizer ...
EOS: 6
load the GPT-2 model ...
load the GPT-2 model ok.


## 执行

从测试集进行单轮对话。

数据格式:

- tsv
- 两列
  - 第一列: A 的提问
  - 第二列: B 的回答

In [9]:
# TSV test file read by the evaluation loop.
test_data_file = '/home/Public/data/transfer-learning/output/output-qa/xinli001_jiandanxinli-qa.topics_去重_DialogGPT/test_0_DialogGPT.tsv'

# Shell escape: count the lines of the test file ({test_data_file} is interpolated by IPython).
!wc -l {test_data_file}

9108 /home/Public/data/transfer-learning/output/output-qa/xinli001_jiandanxinli-qa.topics_去重_DialogGPT/test_0_DialogGPT.tsv


In [None]:
# Generate a reply for the first `total` questions of the test file and write
# one JSON object per line with the question, the generated reply and the
# reference answer.
total = 500
output_file = 'out/test-result.70k.json'

# The output directory may not exist yet in a fresh working directory.
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Explicit UTF-8 keeps the Chinese text independent of the locale default
# (the JSON is written with ensure_ascii=False); newline='' is what the csv
# module expects for files it reads.
with open(test_data_file, encoding='utf-8', newline='') as csvfile, \
     open(output_file, 'w', encoding='utf-8') as fp:
    reader = csv.reader(csvfile, delimiter='\t')
    # zip() with range(total) stops after `total` rows without reading further.
    for _, row in tqdm(zip(range(total), reader), total=total):
        # Unpack both columns first so a malformed row fails before the
        # expensive generation step rather than after it.
        question, answer = row[0], row[1]
        history = generate_reply(tokenizer, model, device, question, args)
        data = {
            'question': question,
            'generate': history[-1],
            'answer': answer,
        }
        print(json.dumps(data, ensure_ascii=False), file=fp)
        

HBox(children=(FloatProgress(value=0.0, max=500.0), HTML(value='')))