# Build QA question dataset

In [1]:
import pandas as pd
from transformers import AutoTokenizer
import json

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
# Load liantongzhidao_filter.csv. keep_default_na=False stops pandas from turning
# empty cells (e.g. an empty `question`) into NaN, so the `title + question`
# string concatenation used further down keeps operating on plain strings.
df = pd.read_csv("liantongzhidao_filter.csv", keep_default_na=False)
df.head()

Unnamed: 0,title,question,reply,is_best
0,联通20块钱14个G的流量，是真的吗,,2018年6月1日-6月21日期间，联通推出端午特惠流量包，20元7G国内+7G省内流量，7...,0
1,*内联通手机怎么给泰*打电话,,国内联通手机拨打国外电话，首先需要开通国际长途功能。开通后，使用手机的拨号方式为：00+国家...,0
2,联通卡月底开通冰淇淋不限流量套餐怎么收费,联通卡月底开通冰淇淋不限流量套餐怎么收费,月底开通是按量计费的，次月开始执行套餐费用,0
3,燕*福成六期能装联通宽带吗,燕*福成六期能装联通宽带吗,建议您可以携带本人身份证到当地的营业厅预约宽带业务，如果不方便，可以通过中*联通网上营业厅申...,0
4,流量用完都怎么办了,,如果是流量用了，你可以使用手机营业厅进行购买流量包，也可以使用网上营业厅或进入对应客服中心咨...,0


In [17]:
# Rows whose reply is flagged as the best answer (is_best == 1).
df_best = df.loc[df["is_best"].eq(1)]
df_best.head()

Unnamed: 0,title,question,reply,is_best
7,一夜了流量还没到账.也找不到客服，怎么办,,如果流量未到账，一般是因为未到赠送时间或者号码状态不正常如欠费、停机导致流量无法到账的。建议...,1
9,以知一张联通号码卡的归属地是福*和后5位数的号码（65***）求这张卡的全部号码可能是哪些,,您好，首先联通是有保护用户隐私的机制的，其次福*的号码即使知道归属地和尾号，中间号段也不止一...,1
14,哪个平台手机充值话费优惠一些,最好是免费,您好，如果您的号码是联通号码，每月的28日为中*联通“支付日”，在这天，使用“沃钱包”消费支...,1
16,日版无锁5sa##53怎么破解联通电信4G！！,日版无锁5sa##53怎么破解联通电信4G！！系统版本为ios7日版无锁A##53有没大神有...,第一步：刷入联通IPCC1、准备ipcc文件(本帖附下载)2、了解自己的是多少位数的系统，右...,1
17,送的宽带账号和密码是多少,,山*联通宽带用户查询账号、密码，可通过核实证件号码、客户姓名、装机地址核对宽带账号，也可充值...,1


In [18]:
# Rows whose reply is not flagged as the best answer (is_best == 0).
df_non_best = df.loc[df["is_best"].eq(0)]
df_non_best.head()

Unnamed: 0,title,question,reply,is_best
0,联通20块钱14个G的流量，是真的吗,,2018年6月1日-6月21日期间，联通推出端午特惠流量包，20元7G国内+7G省内流量，7...,0
1,*内联通手机怎么给泰*打电话,,国内联通手机拨打国外电话，首先需要开通国际长途功能。开通后，使用手机的拨号方式为：00+国家...,0
2,联通卡月底开通冰淇淋不限流量套餐怎么收费,联通卡月底开通冰淇淋不限流量套餐怎么收费,月底开通是按量计费的，次月开始执行套餐费用,0
3,燕*福成六期能装联通宽带吗,燕*福成六期能装联通宽带吗,建议您可以携带本人身份证到当地的营业厅预约宽带业务，如果不方便，可以通过中*联通网上营业厅申...,0
4,流量用完都怎么办了,,如果是流量用了，你可以使用手机营业厅进行购买流量包，也可以使用网上营业厅或进入对应客服中心咨...,0


# V1 approach (V1 版思路)

In [28]:
# Prompts or completions with fewer tokens than this are discarded by to_propmt.
MIN_TOKEN_LENGTH = 10

# Tokenizer used only to count tokens (see get_num_token); no model is run here.
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")

def get_num_token(text: str):
    """Return how many tokens the module-level `tokenizer` splits `text` into."""
    pieces = tokenizer.tokenize(text)
    return len(pieces)

def to_propmt(df, output_path="liantongzhidao_prompt.jsonl", min_token_length=None, count_tokens=None):
    """Write the V1 prompt/completion dataset as JSON Lines.

    The prompt of each row is `title + question`; the completion is the reply.
    Rows are skipped when the prompt or completion contains `*` or `#`, or when
    either side is shorter than the minimum token length. Rows sharing the same
    prompt are merged into one record whose completion is a numbered list of
    all their replies.

    Args:
        df: Frame with `title` and `question` columns plus the reply text in
            either a `reply` or an already-renamed `completion` column. The
            frame is NOT modified.
        output_path: Destination JSONL file (overwritten).
        min_token_length: Minimum token count for both prompt and completion.
            Defaults to the module-level MIN_TOKEN_LENGTH.
        count_tokens: Callable mapping a string to its token count. Defaults to
            the module-level get_num_token.

    Side effects:
        Prints the total token count of the kept data (each distinct prompt is
        counted once) and writes `output_path`.
    """
    if min_token_length is None:
        min_token_length = MIN_TOKEN_LENGTH
    if count_tokens is None:
        count_tokens = get_num_token

    # Non in-place rename: the caller's frame stays untouched, and a frame whose
    # column was already renamed to "completion" is accepted as well.
    frame = df.rename(columns={"reply": "completion"})
    prompts = frame["title"] + frame["question"]

    token_num = 0
    prompt_dict = {}
    for prompt, completion in zip(prompts, frame["completion"]):
        # drop prompt or completion with * or #
        if any(mark in text for text in (prompt, completion) for mark in "*#"):
            continue

        # drop prompt or completion with fewer than min_token_length tokens
        prompt_token = count_tokens(prompt)
        completion_token = count_tokens(completion)
        if prompt_token < min_token_length or completion_token < min_token_length:
            continue

        # merge duplicate prompts; a repeated prompt only adds its completion tokens
        if prompt not in prompt_dict:
            token_num += prompt_token + completion_token
            prompt_dict[prompt] = [completion]
        else:
            token_num += completion_token
            prompt_dict[prompt].append(completion)

    print(f"Total token number: {token_num}")
    # Explicit UTF-8: ensure_ascii=False emits raw Chinese characters, which the
    # platform default encoding is not guaranteed to be able to represent.
    with open(output_path, "w", encoding="utf-8") as f:
        for prompt, completions in prompt_dict.items():
            if len(completions) == 1:
                completion = completions[0]
            else:
                completion = "".join(
                    f"{rank}. {text}\n" for rank, text in enumerate(completions, start=1)
                )
            f.write(
                json.dumps({
                    "prompt": prompt,
                    "completion": completion
                }, ensure_ascii=False) + "\n"
            )

# Run the V1 pipeline on the full frame; it writes liantongzhidao_prompt.jsonl.
# The V2 cell further down writes to the same path, so running it overwrites this output.
to_propmt(df)

Token indices sequence length is longer than the specified maximum sequence length for this model (590 > 512). Running this sequence through the model will result in indexing errors


Total token number: 8005128


# V2 (0607): keep best replies only and restore masked words

In [7]:
# Masked fragment -> restored text. The source data replaces some characters
# with `*` or `#`; these rules put the intended words back.
sub_rules = {
    "中*联通": "中国联通",
    "中*联*手*营*厅": "中国联通手机营业厅",
    "中*联*网*上*营*业*厅": "中国联通网上营业厅",
    "联*": "联通",
    "腾讯王*": "腾讯王卡",
    "*过": "通过",
    "*常": "通常",
    "网上营*厅": "网上营业厅",
    "联*实*营*厅": "联通实体营业厅",
    "联*网*营*厅": "联通网上营业厅",
    "大*卡": "大王卡",
    "*际长途": "国际长途",
    "联*手*营*厅": "联通手机营业厅",
    "v*v*": "vivo",
    "iP##ne": "iPhone",
    "归*地*通*业*": "归属地联通营业厅",
    "ip##ne": "iphone",
    "联*号*": "联通号码",
    "王*助手": "王卡助手",
    "联*营*厅": "联通营业厅",
    "王*": "王卡",
    "王*荣耀": "王者荣耀",
    "王*荣*": "王者荣耀",
    "1##86": "10086",
    "营*厅": "营业厅"
}

# Sort by key length, longest first. Rules are applied in dict order, so a long
# pattern such as "王*荣耀" must run before a shorter one it contains ("王*");
# otherwise the short rule would rewrite the text first and the long one could
# never match. sorted() is stable, so equal-length keys keep their order above.
sub_rules = {k: sub_rules[k] for k in sorted(sub_rules, key=len, reverse=True)}

In [14]:
# Prompts or completions with fewer tokens than this are discarded by to_propmt.
MIN_TOKEN_LENGTH = 10

# Tokenizer used only to count tokens (see get_num_token).
# NOTE(review): trust_remote_code=True loads custom code from the model repository,
# and no `revision` is pinned (the loader itself warns about this). Pin a vetted
# revision before relying on this in a shared environment.
tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)

def get_num_token(text: str):
    """Count the tokens the module-level `tokenizer` produces for `text`."""
    token_list = tokenizer.tokenize(text)
    return len(token_list)

def get_num_token_df(df: pd.DataFrame):
    """Return the combined total of the prompt and completion token-count columns."""
    prompt_total = df["prompt_token_num"].sum()
    completion_total = df["completion_token_num"].sum()
    return prompt_total + completion_total

def sub_sensitive(text: str):
    """Apply every rule in `sub_rules`, in dict order, and return the restored text."""
    restored = text
    for masked in sub_rules:
        restored = restored.replace(masked, sub_rules[masked])
    return restored

def to_propmt(df, output_path="liantongzhidao_prompt.jsonl", min_token_length=None,
              count_tokens=None, restore_text=None):
    """Write the V2 prompt/completion dataset as JSON Lines.

    Pipeline, with row and token totals printed after each stage:
      1. prompt = title + question, completion = reply; count tokens of both.
      2. Keep rows with is_best == 1.
      3. Keep rows whose prompt AND completion have at least the minimum
         number of tokens.
      4. Restore masked words in prompt and completion.
      5. Discard rows that still contain `*` or `#`.

    Args:
        df: Frame with `title`, `question`, `is_best` columns plus the reply
            text in either a `reply` or an already-renamed `completion`
            column. The frame is NOT modified.
        output_path: Destination JSONL file (overwritten).
        min_token_length: Minimum token count for both sides. Defaults to the
            module-level MIN_TOKEN_LENGTH.
        count_tokens: Callable mapping a string to its token count. Defaults to
            the module-level get_num_token.
        restore_text: Callable that restores masked words in a string. Defaults
            to the module-level sub_sensitive.

    Notes:
        * The "Dropped ..." and "Drop * or # ..." lines report what REMAINS
          after the stage; the labels are kept as-is to match earlier runs.
        * Token counts are taken in step 1, before restoration, so the totals
          printed after step 5 still describe the masked text.
    """
    if min_token_length is None:
        min_token_length = MIN_TOKEN_LENGTH
    if count_tokens is None:
        count_tokens = get_num_token
    if restore_text is None:
        restore_text = sub_sensitive

    def token_total(frame):
        # Same sum as get_num_token_df: prompt tokens + completion tokens.
        return frame["prompt_token_num"].sum() + frame["completion_token_num"].sum()

    # Non in-place rename returns a new frame, so the caller's frame is left
    # untouched; a frame whose column is already "completion" is accepted too.
    work = df.rename(columns={"reply": "completion"})
    work["prompt"] = work["title"] + work["question"]
    work["prompt_token_num"] = work["prompt"].apply(count_tokens)
    work["completion_token_num"] = work["completion"].apply(count_tokens)

    print(f"Original num: {len(work)}")
    print(f"Original token number: {token_total(work)}")

    # filter best reply
    work = work[work["is_best"] == 1]
    print(f"Best reply num: {len(work)}")
    print(f"Best reply token number: {token_total(work)}")

    # filter prompt or completion with fewer than min_token_length tokens
    long_enough = (
        (work["prompt_token_num"] >= min_token_length)
        & (work["completion_token_num"] >= min_token_length)
    )
    work = work[long_enough]
    print(f"Dropped num: {len(work)}")
    print(f"Dropped token number: {token_total(work)}")

    # recover sensitive words (assign returns a new frame: no chained-assignment warning)
    work = work.assign(
        prompt=work["prompt"].apply(restore_text),
        completion=work["completion"].apply(restore_text),
    )

    # drop prompt or completion that still contains * or #
    # (raw-string character class; the old "\*|#" literal was an invalid escape)
    still_masked = (
        work["prompt"].str.contains(r"[*#]")
        | work["completion"].str.contains(r"[*#]")
    )
    work = work[~still_masked]
    print(f"Drop * or # num: {len(work)}")
    print(f"Drop * or # token number: {token_total(work)}")

    # Explicit UTF-8: ensure_ascii=False emits raw Chinese characters, which the
    # platform default encoding is not guaranteed to be able to represent.
    with open(output_path, "w", encoding="utf-8") as f:
        for prompt, completion in zip(work["prompt"], work["completion"]):
            f.write(
                json.dumps({
                    "prompt": prompt,
                    "completion": completion
                }, ensure_ascii=False) + "\n"
            )

# Run the V2 pipeline on the full frame; it opens liantongzhidao_prompt.jsonl in "w" mode,
# replacing whatever an earlier run wrote there.
to_propmt(df)

Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.


Original num: 203235
Original token number: 15844411
Best reply num: 117337
Best reply token number: 11226606
Dropped num: 83309
Dropped token number: 8182062
Drop * or # num: 37875
Drop * or # token number: 3222728


In [10]:
df_tmp = pd.read_csv("liantongzhidao_tmp.csv", keep_default_na=False)

# Index labels of the "clean" rows, i.e. rows where neither prompt nor completion
# contains '*' or '#'. Dropping them leaves only the rows that are still masked.
drop_idx = [
    idx
    for idx, row in df_tmp.iterrows()
    if not any(mark in row[column] for column in ("prompt", "completion") for mark in "*#")
]

df_sensitive = df_tmp.drop(drop_idx)
df_sensitive.to_csv("liantongzhidao_sensitive.csv", index=False)

In [11]:
def sub_sensitive(text: str):
    """Return `text` with each `sub_rules` pattern replaced, applied in dict order."""
    for masked, plain in sub_rules.items():
        text = plain.join(text.split(masked))
    return text

# Restore masked words in both text columns.
for column in ("prompt", "completion"):
    df_sensitive[column] = df_sensitive[column].apply(sub_sensitive)

# count rows left without * or # after restoration
non_sensitive_num = sum(
    1
    for prompt, completion in zip(df_sensitive["prompt"], df_sensitive["completion"])
    if not ("*" in prompt or "#" in prompt or "*" in completion or "#" in completion)
)
print(f"Non sensitive num: {non_sensitive_num}")
    

Non sensitive num: 4282
