In [1]:
import os
import pandas as pd
import openai
from gpt_score.gpt3_score import gpt3score
from tqdm import tqdm
import random
import logging
import json
import re
import datetime

tqdm.pandas()

proxy = 'http://dell-1.star:7890' # 3090 docker
os.environ['http_proxy'] = proxy 
os.environ['HTTP_PROXY'] = proxy
os.environ['https_proxy'] = proxy
os.environ['HTTPS_PROXY'] = proxy

openai.api_key_path = ".openai-key2"

num_samples = 1000
date_str = datetime.datetime.now().strftime("%Y-%m-%d")

In [2]:
PROMPT_1_SHOT = """请将下述提问改写为更加口语化的形式，要求保持语义不变，保证提问与回复逻辑连贯，且提问形式更加符合日常口语习惯。以下是一些例子：
示例1
[提问1] {question1}
[回复1] {answer1}
[口语化提问1] {out_question1}

以下是需要改写的提问：
[提问] {question}
[回复] {answer}
[口语化提问]
"""

In [3]:
PROMPT_2_SHOT = """请将下述提问改写为更加口语化的形式，要求保持语义不变，保证提问与回复逻辑连贯，且提问形式更加符合日常口语习惯。以下是一些例子：
示例1
[提问1] {question1}
[回复1] {answer1}
[口语化提问1] {out_question1}

示例2
[提问2] {question2}
[回复2] {answer2}
[口语化提问2] {out_question2}

以下是需要改写的提问：
[提问] {question}
[回复] {answer}
[口语化提问]
"""

In [44]:
PROMPT_3_SHOT = """请将下述提问改写为更加口语化的形式，要求保持语义不变，保证提问与回复逻辑连贯，且提问形式更加符合日常口语习惯。以下是一些例子：
示例1
[提问1] {question1}
[回复1] {answer1}
[口语化提问1] {out_question1}

示例2
[提问2] {question2}
[回复2] {answer2}
[口语化提问2] {out_question2}

示例3
[提问3] {question3}
[回复3] {answer3}
[口语化提问3] {out_question3}

以下是需要改写的提问：
[提问] {question}
[回复] {answer}
[口语化提问]
"""

In [45]:
template = json.load(open("template/question_rewrite.json", "r"))

In [46]:
raw_qas = pd.read_json("data/linewell.json")
raw_qas.rename(columns={"instruction": "question", "output": "answer"}, inplace=True)
raw_qas.drop(columns=["input"], inplace=True)
raw_qas.head()

Unnamed: 0,question,answer
0,根据《宝安区关于促进先进制造业和现代服务业高质量发展的若干措施》中，我们公司想要申请宝安区的...,根据《宝安区关于促进先进制造业和现代服务业高质量发展的若干措施》第一条第三款的规定，综合整治...
1,我们公司是一家企业技术中心，是否符合《宝安区关于促进先进制造业和现代服务业高质量发展的若干措...,根据《宝安区关于促进先进制造业和现代服务业高质量发展的若干措施》第三十三条第二款，只要您的公...
2,我们公司获得了区引导基金及其子基金的投资，是否可以获得《宝安区关于促进先进制造业和现代服务业...,是的，根据第五十二条第三款，对获得区引导基金及其子基金投资的企业，可以获得不超过获投金额的3...
3,《宝安区2022年经济稳增长和助企纾困接续政策（深宝工信〔2022〕288 号）》，我们公司...,根据《宝安区2022年经济稳增长和助企纾困接续政策（深宝工信〔2022〕288 号）》第十条...
4,针对《宝安区关于创新引领发展的实施办法（修订版）（深宝规〔2020〕11号）》，我们公司如果...,根据《宝安区关于创新引领发展的实施办法（修订版）（深宝规〔2020〕11号）》，如果公司同时...


In [47]:
keywords = ["这"]
drop_indice = []
for i in range(len(raw_qas)):
    for keyword in keywords:
        if keyword in raw_qas.iloc[i]['question']:
            drop_indice.append(i)

print(len(drop_indice))
raw_qas = raw_qas.drop(drop_indice)
print(f"after drop: {len(raw_qas)}")

# count question length without strings between 《》
def count_question_length(question):
    if "《" in question and "》" in question:
        first_pos = question.index("《")
        last_pos = question.rindex("》")
        return len(question[:first_pos] + question[last_pos+1:])
    else:
        return len(question)

raw_qas['question_length'] = raw_qas['question'].apply(count_question_length)
raw_qas = raw_qas[raw_qas['question_length'] > 15]
print(f"after filter: {len(raw_qas)}")

100
after drop: 4472
after filter: 3783


In [52]:

sampled_df = raw_qas.sample(n=num_samples, random_state=1)
sampled_df.to_json(f'data/linewell_sampled_{num_samples}.jsonl', orient='records', lines=True, force_ascii=False)

In [14]:
evol_template = json.load(open("template/question_evol.json", "r"))
sampled_df = pd.read_json(f'data/linewell_sampled_{num_samples}.jsonl', orient='records', lines=True)

In [13]:
def aug_questions_by_chat(row, template, max_tokens=1000, num_shot=1):
    if num_shot == 1:
        example = random.choice(template)
        content = PROMPT_1_SHOT.format(
            question1=example["question"],
            answer1=example["answer"],
            out_question1=example["out_question"],
            question=row["question"],
            answer=row["answer"]
        )
    elif num_shot == 2:
        examples = random.sample(template, 2)
        content = PROMPT_2_SHOT.format(
            question1=examples[0]["question"],
            answer1=examples[0]["answer"],
            out_question1=examples[0]["out_question"],
            question2=examples[1]["question"],
            answer2=examples[1]["answer"],
            out_question2=examples[1]["out_question"],
            question=row["question"],
            answer=row["answer"]
        )
    else:
        raise ValueError("num_shot must be 1 or 2")
    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {
                    "role": "user",
                    "content": content
                }
            ],
            temperature=0,
            max_tokens=max_tokens,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            stop=["\n\n"]
        )
        return response.choices[0].message.content
    except Exception as e:
        logging.error(e)
        return ""
    
def evol_aug(row):
    res = ""
    for asp in ["question_split", "synonym_replacement", "sentence_rewriting", "question_simplification"]:
        res += aug_questions_by_chat(row, evol_template[asp], num_shot=1)
    return res

In [16]:
# sampled_df["out_question"] = sampled_df.progress_apply(aug_questions_by_chat, axis=1, num_shot=2) # old version
sampled_df["out_question"] = sampled_df.progress_apply(evol_aug, axis=1) # new version with evol-instruct

 80%|███████▉  | 795/1000 [4:25:41<1:02:08, 18.19s/it] ERROR:root:The server is overloaded or not ready yet.
100%|██████████| 1000/1000 [5:29:57<00:00, 19.80s/it] 


In [17]:
# sampled_df.to_json("data/linewell_out.json", orient="records", lines=True, force_ascii=False)
valid_df = sampled_df[sampled_df["out_question"].apply(lambda x: len(x) > 0)]
valid_df.to_json(f"data/linewell_out_{num_samples}_{date_str}.jsonl", orient="records", lines=True, force_ascii=False)

In [10]:
# valid_df = pd.read_json(f"data/linewell_out_{num_samples}.jsonl", lines=True, orient="records")
# valid_df.rename(columns={"question": "instruction", "answer": "output"}, inplace=True)
# valid_df["input"] = ""
# valid_df = valid_df[["instruction", "input", "output"]]
# valid_df.to_json(f"data/linewell_out_{num_samples}.json", orient="records", force_ascii=False, indent=4)

In [19]:
valid_df = pd.read_json(f"data/linewell_out_{num_samples}_{date_str}.jsonl", lines=True, orient="records")
question_list, answer_list, old_question_list = [], [], []
for idx, row in valid_df.iterrows():
    aug_questions = row.out_question
    aug_questions = re.split(r"\d+\.", aug_questions)
    aug_questions = [q.strip() for q in aug_questions if q.strip()]
    question_list.extend(aug_questions)
    answer_list.extend([row.answer] * len(aug_questions))
    old_question_list.extend([row.question] * len(aug_questions))

new_df = pd.DataFrame({
    "question": question_list,
    "answer": answer_list,
})
# new_df = pd.concat([new_df, valid_df[["question", "answer"]]], axis=0)
new_df.to_json(f"data/linewell_out_{num_samples}_rewrite_{date_str}.jsonl", 
               lines=True, orient="records", force_ascii=False)

In [3]:
new_df = pd.read_json(f"data/linewell_out_{num_samples}_rewrite_{date_str}.jsonl", lines=True, orient="records")
new_df["input"] = ""
new_df.rename(columns={"question": "instruction", "answer": "output"}, inplace=True)

all_df = pd.read_json("data/政策问答sft_data-0727/sft_train_data.json")
all_df = pd.concat([all_df, new_df], ignore_index=True)

# shuffle data
all_df = all_df.sample(frac=1).reset_index(drop=True)

all_df.to_json(f"data/linewell_train_data_aug__{date_str}.json", force_ascii=False,
                indent=4, orient="records")

# Test Synonym Substitution

In [11]:
import json
import pandas as pd
from tqdm import tqdm
import random

tqdm.pandas()
random.seed(1)

In [21]:
synonyms = json.load(open('data/synonyms.json', 'r'))
json.dump(synonyms, open('data/synonyms.json', 'w'), indent=4, ensure_ascii=False)

# sort synonyms by key length with descending order
synonyms = sorted(synonyms.items(), key=lambda x: len(x[0]), reverse=True)
synonyms = dict(synonyms)

In [22]:
def check_and_sub_synonym(text, sys_dict=synonyms):
    for key in sys_dict:
        if key in text:
            text = text.replace(key, random.choice(sys_dict[key]))
    return text

In [23]:
aug_df = pd.read_json("data/linewell_out_1000_rewrite.jsonl", lines=True)

aug_df["sub_question"] = aug_df["question"].progress_apply(check_and_sub_synonym)
sub_df = aug_df[aug_df["sub_question"] != aug_df["question"]]

  0%|          | 0/3689 [00:00<?, ?it/s]

100%|██████████| 3689/3689 [00:03<00:00, 1007.14it/s]


In [24]:
sub_df = sub_df[["question", "sub_question", "answer"]]
sub_df.head()

Unnamed: 0,question,sub_question,answer
0,如果我们公司违反了《宝安区关于加大企业人才住房供应力度的实施办法》，会有什么处罚措施？,如其吾同龄人公物司犯忌矣《宝中央之宝安区至于加庞大企业材料宅邸支应力度之实心施办法》，会领有...,根据该实施办法，如果公司违反了相关规定，将会按照《深圳市人才安居办法》《深圳保障性住房条例》...
1,你好，如果我们公司没有按照相关规定执行，会有什么惩罚？,汝妙，而咱俩集体司罔负有按照整系决定执行，会负有啥事罚？,根据该实施办法，如果公司违反了相关规定，将会按照《深圳市人才安居办法》《深圳保障性住房条例》...
2,请问如果我们公司违反了相关规定，会受到什么样的处罚？,假灿光倘若俺曹公家司沾手犯矣辅轸相依节程，会受及啥样底判罚？,根据该实施办法，如果公司违反了相关规定，将会按照《深圳市人才安居办法》《深圳保障性住房条例》...
3,申请《2022年专业型工业互联网平台成长奖励》时，需要进行审计吗？,报名《2022夏苍穹专业型工业筹切莫失为着机台网平行台成材奖赏》点钟，要求启封拓审计与否？,是的，申请过程中需要进行专家现场核查和会计师审计，并出具审计报告作为申请的补充材料。
4,在申请过程中，是否需要进行专家现场核查和会计师审计？,每当报名登程适中，系否用登铺展土壤专户实心四方里边果皮查及会计砚砚审计？,是的，申请过程中需要进行专家现场核查和会计师审计，并出具审计报告作为申请的补充材料。
