# QA augmentation by rewriting questions

In [1]:
import os
import openai

proxy = 'http://dell-1.star:7890' # 3090 docker
# proxy = 'http://127.0.0.1:7890' # clash
# proxy = 'http://127.0.0.1:1080' # naiveproxy

os.environ['http_proxy'] = proxy 
os.environ['HTTP_PROXY'] = proxy
os.environ['https_proxy'] = proxy
os.environ['HTTPS_PROXY'] = proxy
openai.api_key_path = ".openai-key2"

In [2]:
import json
with open("template/question_augmentation.json", "r") as f:
    template = json.load(f)


AUGMENT_PROMPT = """根据以下示例问题和答案，改写给定问题，不需要改写答案，要求改写后的问题与原问题的意思相同，且改写后的问题与给定答案匹配，但形式与原问题不同。
问题：
{question_example}
答案：
{answer_example}
改写后的问题：
{output_example}

问题：
{question}
答案：
{answer}
改写后的问题：
1.
"""


In [3]:
import logging
import random

def aug_questions_by_chat(row, max_tokens=1000):
    example = random.choice(template)
    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {
                    "role": "user",
                    "content": AUGMENT_PROMPT.format(
                        question_example=example['question'],
                        answer_example=example['answer'],
                        output_example=example['output'],
                        question=row['question'],
                        answer=row['answer'])
                }
            ],
            temperature=0,
            max_tokens=max_tokens,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            stop=["\n\n"]
        )
        return response.choices[0].message.content
    except Exception as e:
        logging.error(e)
        return ""


In [4]:
import pandas
from tqdm import tqdm

tqdm.pandas()

# filename = "docx/qa/qa-chinese-003-final.csv"
filename = "book/csv/total-qa-final.csv"
df = pandas.read_csv(filename)
# rename columns
df.rename({"prompt": "question", "completion": "answer"}, axis=1, inplace=True)
df["aug_questions"] = df.progress_apply(aug_questions_by_chat, axis=1)

 16%|█▋        | 247/1497 [10:13<1:07:57,  3.26s/it]ERROR:root:The server is overloaded or not ready yet.
 17%|█▋        | 254/1497 [10:58<1:17:52,  3.76s/it]ERROR:root:The server is overloaded or not ready yet.
 79%|███████▉  | 1184/1497 [1:00:30<12:52,  2.47s/it]ERROR:root:The server is overloaded or not ready yet.
100%|██████████| 1497/1497 [1:20:21<00:00,  3.22s/it]


In [6]:
df["aug_questions"] = "1." + df.aug_questions

df.to_csv(filename.replace(".csv", "-aug.csv"), index=False)

In [9]:
# split aug_questions
import pandas as pd
import re 

aug_df = pd.read_csv(filename.replace(".csv", "-aug.csv"))
question_list, answer_list = [], []
for idx, row in aug_df.iterrows():
    aug_questions = row.aug_questions
    aug_questions = re.split(r"\d+\.", aug_questions)
    aug_questions = [q.strip() for q in aug_questions if q.strip()]
    question_list.extend(aug_questions)
    answer_list.extend([row.answer] * len(aug_questions))

new_df = pd.DataFrame({
    "question": question_list,
    "answer": answer_list
})
save_filename = filename.replace(".csv", "-aug-split.csv")
new_df.to_csv(save_filename, index=False)
new_df.to_json(save_filename.replace(".csv", ".jsonl"), 
               orient="records", force_ascii=False, lines=True)

# filter QA

In [2]:
from qa_generator import filter_qa

# filter_qa("book/csv/total-qa-final-aug-split.csv", output_format="jsonl")
filter_qa("docx/qa/qa-chinese-003-final-aug-split.csv", output_format="jsonl")

before filter:  (2483, 2)
after length filter:  (2465, 2)
after question mark filter:  (2439, 2)
after period filter:  (2439, 2)
after key word filter:  (2438, 2)
after duplicate filter:  (2431, 2)


100%|██████████| 2431/2431 [00:38<00:00, 63.60it/s]
100%|██████████| 2431/2431 [00:01<00:00, 2267.95it/s]


after similarity filter:  (2162, 4)
after rouge filter:  (2160, 4)


'docx/qa/qa-chinese-003-final-aug-split-filtered.jsonl'