# Generate QA in a few-shot learning way

In [1]:
import os
import openai

proxy = 'http://dell-1.star:7890' # 3090 docker
# proxy = 'http://127.0.0.1:7890' # clash
# proxy = 'http://127.0.0.1:1080' # naiveproxy

os.environ['http_proxy'] = proxy 
os.environ['HTTP_PROXY'] = proxy
os.environ['https_proxy'] = proxy
os.environ['HTTPS_PROXY'] = proxy
openai.api_key_path = ".openai-key2"

In [10]:
import json
import random

with open("template/question_seeds.json") as fin:
    question_seeds = json.load(fin)
with open("template/answer_seeds.json") as fin:
    answer_seeds = json.load(fin)

QUESTION_PROMPT = """请参考如下示例，根据给定文本生成问题，要求尽可能使用简体中文，且表述清晰详细

文本：{context_example}

问题：
{questions_example}

文本: {context}

问题:
1.
"""

ANSWER_PROMPT = """请参考如下示例，根据给定文本生成问题的答案，要求尽可能使用简体中文，且从文本中找不到答案时回答“无法确定”

文本: {context_example}

问题:
{questions_example}

答案:
{answers_example}

文本: {context}

问题:
{questions}

答案:
1.
"""


In [11]:
import logging

def gen_questions_by_chat(row, max_tokens=1000):
    example = random.choice(question_seeds)
    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {
                    "role": "user",
                    "content": QUESTION_PROMPT.format(
                        context_example=example['context'],
                        questions_example=example['questions'],
                        context=row.context)
                }
            ],
            temperature=0,
            max_tokens=max_tokens,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            stop=["\n\n"]
        )
        return response.choices[0].message.content
    except Exception as e:
        logging.error(e)
        return ""


def gen_answers_by_chat(row, max_tokens=1000):
    example = random.choice(answer_seeds)
    try:
        response = openai.ChatCompletion.create(
            model = "gpt-3.5-turbo",
            messages=[
                {
                    "role": "user",
                    "content": ANSWER_PROMPT.format(
                        context_example=example['context'],
                        questions_example=example['questions'],
                        answers_example=example['answers'],
                        context=row.context, 
                        questions=row.questions)
                }
            ],
            temperature=0,
            max_tokens=max_tokens,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )
        return response.choices[0].message.content
    except Exception as e:
        logging.error(e)
        return ""

## load file

In [1]:
import pandas as pd
from tqdm import tqdm

tqdm.pandas()

filename = "book/few-shot/book.csv"
df = pd.read_csv(filename)
if "context" not in df.columns:
    assert "summary" in df.columns and "content" in df.columns,\
        "Either 'context' or 'summary' and 'content' must be in the csv file"
    df["context"] = "summary: " + df["summary"] + "\ncontent: " + df["content"]


## generate questions

In [12]:
df["questions"] = df.progress_apply(gen_questions_by_chat, axis=1)
df["questions"] = "1." + df.questions

df.to_csv(filename.replace(".csv", "-questions.csv"), index=False)

  4%|▍         | 26/588 [01:41<35:27,  3.79s/it]  ERROR:root:This model's maximum context length is 4097 tokens. However, your messages resulted in 6359 tokens. Please reduce the length of the messages.
  8%|▊         | 49/588 [03:40<47:19,  5.27s/it]  ERROR:root:This model's maximum context length is 4097 tokens. However, you requested 4408 tokens (3408 in the messages, 1000 in the completion). Please reduce the length of the messages or completion.
 51%|█████     | 301/588 [20:18<12:56,  2.70s/it]  ERROR:root:The server is overloaded or not ready yet.
 69%|██████▊   | 403/588 [25:29<07:43,  2.51s/it]ERROR:root:The server is overloaded or not ready yet.
 92%|█████████▏| 543/588 [32:30<02:45,  3.69s/it]ERROR:root:This model's maximum context length is 4097 tokens. However, your messages resulted in 6429 tokens. Please reduce the length of the messages.
 95%|█████████▍| 558/588 [33:32<01:56,  3.89s/it]ERROR:root:The server is overloaded or not ready yet.
 96%|█████████▋| 566/588 [34:29<

## generate answers

In [13]:
df["answers"] = df.progress_apply(gen_answers_by_chat, axis=1)
df["answers"] = "1." + df.answers

df.to_csv(filename.replace(".csv", "-qa-raw.csv"), index=False)

  4%|▍         | 26/588 [04:38<1:12:05,  7.70s/it]ERROR:root:This model's maximum context length is 4097 tokens. However, your messages resulted in 6412 tokens. Please reduce the length of the messages.
  8%|▊         | 49/588 [08:31<1:04:49,  7.22s/it]ERROR:root:This model's maximum context length is 4097 tokens. However, you requested 4779 tokens (3779 in the messages, 1000 in the completion). Please reduce the length of the messages or completion.
 10%|▉         | 56/588 [09:26<49:03,  5.53s/it]  ERROR:root:This model's maximum context length is 4097 tokens. However, you requested 4572 tokens (3572 in the messages, 1000 in the completion). Please reduce the length of the messages or completion.
 58%|█████▊    | 343/588 [37:33<21:39,  5.30s/it]  ERROR:root:The server is overloaded or not ready yet.
 59%|█████▉    | 348/588 [38:26<29:29,  7.37s/it]ERROR:root:The server is overloaded or not ready yet.
 92%|█████████▏| 543/588 [58:16<04:43,  6.31s/it]ERROR:root:This model's maximum cont

## split and filter qa

In [3]:
from qa_evaluator import split_qa, filter_qa

save_filename = split_qa(filename.replace(".csv", "-qa-raw.csv"))
filter_qa(save_filename, output_format="jsonl")

before filter:  (1840, 2)
after length filter:  (1637, 2)
after question mark filter:  (1608, 2)
after period filter:  (1562, 2)
after key word filter:  (1553, 2)
after duplicate filter:  (1533, 2)


100%|██████████| 1533/1533 [00:24<00:00, 63.69it/s]
100%|██████████| 1533/1533 [00:00<00:00, 2115.16it/s]

after similarity filter:  (1139, 4)
after rouge filter:  (1115, 4)





'book/few-shot/book-qa-filtered.jsonl'