In [1]:
import os
import re
import json
import shutil
from typing import Tuple, List, Dict, Any
from pathlib import Path
from openai import OpenAI
import shared


def convert_doc_to_text(
        input_dir: str,
        output_dir: str,
        split_pattern: str | None = None,
        min_length: int | None = None,
        split_size: int = 10000,
):
    min_length = min_length or 0
    for file in os.listdir(input_dir):
        p = Path(input_dir, file)
        if p.is_dir():
            continue
        match p.suffix.lower():
            case ".txt":
                with p.open(mode="r", encoding="utf8") as f:
                    text = f.read().replace("```", "")
            case ".docx":
                if p.name.startswith("~$"):
                    continue
                text = data.doc.load_from_file(p)
                text = "".join(text)
            case ".pdf":
                text = data.pdf.load_from_file(p)
            case _:
                raise ValueError(f"Unsupported file type {p.suffix}")
        if not text:
            raise ValueError("未查找到内容")
        if split_pattern:
            text = re.split(split_pattern, text)
        if not isinstance(text, list):
            if split_size:
                text_ = []
                for i in range(0, len(text), split_size):
                    if len(text) - i >= split_size:
                        text_.append(text[i:i+split_size])
                    else:
                        text_.append(text[i:])
                text = text_
            else:
                text = [text, ]
        for t in text:
            if len(t) <= min_length:
                continue
            id_ = shared.snow.sid()
            p = Path(output_dir, f"{id_}.txt")
            with p.open(mode="w", encoding="utf8") as f:
                f.write(t)


def extract_qa_from_llm(
        text: str,
        token: str = "gxllm",
        url: str = "http://10.133.95.100:9100/v1",
        count: int = 15,
):
    """Ask the chat model to derive question/answer pairs from ``text``.

    Args:
        text: Source passage; must be non-empty.
        token: API key passed to the OpenAI client.
        url: Base URL of the OpenAI-compatible endpoint.
        count: Minimum number of pairs requested in the prompt. A falsy
            value asks for as many pairs as possible instead.

    Returns:
        The content of the first choice of the (non-streamed) completion.

    Raises:
        ValueError: If ``text`` is empty.
    """
    if not text:
        raise ValueError("context is invalid")

    # Quantity clause of the prompt: a minimum count, or "as many as possible".
    requirement = f"数量不少于{count}个" if count else "提取尽可能多的问题答案"

    prompt = f"""
将下述内容提炼问题和答案，要求{requirement}，答案尽可能详细，不得重复。
格式要求如下：

问题：XXXXXXX
答案：XXXXXXX

问题：XXXXXXX
答案：XXXXXXX

内容如下：

{text}

"""
    client = OpenAI(api_key=token, base_url=url)
    user_message = {"role": "user", "content": prompt}
    response = client.chat.completions.create(
        model="gxllm",
        messages=[user_message],
        max_tokens=20000,
        stream=False,
    )
    return response.choices[0].message.content


def convert_text_to_json(
        text: str,
        question_pattern: str | None = None,
        answer_pattern: str | None = None
):
    question_pattern = question_pattern or r"问题[\d]?[:|：]+"
    answer_pattern = answer_pattern or r"答案[\d]?[:|：]+"
    content = []
    text = text.replace("```", "")
    re_ = re.split(question_pattern, text)
    for i, r in enumerate(re_):
        if not r:
            continue
        qa = re.split(answer_pattern, r)
        if len(qa) < 2:
            continue
        content.append({
            "index": shared.snow.sid(),
            "conversations": [
                {
                    "from": "user",
                    "value": qa[0].strip().replace("\n", ""),
                },
                {
                    "from": "assistant",
                    "value": qa[1].strip().replace("\n", ""),
                }
            ]
        })
    return 


def extract_question_from_llm(
        text: str,
        token: str = "qwen",
        url: str = "http://10.133.95.100:9100/v1",
        count: int = 50,
):
    """Ask the chat model to derive questions (without answers) from ``text``.

    Args:
        text: Source passage; must be non-empty.
        token: API key passed to the OpenAI client.
        url: Base URL of the OpenAI-compatible endpoint.
        count: Minimum number of questions requested in the prompt. A falsy
            value asks for as many questions as possible instead.

    Returns:
        The content of the first choice of the (non-streamed) completion.

    Raises:
        ValueError: If ``text`` is empty.
    """
    if not text:
        raise ValueError("context is invalid")

    # Quantity clause of the prompt: a minimum count, or "as many as possible".
    requirement = f"数量不少于{count}个" if count else "提取尽可能多的问题"

    prompt = f"""
根据下述内容提炼问题，要求{requirement}，不得重复。

{text}

"""
    client = OpenAI(api_key=token, base_url=url)
    user_message = {"role": "user", "content": prompt}
    response = client.chat.completions.create(
        model="qwen",
        messages=[user_message],
        max_tokens=20000,
        stream=False,
    )
    return response.choices[0].message.content


def extract_question_from_text(
        input_dir: str,
        output_dir: str,
        token: str = "qwen",
        url: str = "http://10.133.95.100:9100/v1",
        count: int = 50,
):
    """Generate a question file for every ``*.txt`` file in ``input_dir``.

    Each input file is sent to ``extract_question_from_llm``; the reply is
    printed and written to ``output_dir`` under the same file name.

    Args:
        input_dir: Folder holding the source text chunks.
        output_dir: Destination folder; created (with parents) when absent.
        token: API key forwarded to the model call.
        url: Endpoint base URL forwarded to the model call.
        count: Minimum number of questions forwarded to the model call.
    """
    target_root = Path(output_dir)
    if not target_root.exists():
        target_root.mkdir(parents=True, exist_ok=True)

    for source_file in Path(input_dir).glob("*.txt"):
        with source_file.open(mode="r", encoding="utf8") as reader:
            passage = reader.read()
        questions = extract_question_from_llm(
            text=passage,
            token=token,
            url=url,
            count=count,
        )
        print(questions)
        target_file = target_root / source_file.name
        with target_file.open(mode="w", encoding="utf8") as writer:
            writer.write(questions)


def convert_qa_to_json(
        input_dir: str,
        output_path: str,
        question_pattern: str | None = None,
        answer_pattern: str | None = None
):
    content = []
    files = os.listdir(input_dir)
    for file in files:
        file_path = Path(input_dir) / file
        if file_path.suffix not in [".txt"]:
            continue
        with file_path.open(mode="r", encoding="utf8") as f:
            txt = f.read()
            j = convert_text_to_json(
                txt,
                question_pattern=question_pattern,
                answer_pattern=answer_pattern,
            )
            content = content + j
    if content:
        with Path(output_path).open(mode="w+", encoding="utf8") as f:
            j = json.dumps(content, ensure_ascii=False, indent=2)
            f.write(j)
            

def extract_qa_from_txt(
        input_path: str | Path,
        count: int | None = None,
        url: str = "http://10.133.95.100:9100/v1",
        min_tokens: int = 0,
        max_tokens: int = 0
):
    ret = None
    with Path(input_path).open("r", encoding="utf8") as f:
        content = f.read()
        length = len(content)
        if min_tokens and max_tokens and min_tokens < length < max_tokens:
            ret = extract_qa_from_llm(
                content,
                url=url,
                count=count
            )
    return ret


def extract_answers_from_text(
        input_dir: str | Path,
        output_dir: str | Path,
        url: str = "http://10.133.95.100:9100/v1",
        token: str = "qwen",
        max_tokens: int = 4096
):
    ret = []
    c = OpenAI(
        api_key=token,
        base_url=url,
    )
    prompt = """
根据下述内容回答问题：
{context}

请问：{question}
"""
    for fn in Path(input_dir, "ques").glob("*.txt"):
        print("-" * 10, fn.name, "-" * 10)
        with (Path(input_dir, "ques", fn.name).open(mode="r", encoding="utf8") as f1, 
              Path(input_dir, "txt", fn.name).open(mode="r", encoding="utf8") as f2):
            context = f2.read()
            ret = []
            for q in f1.readlines():
                q = re.sub("^(\d+\.?\s?)", "", q).replace("\n", " ").strip()
                p = prompt.format(context=context, question=q)
                stream = c.chat.completions.create(
                    model="qwen",
                    messages=[
                        {
                            "role": "user",
                            "content": p,
                        }
                    ],
                    max_tokens=max_tokens,
                    stream=False,
                )
                ans = stream.choices[0].message.content
                print(q)
                print(ans)
                ret.append({
                    "question": q,
                    "answer": ans,
                })
            if ret:
                with Path(output_dir, fn.name).open(mode="w+", encoding="utf8") as f:
                    j = json.dumps(ret, indent=2, ensure_ascii=False)
                    f.write(j)
                

def extract_qa_from_txt_dir(
        input_dir: str, 
        output_dir: str,
        count: int | None = None,
        url: str = "http://10.133.95.100:9100/v1",
        min_tokens: int = 0,
        max_tokens: int = 0,
):
    for i, file in enumerate(os.listdir(input_dir)):
        print("processing file: {}".format(file))
        ret = extract_qa_from_txt(
            input_path=Path(input_dir, file),
            count=count,
            url=url,
            min_tokens=min_tokens,
            max_tokens=max_tokens,
        )
        # with Path(output_dir, "{:0>10}.txt".format(i)).open(mode="w+", encoding="utf8") as f:
        #     f.write(ret)
        if ret:
            print(ret)
            with Path(output_dir, file).open(mode="w+", encoding="utf8") as f:
                f.write(ret)
                

def convert_qa_json_from_root_dir(
        root_dir: str,
        output_path: str,
        sub_dirs: List[str] | None = None,
        question_pattern: str | None = None,
        answer_pattern: str | None = None
):
    content = []
    sub_dirs = sub_dirs or ["qa", ]
    for d in os.listdir(root):
        p = Path(root_dir, d, *sub_dirs)
        for f in os.listdir(p.absolute()):
            with Path(p, f).open(mode="r", encoding="utf8") as f:
                c = f.read()
                qa = convert_text_to_json(
                    c,
                    question_pattern=question_pattern,
                    answer_pattern=answer_pattern,
                )
                content += qa
    if content:
        print(output_path)
        with Path(output_path).open(mode="w+", encoding="utf8") as f:
            j = json.dumps(content, ensure_ascii=False, indent=2)
            f.write(j)
            
            
def merge_txt(
        root_dir: str | Path,
        output_path: str | Path,
):
    output = []
    root_dir = Path(root_dir)
    output_path = Path(output_path)
    for fn in root_dir.iterdir():
        p = root_dir / fn.name / "doc"
        for fn in p.iterdir():
            with Path(fn).open(mode="r", encoding="utf8") as f:
                c = f.read()
                j = json.loads(c)
                output.extend(j)
    if output:
        with Path(output_path).open(mode="w+", encoding="utf8") as f:
            f.write(json.dumps(output, ensure_ascii=False, indent=2))
        
  

In [9]:
# Flatten every JSON answer file in `data_dir` into one dataset file.
# Each input file is expected to hold a list of objects with "question"
# and "answer" keys.
data_dir = Path("D:/Desktop/1")
output = []
for qa_file in data_dir.iterdir():
    records = json.loads(qa_file.read_text(encoding="utf8"))
    # An alternative "chatml" record layout (system / user / assistant
    # messages plus a "source" field) was tried here earlier; the flat
    # input/question/answer layout below is the one in use.
    output.extend(
        {
            "input": "",
            "question": record["question"],
            "answer": record["answer"],
        }
        for record in records
    )

if output:
    serialized = json.dumps(output, ensure_ascii=False, indent=2)
    Path("D:/Desktop/dataset.json").write_text(serialized, encoding="utf8")

In [7]:
# Driver cell for the dataset pipeline. Only the `merge_txt` call is active;
# the other stages are kept commented out and are enabled by hand as needed.
# `root` is the dataset folder that holds one subfolder per topic, and `name`
# is the topic the commented-out stages operate on.
root = "D:/Desktop/dataset/000000"
name = "1001-个人所得税"

# Stage 1: split the source documents in <topic>/doc into text chunks in
# <topic>/txt.
# convert_doc_to_text(
#     input_dir="D:/Desktop/dataset/000000/" + name + "/doc",
#     output_dir="D:/Desktop/dataset/000000/" + name + "/txt",
#     split_size=10000,
#     # split_pattern=r"第[一二三四五六七八九十百零]+章"
#     # split_pattern=r"第[一二三四五六七八九十百零]+条"
#     # split_pattern=r"\d+[\u4e00-\u9fa5]+(\r?\n)+", 
# )
# 
# Stage 2: generate questions for every chunk into <topic>/ques.
# extract_question_from_text(
#         input_dir="D:/Desktop/dataset/000000/" + name + "/txt",
#         output_dir="D:/Desktop/dataset/000000/" + name + "/ques",
# )

# Stage 3: answer the questions, writing JSON files into <topic>/qa.
# extract_answers_from_text(
#     input_dir=Path("D:/Desktop/dataset/000000",  name),
#     output_dir=Path("D:/Desktop/dataset/000000/", name, "qa"),
# )

# Stage 4 (active): merge the per-topic JSON files under `root` into one file.
merge_txt(
    root_dir=root,
    output_path="D:/Desktop/data.json"
)

# Alternative flow: extract question/answer text in a single model call per
# chunk, then parse that text into JSON.
# extract_qa_from_txt_dir(
#     input_dir=str(Path(root, name, "txt").absolute()),
#     output_dir=str(Path(root, name, "qa").absolute()),
#     # count=30,
#     url="http://10.133.95.100:9100/v1",
#     min_tokens=20,
#     max_tokens=20000,
# )
# 
# convert_qa_to_json(
#     input_dir="D:/Desktop/3",
#     output_path="D:/Desktop/json.json",
# )

# convert_qa_json_from_root_dir(
#     root_dir=root,
#     output_path="D:/Desktop/json.json"
# )

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xb0 in position 14: invalid start byte