In [1]:
import os
import json
import pdfplumber
from openai import OpenAI
from ragas.testset.docstore import Document
from ragas.testset.generator import TestsetGenerator
from ragas.llms.prompt import Prompt
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas.llms.base import LangchainLLMWrapper
import time

# Define the folder path where the PDF files are located.
# Left blank here; it is the directory the processing loop below lists with
# os.listdir(), so it must be filled in before running.
pdf_folder_path = ""

# Raw OpenAI client, used further down to expand each generated answer through
# the chat-completions endpoint. base_url and api_key are blank placeholders.
client = OpenAI(
    base_url="",
    api_key=""
)

# Initialize the generator and embedding model.
# These are the three LangChain objects handed to TestsetGenerator.from_langchain
# below, in the order (generator LLM, critic LLM, embeddings). Each one carries
# its own api_key/base_url placeholder.
generator_llm = ChatOpenAI(model="gpt-4o", api_key="", base_url="")
critic_llm = ChatOpenAI(model="gpt-4o", api_key="", base_url="")
embeddings = OpenAIEmbeddings(model="text-embedding-3-large", api_key="", base_url="")

# Wrap the model
# NOTE(review): llm_wrapper is not referenced anywhere else in the visible code.
llm_wrapper = LangchainLLMWrapper(generator_llm)

# ragas test-set generator; the processing loop calls
# generator.generate_with_langchain_docs(...) once per PDF.
generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    embeddings
)

# Define the prompt for question generation.
# Each example maps an "answer" and a "context" (the declared input_keys) to an
# "output" object holding a single "question" string (the declared output_key).
# NOTE(review): in the visible code qa_prompt is never passed to the generator;
# generate_with_langchain_docs is called without it, so this prompt appears to be
# unused unless another cell consumes it.
qa_prompt = Prompt(
    name="question_generation",
    instruction="Please read the following text about anaerobic digestion and extract any relevant information related to the anaerobic digestion process, including but not limited to organic material resources and raw material sources, pre-treatment methods, preparation equipment, modification methods, or key parameters such as treatment temperature, heating rate, and residence time. Additionally, collect any detailed information on biogas yield, methane content, and the composition and properties of the digestate, including nutrient content, residual volatile solids after digestion, pH, and particle size. Based on the extracted information, context, and relevant knowledge, generate a question related to either the anaerobic digestion process or biogas yield and methane content.",
    examples=[
        {
            "answer": "The pretreatment agent significantly affects methane production during anaerobic digestion. For instance, NaOH pretreatment can enhance methane production efficiency by disrupting lignocellulosic structures, thereby increasing the availability of degradable substrates. However, if the concentration of NaOH is too high, it may lead to the accumulation of inhibitory substances like sodium ions, which can hinder microbial activity. Additionally, lignin content in the feedstock plays a crucial role, with higher lignin levels generally leading to lower methane yields due to lignin's resistance to biodegradation.",
            "context": "The study demonstrated that different chemical pretreatments, such as using NaOH, KOH, and alkaline hydrogen peroxide (AHP), have varying impacts on the methane yield from lignocellulosic waste. It was observed that lignin content above 15% in the raw material significantly reduces methane production, even after chemical pretreatment. The pretreatment conditions, including the concentration of chemicals and the digestion time, were identified as key factors influencing the methane yield.",
            "output": {"question": "How do different chemical pretreatments and lignin content in lignocellulosic feedstock affect methane production in anaerobic digestion, and what are the optimal conditions for maximizing methane yield?"}
        },
        {
            "answer": "Digestion time (DT) is a critical factor affecting methane yield in anaerobic digestion. Studies have shown that extending digestion time enhances methane production, especially during the first 18 days when microbial metabolism and the conversion of organic matter are most active. However, beyond 18 days, the rate of increase in methane yield tends to slow down, indicating that methane production reaches saturation at this stage. Additionally, the choice of pretreatment agents, such as NaOH, KOH, and AHP, significantly impacts methane yield, with NaOH and AHP showing better performance in enhancing methane yield with extended digestion time.",
            "context": "Research indicates that digestion time and the choice of pretreatment agents are crucial for optimizing methane yield in anaerobic digestion. Experimental data analysis revealed that methane yield increases significantly up to 18 days of digestion time, after which the increase rate levels off. Different pretreatment agents vary in effectiveness, with NaOH and AHP showing significant improvement in methane yield during extended digestion times.",
            "output": {"question": "How does digestion time affect methane yield in anaerobic digestion? How can the use of different pretreatment agents (such as NaOH, KOH, and AHP) be combined with digestion time to optimize methane production?"}
        }
    ],
    input_keys=["answer", "context"],
    output_key="output",
    output_type="json"
)

# All artefacts are written next to wherever the script is launched from.
current_folder_path = os.getcwd()

# Derive the three sibling files in one go:
#   output_json_path   - accumulated Q&A pairs
#   progress_file_path - name of the last PDF that finished successfully
#   error_log_path     - one line per PDF that could not be processed
output_json_path, progress_file_path, error_log_path = (
    os.path.join(current_folder_path, basename)
    for basename in ("all_qa_pairs.json", "progress.txt", "error_log.txt")
)

# Upper bound on attempts per PDF before it is logged and skipped
MAX_RETRIES = 3

def save_progress(filename):
    """Overwrite the progress file so that it contains only *filename*."""
    progress_file = open(progress_file_path, 'w')
    with progress_file:
        progress_file.write(filename)

def load_progress():
    """Return the stripped contents of the progress file, or None if it does not exist."""
    if not os.path.exists(progress_file_path):
        return None

    with open(progress_file_path, 'r') as progress_file:
        recorded_name = progress_file.read()
    return recorded_name.strip()

def log_error(filename, error):
    """Append a one-line failure record for *filename* to the error log."""
    with open(error_log_path, 'a') as log_file:
        entry = f"Failed to process {filename}: {error}\n"
        log_file.write(entry)

def append_to_json(file_path, new_data):
    """Append one record to the JSON array stored in *file_path*.

    The file is created when it does not exist. An existing file that is empty
    or does not parse as JSON is replaced by a fresh one-element array;
    previously the new array was written *after* the unparseable content, which
    left the file permanently invalid.

    The updated array is serialised in memory before the file is opened for
    writing, so a non-serialisable *new_data* raises without touching what is
    already on disk.

    Args:
        file_path: Path of a UTF-8 JSON file whose top-level value is a list.
        new_data: JSON-serialisable object to add as the last element.

    Raises:
        AttributeError: If the file holds valid JSON whose top-level value has
            no ``append`` method (e.g. an object); the file is left unchanged.
        TypeError: If *new_data* cannot be serialised to JSON; the file is
            left unchanged.
    """
    records = []
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            try:
                records = json.load(f)
            except json.JSONDecodeError:
                # Empty or corrupt file: start over with a clean array.
                records = []

    records.append(new_data)

    # Serialise first so a failure here cannot truncate the existing file.
    payload = json.dumps(records, ensure_ascii=False, indent=4)
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(payload)

# Function to extract content between Introduction and References
def extract_relevant_content(text):
    """Return the part of *text* between the introduction and the references.

    Keywords are tried in list order and the first one that occurs wins. The
    end marker is searched for from the start marker onward, so an earlier
    mention of "References" (e.g. in front matter) no longer hides the real
    reference heading and causes the whole reference list to be kept.

    Args:
        text: Full text of a document.

    Returns:
        ``text[start:end]`` when both markers are found, ``text[start:]`` when
        only a start marker is found, ``text[:end]`` when only an end marker
        is found, and ``None`` when neither is present.
    """
    start_keywords = ("Introduction", "INTRODUCTION")
    end_keywords = ("References", "REFERENCES", "Bibliography")

    # Find the start position of the Introduction
    start_index = -1
    for keyword in start_keywords:
        start_index = text.find(keyword)
        if start_index != -1:
            break

    # Find the end position of the References, looking only past the start
    # marker when there is one
    search_from = start_index if start_index != -1 else 0
    end_index = -1
    for keyword in end_keywords:
        end_index = text.find(keyword, search_from)
        if end_index != -1:
            break

    # Extract the relevant content
    if start_index != -1 and end_index != -1:
        return text[start_index:end_index]
    if start_index != -1:
        return text[start_index:]
    if end_index != -1:
        return text[:end_index]
    return None  # No marker of either kind was found

# Load the progress of the last processed file
last_processed_file = load_progress()
start_processing = not last_processed_file

# os.listdir() returns names in arbitrary order, so sort them: the resume point
# is only meaningful if every run walks the files in the same order.
# NOTE: a progress file written by a run that used a different order may not
# line up with this one.
pdf_filenames = sorted(
    name for name in os.listdir(pdf_folder_path) if name.endswith(".pdf")
)

# Iterate over all PDF files in the folder
for filename in pdf_filenames:
    # Skip everything up to and including the file recorded as done. Comparing
    # names (rather than waiting for an exact match) still resumes correctly
    # when the recorded file has since been removed from the folder.
    if not start_processing:
        if filename <= last_processed_file:
            continue
        start_processing = True

    pdf_path = os.path.join(pdf_folder_path, filename)
    retries = 0

    while retries < MAX_RETRIES:
        try:
            # Extract text from the PDF, calling extract_text() once per page.
            with pdfplumber.open(pdf_path) as pdf:
                page_texts = [page.extract_text() for page in pdf.pages]
            # Join with a newline so the last word of one page is not fused
            # with the first word of the next.
            text = "\n".join(page_text for page_text in page_texts if page_text)

            # Extract content between Introduction and References
            relevant_text = extract_relevant_content(text)

            if not relevant_text:
                # Nothing usable will appear on a retry; record the file and
                # move on instead of spinning in the retry loop forever.
                print(f"No relevant content found in {filename}, skipping this file.")
                log_error(filename, "no relevant content extracted")
                break

            # Convert the extracted text into a Document object
            document = Document(
                page_content=relevant_text,
                metadata={"filename": filename}
            )

            # Generate the test set using Langchain's model without passing the Prompt directly
            testset = generator.generate_with_langchain_docs(
                [document],
                test_size=10
            )

            # Build every Q&A pair in memory first; writing only after all of
            # them succeed keeps a retry from duplicating pairs on disk.
            new_entries = []
            for qa in testset.to_pandas().itertuples():
                # Call OpenAI API to expand the answer
                completion = client.chat.completions.create(
                    model="gpt-4o",
                    messages=[
                        {"role": "system", "content": "You're an expert on anaerobic digestion."},
                        {"role": "user", "content": f"Based on the given answer and your knowledge, provide a more comprehensive answer.\n\nAnswer: {qa.ground_truth}"}
                    ],
                    max_tokens=4096
                )
                full_answer = completion.choices[0].message.content

                new_entries.append({
                    "instruction": qa.question,
                    "input": "",
                    "output": full_answer
                })

            # Save the generated Q&A pairs to the JSON file after processing each PDF
            for new_entry in new_entries:
                append_to_json(output_json_path, new_entry)

            # Save the progress after processing each PDF
            save_progress(filename)
            break  # Exit retry loop after successful processing

        except Exception as e:
            # Boundary handler: any failure for this PDF counts as one attempt.
            retries += 1
            print(f"Error processing file {filename}: {str(e)}. Retry attempt {retries}.")
            if retries >= MAX_RETRIES:
                print(f"File {filename} reached maximum retry attempts, skipping this file.")
                log_error(filename, str(e))
                break
            time.sleep(10)  # Wait for 10 seconds before retrying

print(f"All Q&A pairs have been saved to {output_json_path}")

                       


embedding nodes:   0%|          | 0/18 [00:00<?, ?it/s]

Generating:   0%|          | 0/10 [00:00<?, ?it/s]

embedding nodes:   0%|          | 0/24 [00:00<?, ?it/s]

Generating:   0%|          | 0/10 [00:00<?, ?it/s]

embedding nodes:   0%|          | 0/32 [00:00<?, ?it/s]

Generating:   0%|          | 0/10 [00:00<?, ?it/s]

所有问答对已保存为 c:\Users\yu429\Desktop\all_qa_pairs.json
