In [None]:
!pip install langchain_community langchain langchain_openai PyPDF2 pypdf chromadb

In [None]:
import os
from io import BytesIO
from pathlib import Path
from time import time
from typing import List
from uuid import uuid4

import chromadb
import pandas as pd
import PyPDF2
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from openai import OpenAI
from tqdm import tqdm


In [None]:
# Path of the PDF to index (placeholder; load_documents() requires a path ending in ".pdf").
FILE_PATH = "pdf_file_path"
# Directory for the persisted Chroma database (placeholder).
CHROMA_PATH = "chromadb_path"
# Never store API keys in the notebook: read the key from the environment.
# Any key that was previously hard-coded here should be treated as leaked and revoked.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    print("WARNING: OPENAI_API_KEY is not set; export it before running the cells that call OpenAI.")

In [None]:
def load_documents(file_path: str) -> List[str]:
    """
    Load a local PDF with PyPDF2 and return the extracted text of each page.

    Args:
        file_path (str): Path to the local PDF document (e.g., 'path/to/file.pdf').
            ``pathlib.Path`` objects are accepted as well.

    Returns:
        List[str]: One string per page, in page order. A page for which the
            extractor yields nothing is returned as an empty string.

    Raises:
        ValueError: If the path does not end with '.pdf' (case-insensitive).
        Exception: Any error raised while opening or parsing the file is
            reported on stdout and then re-raised unchanged.
    """
    # Normalise to str so that Path objects also work with the suffix check.
    path_str = str(file_path)
    if not path_str.lower().endswith('.pdf'):
        raise ValueError("Invalid file path. Must be a PDF file.")

    try:
        # Read the whole file into memory so the handle is closed before parsing.
        with open(path_str, 'rb') as file:
            pdf_content = BytesIO(file.read())

        # Initialize the PDF reader
        reader = PyPDF2.PdfReader(pdf_content)
        print(f"PDF loaded from path: {file_path}")

        # Coerce a falsy/None extraction result to "" so every entry is a str
        # and downstream string processing cannot fail on it.
        all_pages = [page.extract_text() or "" for page in reader.pages]

        print(f"Document with {len(all_pages)} pages processed.")
        return all_pages

    except Exception as e:
        print(f"Error loading PDF from local file: {e}")
        raise


In [None]:
import re
from typing import List

def preprocess_text(text: str) -> str:
    """
    Normalise the whitespace of a piece of text.

    Every run of whitespace characters (spaces, tabs, newlines, ...) is
    replaced by a single space, and leading/trailing whitespace is removed.

    Args:
        text (str): Raw text to clean.

    Returns:
        str: The text with whitespace collapsed and trimmed.
    """
    # Collapse first, then trim the single space that may remain at either end.
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

def split_text(all_pages: List[str], chunk_size: int = 750, overlap: int = 100) -> List[str]:
    """
    Split text pages into overlapping character chunks after whitespace cleanup.

    Each page is cleaned with ``preprocess_text`` and chunked independently, so
    no chunk spans two pages. Consecutive chunks of one page share ``overlap``
    characters. Pages that are empty (or ``None``) after cleaning contribute no
    chunks.

    Args:
        all_pages (list[str]): A list of text pages.
        chunk_size (int, optional): Maximum size of each chunk in characters.
            Defaults to 750.
        overlap (int, optional): Number of characters shared by consecutive
            chunks. Defaults to 100.

    Returns:
        list[str]: The chunks of all pages, in page order.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer.")
    # With overlap >= chunk_size the window would never advance (endless loop).
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size.")

    step = chunk_size - overlap
    all_chunks = []
    for page_text in all_pages:
        # Treat a missing page text as empty instead of failing in the regex.
        clean_text = preprocess_text(page_text or "")
        text_len = len(clean_text)
        start = 0
        while start < text_len:
            end = min(start + chunk_size, text_len)
            all_chunks.append(clean_text[start:end])
            # Stop once the page end is reached; advancing further would only
            # emit a tail chunk that is fully contained in the one just added.
            if end == text_len:
                break
            start += step

    return all_chunks


In [None]:
def create_documents_from_chunks(chunks: List[str], source: str) -> List[Document]:
    """
    Wrap each text chunk in a Document that carries the source as metadata.

    Args:
        chunks (List[str]): Text chunks; one Document is produced per chunk.
        source (str): Value stored under the "source" metadata key of every Document.

    Returns:
        List[Document]: Documents in the same order as ``chunks``, each given a
            freshly generated UUID4 string as its id.
    """
    return [
        Document(
            page_content=text,
            metadata={"source": source},
            id=str(uuid4()),  # unique id per document
        )
        for text in chunks
    ]

def add_documents_to_vector_store(
    chunks: List[str],
    source: str,
    persist_directory: str = "/content/chroma_vectordb_2",
):
    """
    Embed text chunks with OpenAI and store them in a persisted Chroma database.

    Args:
        chunks (List[str]): List of text chunks to be converted into Document instances.
        source (str): The source information for the metadata of each Document.
        persist_directory (str, optional): Directory of the Chroma database.
            Defaults to "/content/chroma_vectordb_2", the location this function
            previously hard-coded.

    Returns:
        Chroma: The vector store the documents were written to.

    Raises:
        ValueError: If ``chunks`` is empty, since there is nothing to index.
    """
    if not chunks:
        raise ValueError("No text chunks to add to the vector store.")

    embeddings = OpenAIEmbeddings(api_key=OPENAI_API_KEY)

    # Create documents from chunks
    documents = create_documents_from_chunks(chunks, source)

    # Store each document under the id it already carries, so the id in the
    # database matches Document.id; generate one only if a document has none.
    ids = [getattr(doc, "id", None) or str(uuid4()) for doc in documents]

    # Create the vector store in the given directory and hand it back to the
    # caller (it used to be reopened and then discarded).
    db = Chroma.from_documents(documents, embeddings, persist_directory=persist_directory, ids=ids)
    print(f"Added {len(documents)} documents to the vector store.")
    return db


In [None]:
def retrieve_and_invoke(
    query_text: str,
    persist_directory: str = "/content/chroma_vectordb_2",
    k: int = 7,
):
    """
    Retrieve top chunks from the vector store and invoke the OpenAI API.

    Args:
        query_text (str): The question to be answered.
        persist_directory (str, optional): Directory of the persisted Chroma
            database to query. Defaults to "/content/chroma_vectordb_2".
        k (int, optional): Number of chunks requested from the retriever.
            Defaults to 7.

    Returns:
        str: The generated response from the OpenAI API (requested as a JSON object).
        str: The context used for generating the response.
    """
    embeddings = OpenAIEmbeddings(api_key=OPENAI_API_KEY)
    vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
    retriever = vectordb.as_retriever(search_kwargs={"k": k})
    results = retriever.invoke(query_text)

    # De-duplicate while keeping the retriever's order; a set would shuffle the
    # chunks and make the prompt differ from run to run.
    unique_texts = list(dict.fromkeys(doc.page_content for doc in results))
    context_text = "\n\n---\n\n".join(unique_texts)

    # Define the prompt template
    PROMPT_TEMPLATE = """
    # CONTEXT
    You are given a question from an audit inspection and the relevant content related to that question. Your task is to analyze the content to summarize the findings, provide a clear conclusion, and deliver a one-word result based on the audit's compliance with the question.
    Question: {question}
    Content: {content}
    
    # OBJECTIVE
    Analyze the provided context (question and related content) and return three outputs:
    
    Findings: Summarize the relevant sections and subsections from the content.
    Conclusion: Provide a full-sentence conclusion based on the findings that directly answers the question.
    Result: Give a one-word result based on the conclusion: Pass, Fail, NA (Not Applicable), or Uncertain.
    
    # STYLE
    Findings: Summarize, rephrase, and clarify the key points from the content.
    Conclusion: Provide a clear, concise sentence that directly answers the question.
    Result: Deliver the result in one word (Pass, Fail, NA, or Uncertain).
    
    # TONE
    Formal, concise, and objective. Provide clear and accurate responses suitable for audit reviewers.
    
    # AUDIENCE
    The target audience is audit file reviewers who will use your analysis to determine the company's compliance with the audit question.
    
    # RESPONSE FORMAT
    Return your analysis in JSON format with the following structure:
    Findings: A summary of relevant sections and subsections.
    Conclusion: A clear sentence summarizing the answer to the question.
    Result: A one-word result (Pass, Fail, NA, or Uncertain).
    """

    prompt = PROMPT_TEMPLATE.format(question=query_text, content=context_text)

    # Use the configured key; the chat client previously received the literal
    # placeholder "YOUR_API_KEY" and could not authenticate.
    client = OpenAI(api_key=OPENAI_API_KEY)
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        temperature=0,
        # Ask the API for a JSON object so the reply can be parsed by the caller.
        response_format={"type": "json_object"},
        messages=[{"role": "system", "content": prompt}],
    )

    response_text = completion.choices[0].message.content
    return response_text, context_text


In [None]:
# Build the vector store: extract the page texts, chunk them, then embed and persist the chunks.
pdf_content = load_documents(FILE_PATH)
chunks = split_text(pdf_content)
# `source` is a required argument; record the PDF path as the provenance of every chunk.
vectordb = add_documents_to_vector_store(chunks, source=FILE_PATH)

In [None]:
import pandas as pd
from tqdm import tqdm
from time import time
import json  # Import the json module

def _parse_model_response(raw) -> dict:
    """Convert the output of retrieve_and_invoke() into the dict encoded in the model's JSON reply."""
    # retrieve_and_invoke() returns (response_text, context_text); a bare string is accepted too.
    response_text = raw[0] if isinstance(raw, (tuple, list)) else raw
    if not isinstance(response_text, str):
        raise ValueError("Model response is not text.")

    cleaned = response_text.strip()
    # The reply may be wrapped in a Markdown code fence (```json ... ```); unwrap it.
    if cleaned.startswith("```"):
        cleaned = cleaned.strip("`").strip()
        if cleaned[:4].lower() == "json":
            cleaned = cleaned[4:]

    parsed = json.loads(cleaned)  # json.JSONDecodeError is a ValueError subclass
    if not isinstance(parsed, dict):
        raise ValueError("Model response is not a JSON object.")
    return parsed


def process_questions_and_save_results(csv_path: str, output_csv_path: str):
    """
    Load questions from a CSV file, process them, and save the generated answers to a new CSV file.

    Every value of the 'Question' column is sent to ``retrieve_and_invoke``. The
    'Findings', 'Conclusion' and 'Result' fields of the JSON reply are written,
    together with the question, as one row of the output CSV. If a reply cannot
    be parsed as a JSON object, the reason is printed and a fallback row with the
    result "Uncertain" is written instead.

    Args:
        csv_path (str): Path to the CSV file containing questions (column 'Question').
        output_csv_path (str): Path to the output CSV file for saving results.
    """
    # Load questions from CSV
    questions_df = pd.read_csv(csv_path)

    # Initialize result list and start timer
    results = []
    start_time = time()

    # Iterate through questions and get answers with tqdm progress bar
    for index, row in tqdm(questions_df.iterrows(), total=questions_df.shape[0], desc="Processing Questions"):
        question = row['Question']
        try:
            # Unpack the (response, context) pair and parse the JSON reply.
            json_output = _parse_model_response(retrieve_and_invoke(question))

            # Append structured data to results
            results.append({
                'Question': question,
                'Findings': json_output.get('Findings', ''),
                'Conclusion': json_output.get('Conclusion', ''),
                'Result': json_output.get('Result', '')
            })
        except ValueError as exc:
            # Report the reason without breaking the progress bar, then record a fallback row.
            tqdm.write(f"Could not parse the answer for question {index}: {exc}")
            results.append({
                'Question': question,
                'Findings': "Error occurred during processing.",
                'Conclusion': "Provided context does not contain the answer.",
                'Result': "Uncertain"
            })

    # End timer
    end_time = time()
    print(f"Time taken: {end_time - start_time:.2f} seconds")

    # Create DataFrame from results and save to CSV; fixing the columns keeps
    # the header row even when there were no questions.
    result_df = pd.DataFrame(results, columns=['Question', 'Findings', 'Conclusion', 'Result'])
    result_df.to_csv(output_csv_path, index=False)

    # Print the result DataFrame
    print(result_df)

# Example usage: answer every question in the input CSV and write the answers to a new CSV.
QUESTIONS_CSV = "/content/questions_conclusions_results.csv"
ANSWERS_CSV = "answers_on_new_qset__Henan_SMETA.csv"
process_questions_and_save_results(QUESTIONS_CSV, ANSWERS_CSV)
