In [None]:
# Core dependencies
pip install PyPDF2  # For PDF processing
pip install pandas  # For data manipulation
pip install openai  # For OpenAI API integration
pip install networkx  # For graph operations
pip install chromadb  # Vector database
pip install langchain  # For document processing and embeddings
pip install scikit-learn  # For machine learning utilities
pip install nltk  # For natural language processing
pip install sentence-transformers  # For text embeddings
pip install tqdm  # For progress bars
pip install langchain-community  # For OpenAI embeddings

pip install "PyPDF2>=3.0.0"
pip install "langchain>=0.1.0"
pip install "chromadb>=0.3.0"
pip install "sentence-transformers>=2.2.0"


In [None]:
import os
import json
import logging
import PyPDF2
import pandas as pd
import openai  # OpenAI API for text completions
import networkx as nx
from typing import List, Dict, Optional, Tuple
from pathlib import Path
import time
from uuid import uuid4
import chromadb
from langchain.vectorstores import Chroma
from langchain.docstore.document import Document
import re
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from collections import defaultdict
from datetime import datetime
from sklearn.cluster import KMeans
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings.openai import OpenAIEmbeddings

import nltk
# sent_tokenize loads the 'punkt' models on older NLTK releases and the
# 'punkt_tab' tables on newer ones. Requesting both keeps tokenization working
# regardless of the installed version; without the right resource
# sent_tokenize raises LookupError and semantic_chunk falls back to returning
# each page as a single chunk.
nltk.download('punkt')
nltk.download('punkt_tab')

# Logging setup: INFO and above is written to 'audit_analysis.log' and echoed
# on the console stream, using a timestamp / level / message layout.
logging.basicConfig(
    handlers=[
        logging.FileHandler('audit_analysis.log'),
        logging.StreamHandler(),
    ],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-level logger shared by every function and class in this file.
logger = logging.getLogger(__name__)

# Constants
# Directory handed to Chroma as persist_directory when the vector store is built.
CHROMA_PATH = "./chroma_vectordb_openAI1207"  # Update to your preferred path
# NOTE(review): MAX_RETRIES, RETRY_DELAY, MAX_CONTEXT_LENGTH and CHUNK_SIZE are
# not referenced anywhere in the visible code — confirm whether they are still
# needed or were meant to be wired into the OpenAI call / chunking logic.
MAX_RETRIES = 3
RETRY_DELAY = 1
MAX_CONTEXT_LENGTH = 15000
CHUNK_SIZE = 1000
# Two chunks get a "semantic_similarity" edge only when their cosine similarity
# is strictly greater than this value.
SIMILARITY_THRESHOLD = 0.7
# Semantic edge weight = cosine similarity * SEMANTIC_WEIGHT.
SEMANTIC_WEIGHT = 0.8
# Fixed weight of the "sequential" edge linking chunk i to chunk i + 1.
SEQUENTIAL_WEIGHT = 0.6

# OpenAI client class and the LangChain embedding wrapper used by the analyzer
from openai import OpenAI
from langchain_community.embeddings import OpenAIEmbeddings

# SECURITY: the API key must never be committed to source control. It is read
# from the OPENAI_API_KEY environment variable instead of being hard-coded.
# NOTE(review): a literal key used to be stored on this line; treat it as
# compromised and revoke/rotate it in the OpenAI dashboard.
import os
api_key = os.environ.get("OPENAI_API_KEY", "").strip()
if not api_key:
    # Fail fast with an actionable message instead of a late authentication
    # error from the first embedding or chat request.
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in your environment "
        "(or notebook secrets) before running this script."
    )

# Write the stripped value back so every OpenAI/LangChain client created later
# picks up exactly the same key.
os.environ["OPENAI_API_KEY"] = api_key  # "OPENAI_API_KEY" is the required name
embeddings = OpenAIEmbeddings()



# Prompt sent as the user message in EnhancedAuditAnalyzer.analyze_query.
# It is filled with str.format(question=..., content=...), which is why the
# literal braces of the JSON skeleton are doubled ({{ and }}).
# NOTE(review): the OBJECTIVE section names the fourth output
# "Confidence_Perc", while the JSON skeleton and _parse_response both use the
# key "confidence" — confirm the wording is intentional.
PROMPT_TEMPLATE = """
# CONTEXT
You are given a question from an audit inspection and the relevant content related to that question. Your task is to analyze the content to summarize the findings, provide a clear conclusion, and deliver a one-word result based on the audit's compliance with the question.
Question: {question}
Content: {content}

# OBJECTIVE
Analyze the provided context (question and related content) and return four outputs:

Findings: Summarize the relevant sections and subsections from the content.
Conclusion: Provide a full-sentence conclusion based on the findings that directly answers the question.
Result: Give a one-word result based on the conclusion: Pass, Fail, NA (Not Applicable), or Uncertain.
Confidence_Perc: Provide a confidence score (out of 100) that reflects the certainty of the conclusion.

# RESPONSE FORMAT
Return your analysis in JSON format with the following structure:
{{
    "findings": "Your summary of findings here",
    "conclusion": "Your conclusion here",
    "result": "Pass/Fail/NA/Uncertain",
    "confidence": number
}}
"""

# NLTK setup, sentence splitting and semantic chunking helpers

def setup_nltk():
    """
    Download the NLTK sentence-tokenizer resources and verify they work.

    Both 'punkt' and 'punkt_tab' are requested. nltk.download() reports a
    failed download through its boolean return value instead of raising, so
    each result is checked and logged. Readiness is then confirmed by actually
    tokenizing a short probe string, which raises LookupError when the
    resource required by the installed NLTK version is missing.

    Returns:
        True when sentence tokenization works after the downloads, False when
        NLTK cannot be imported or the tokenizer is still unusable.
    """
    try:
        import nltk
        from nltk.tokenize import sent_tokenize as probe_tokenize

        for resource in ('punkt', 'punkt_tab'):
            if not nltk.download(resource, quiet=True):
                logger.warning(f"NLTK resource download failed: {resource}")

        # The only reliable readiness check: run the tokenizer once.
        probe_tokenize("Probe sentence one. Probe sentence two.")
        return True
    except Exception as e:
        logger.warning(f"NLTK setup failed: {e}")
        return False

# A sentence boundary is either whitespace that follows '.', '!' or '?', or the
# zero-width gap in "word.Next" where the space after the terminator is missing
# (lower-case letter + terminator directly followed by an upper-case letter).
_SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+|(?<=[a-z][.!?])(?=[A-Z])')


def simple_sentence_split(text: str) -> List[str]:
    """
    Simple sentence splitting fallback method.

    Whitespace is collapsed to single spaces and the text is cut at sentence
    boundaries. Terminators stay attached to their sentence and the text is
    otherwise left untouched, so decimals such as "3.14" are not broken apart.

    Args:
        text: Raw text to split.

    Returns:
        The sentences in order. The list is never empty: when no sentence can
        be extracted (including empty input) it holds the whitespace-normalized
        text as its only element.
    """
    normalized = ' '.join(text.split())  # Normalize spaces
    sentences = [part for part in _SENTENCE_BOUNDARY.split(normalized) if part]
    return sentences or [normalized]

def semantic_chunk(text: str, sentence_model, max_chunk_size: int = 750, min_chunk_size: int = 200) -> List[str]:
    """
    Split text into chunks by clustering its sentences on their embeddings.

    The text is sentence-tokenized, each sentence is embedded with
    sentence_model.encode, and the embeddings are clustered with KMeans.
    All sentences of a cluster are joined (in their original order) into one
    chunk, so a chunk may combine sentences that are not adjacent in the text.

    Args:
        text: Text to chunk (one PDF page in this file).
        sentence_model: Object exposing encode(list_of_sentences).
        max_chunk_size: Target chunk length in characters. It only determines
            the number of clusters (total length // max_chunk_size); individual
            chunks can still be longer.
        min_chunk_size: Accepted for interface compatibility; currently not
            enforced by this function.

    Returns:
        The list of chunks. If anything fails, the whole text is returned as a
        single chunk (or an empty list for blank text).
    """
    try:
        # Remove empty sentences and normalize whitespace
        sentences = [s.strip() for s in sent_tokenize(text) if s.strip()]
        if not sentences:
            return [text] if text.strip() else []

        text_length = sum(len(s) for s in sentences)
        num_clusters = min(max(1, text_length // max_chunk_size), len(sentences))

        # A single cluster always contains every sentence, so the embedding
        # and KMeans passes can be skipped for short texts.
        if num_clusters == 1:
            return [' '.join(sentences)]

        sentence_vectors = sentence_model.encode(sentences)
        kmeans = KMeans(n_clusters=num_clusters, random_state=42)
        labels = kmeans.fit_predict(sentence_vectors)

        # dicts preserve insertion order, so chunks come out in the order in
        # which each cluster first appears in the text.
        grouped_sentences = defaultdict(list)
        for sentence, label in zip(sentences, labels):
            grouped_sentences[label].append(sentence)

        return [' '.join(group) for group in grouped_sentences.values()]

    except Exception as e:
        logger.error(f"Error in semantic chunking: {e}")
        return [text] if text.strip() else []

class DocumentProcessor:
    """Reads the text of a PDF and turns its pages into semantic chunks."""

    def load_pdf(self, pdf_path: str) -> List[str]:
        """
        Extract the text of every page of a PDF.

        Args:
            pdf_path: Path of the PDF file to read.

        Returns:
            One string per page that yielded text; pages without extractable
            text are skipped. Any failure (missing file, unreadable PDF, ...)
            is logged and results in an empty list.
        """
        try:
            with open(pdf_path, "rb") as file:
                reader = PyPDF2.PdfReader(file)
                pages = []
                for page in reader.pages:
                    # Extract once per page: text extraction is the expensive
                    # step and was previously run twice for every page.
                    page_text = page.extract_text()
                    if page_text:
                        pages.append(page_text)
            return pages
        except Exception as e:
            logger.error(f"Failed to load PDF {pdf_path}: {e}")
            return []

    def process_pages(self, pages: List[str], sentence_model) -> List[str]:
        """
        Chunk each page with semantic_chunk and concatenate the results.

        Args:
            pages: Page texts, e.g. as returned by load_pdf.
            sentence_model: Sentence-embedding model forwarded to
                semantic_chunk.

        Returns:
            All chunks of all pages, in page order.
        """
        all_chunks = []
        for page in pages:
            all_chunks.extend(semantic_chunk(page, sentence_model))
        return all_chunks

# KnowledgeNode and KnowledgeGraph Classes
class KnowledgeNode:
    """Plain record for one graph node: its id, its text and its embedding."""

    def __init__(self, node_id, text, embedding):
        # The three values are stored as given; nothing is copied or validated.
        self.node_id, self.text, self.embedding = node_id, text, embedding

class KnowledgeGraph:
    """
    Thin wrapper around an undirected networkx Graph of KnowledgeNode data.

    Nodes carry "text" and "embedding" attributes; edges carry "relation" and
    "weight" attributes.
    """

    def __init__(self):
        self.graph = nx.Graph()

    def add_node(self, node: KnowledgeNode):
        """Insert (or update) the graph node for the given KnowledgeNode."""
        self.graph.add_node(node.node_id, text=node.text, embedding=node.embedding)

    def add_edge(self, node1_id, node2_id, relation, weight):
        """
        Connect two nodes.

        The graph is undirected and holds a single edge per node pair, so
        adding an edge between an already connected pair overwrites the stored
        relation and weight.
        """
        self.graph.add_edge(node1_id, node2_id, relation=relation, weight=weight)

    def get_statistics(self):
        """Return the node and edge counts as a dict."""
        return {
            "total_nodes": self.graph.number_of_nodes(),
            "total_edges": self.graph.number_of_edges()
        }

    def get_subgraph(self, node_ids, depth=1):
        """
        Return the subgraph around the given nodes.

        Args:
            node_ids: Iterable of seed node ids. Ids that are not in the graph
                are ignored, mirroring get_node_text, instead of raising.
            depth: Maximum number of hops from a seed node to include.

        Returns:
            A networkx subgraph containing the known seed nodes and every node
            within depth hops of one of them.
        """
        # Materialize once so generators work and unknown ids are filtered out
        # before the shortest-path search, which raises for missing sources.
        known_ids = [node_id for node_id in node_ids if node_id in self.graph]
        subgraph_nodes = set(known_ids)
        for node_id in known_ids:
            reachable = nx.single_source_shortest_path_length(self.graph, node_id, cutoff=depth)
            subgraph_nodes.update(reachable)
        return self.graph.subgraph(subgraph_nodes)

    def get_node_text(self, node_id):
        """Return the text stored on a node, or None if the node is unknown."""
        return self.graph.nodes[node_id]["text"] if node_id in self.graph.nodes else None






class EnhancedAuditAnalyzer:
    """
    Answers audit questions against a single chunked document.

    The chunks are embedded, stored in a Chroma vector store and mirrored in a
    KnowledgeGraph. A question is answered by retrieving the most similar
    chunks and asking an OpenAI chat model for a structured JSON verdict.
    """

    # Accepted spellings of the verdict (compared case-insensitively) mapped
    # to the canonical values used in the output.
    _RESULT_ALIASES = {
        "pass": "Pass",
        "fail": "Fail",
        "na": "NA",
        "n/a": "NA",
        "uncertain": "Uncertain",
    }

    def __init__(self):
        self.vector_store = None
        self.knowledge_graph = KnowledgeGraph()
        self.document_processor = DocumentProcessor()
        self.embeddings = embeddings  # Store the embeddings instance

    @staticmethod
    def _fallback_result(findings: str, conclusion: str = "Uncertain") -> Dict:
        """Build the zero-confidence 'Uncertain' answer used on every failure path."""
        return {
            "findings": findings,
            "conclusion": conclusion,
            "result": "Uncertain",
            "confidence": 0
        }

    def setup_environment(self):
        """
        Export the module-level api_key as OPENAI_API_KEY.

        Returns:
            True on success.

        Raises:
            Exception: whatever the environment update raised, after logging.
        """
        try:
            # Set up the OpenAI API key
            os.environ["OPENAI_API_KEY"] = api_key
            logger.info("Environment setup completed successfully")
            return True
        except Exception as e:
            logger.error(f"Environment setup failed: {e}")
            raise

    def process_document(self, pdf_path: str, sentence_model) -> List[str]:
        """
        Load a PDF and split it into semantic chunks.

        Returns:
            The chunks, or an empty list when no page text could be read.
        """
        pages = self.document_processor.load_pdf(pdf_path)
        if not pages:
            return []
        return self.document_processor.process_pages(pages, sentence_model)

    def build_knowledge_base(self, chunks: List[str], source: str):
        """
        Embed the chunks and populate the vector store and the knowledge graph.

        Graph node ids are the chunk indices, so calling this method again
        overwrites the nodes of an earlier call that share an index.

        Args:
            chunks: Text chunks of one document.
            source: Label stored in each document's metadata (the PDF path in
                this file).

        Raises:
            Exception: any embedding, vector-store or graph error, after
                logging.
        """
        documents = []
        embeddings_list = []

        try:
            # One batched embedding call instead of one request per chunk.
            vectors = self.embeddings.embed_documents(list(chunks)) if chunks else []
            if len(vectors) != len(chunks):
                raise ValueError(
                    f"Expected {len(chunks)} embeddings, received {len(vectors)}"
                )

            for i, (chunk, embedding) in enumerate(zip(chunks, vectors)):
                embeddings_list.append((chunk, embedding))
                documents.append(
                    Document(page_content=chunk, metadata={"source": source, "chunk_index": i})
                )
                self.knowledge_graph.add_node(KnowledgeNode(i, chunk, embedding))

            if documents:
                self.vector_store = Chroma.from_documents(
                    documents,
                    embedding=self.embeddings,
                    persist_directory=CHROMA_PATH
                )
            else:
                logger.error("No valid documents were created, skipping vector store creation.")

            self._build_graph_edges_with_precomputed_embeddings(embeddings_list)

        except Exception as e:
            logger.error(f"Error building knowledge base: {e}")
            raise

    def _build_graph_edges_with_precomputed_embeddings(self, embeddings: List[Tuple[str, np.ndarray]]):
        """
        Connect chunk nodes by semantic similarity and by document order.

        For every pair i < j a "semantic_similarity" edge is added when the
        cosine similarity exceeds SIMILARITY_THRESHOLD, and every adjacent pair
        (j == i + 1) gets a "sequential" edge. The sequential edge is added
        last, so for adjacent chunks it replaces a semantic edge on the same
        pair.

        Similarities are computed one row at a time (chunk i against all later
        chunks), which needs one library call per chunk instead of one per
        pair. A row that cannot be computed is logged and skipped.

        Args:
            embeddings: (chunk_text, embedding) tuples in chunk order.
        """
        try:
            vectors = [np.asarray(vector, dtype=float) for _, vector in embeddings]
            for i in range(len(vectors) - 1):
                try:
                    row = cosine_similarity(
                        vectors[i].reshape(1, -1), np.vstack(vectors[i + 1:])
                    )[0]
                except Exception as e:
                    logger.error(f"Error building edges for chunk {i}: {e}")
                    continue

                for j, similarity in enumerate(row, start=i + 1):
                    if similarity > SIMILARITY_THRESHOLD:
                        self.knowledge_graph.add_edge(
                            i, j, "semantic_similarity",
                            float(similarity) * SEMANTIC_WEIGHT
                        )
                    if j == i + 1:
                        self.knowledge_graph.add_edge(
                            i, j, "sequential", SEQUENTIAL_WEIGHT
                        )
            logger.info("Graph edges successfully built.")
        except Exception as e:
            logger.error(f"Error in _build_graph_edges: {e}")
            raise

    def get_relevant_context(self, query: str, k: int = 3) -> str:
        """
        Retrieve the chunks most similar to the query.

        Args:
            query: Question text.
            k: Number of chunks to retrieve.

        Returns:
            The retrieved chunk texts joined by blank lines, or an empty
            string when no vector store has been built yet.
        """
        if not self.vector_store:
            return ""
        results = self.vector_store.similarity_search(query, k=k)
        return "\n\n".join(doc.page_content for doc in results)

    def analyze_query(self, query: str) -> Dict:
        """
        Analyze a query using the knowledge base.

        Args:
            query: The query string to analyze

        Returns:
            Dict containing findings, conclusion, result, and confidence.
            Failures never raise: they are logged and reported as an
            'Uncertain' result with confidence 0.
        """
        try:
            context = self.get_relevant_context(query)
            if not context:
                return self._fallback_result("No relevant context found")

            # Create OpenAI client instance (reads OPENAI_API_KEY from the environment)
            client = OpenAI()

            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {
                        "role": "system",
                        "content": "You are an audit analysis assistant that provides structured responses in JSON format."
                    },
                    {
                        "role": "user",
                        "content": PROMPT_TEMPLATE.format(question=query, content=context)
                    }
                ],
                temperature=0
            )

            if response and response.choices:
                # The message content can be None; treat that like a missing
                # response instead of failing on .strip().
                content = response.choices[0].message.content
                if content:
                    return self._parse_response(content.strip())

            return self._fallback_result("No response received from OpenAI API")

        except Exception as e:
            logger.error(f"Error in analyze_query: {e}")
            return self._fallback_result(f"Error: {str(e)}")

    def _parse_response(self, response_text: str) -> Dict:
        """
        Extract and validate the JSON answer contained in a model response.

        The first '{' through the last '}' of the text is parsed as JSON. The
        verdict is normalized case-insensitively to Pass / Fail / NA /
        Uncertain (anything else becomes 'Uncertain'), and the confidence is
        converted to a number clamped to the range 0-100 (non-numeric or NaN
        values become 0).

        Args:
            response_text: Raw text returned by the model.

        Returns:
            The parsed dict with normalized 'result' and 'confidence', or a
            zero-confidence 'Uncertain' dict when the text cannot be parsed or
            lacks a required field.
        """
        import math

        try:
            clean_text = response_text.strip()
            json_match = re.search(r'(\{[\s\S]*\})', clean_text)
            if not json_match:
                logger.warning("No JSON found in response")
                return self._fallback_result(clean_text[:500], "Error parsing response")

            result = json.loads(json_match.group(1))

            required_fields = {'findings', 'conclusion', 'result', 'confidence'}
            missing_fields = required_fields - set(result.keys())
            if missing_fields:
                logger.warning(f"Missing required fields in response: {missing_fields}")
                return self._fallback_result(
                    "Missing required fields in response", "Error parsing response"
                )

            raw_result = result['result']
            canonical = None
            if isinstance(raw_result, str):
                canonical = self._RESULT_ALIASES.get(raw_result.strip().casefold())
            if canonical is None:
                logger.warning(f"Invalid result value: {result['result']}")
                canonical = 'Uncertain'
            result['result'] = canonical

            try:
                confidence = float(result['confidence'])
                # min()/max() do not order NaN, so it would slip through the
                # clamp as 100; reject it explicitly.
                if math.isnan(confidence):
                    raise ValueError("confidence is NaN")
                result['confidence'] = max(0, min(100, confidence))
            except (ValueError, TypeError):
                logger.warning("Invalid confidence value")
                result['confidence'] = 0
            return result
        except Exception as e:
            logger.error(f"Error parsing response: {e}")
            return self._fallback_result(response_text[:500], "Error parsing response")


class AuditAnalysisRunner:
    """
    Drives one audit run: build the knowledge base from a PDF, answer every
    question of a CSV question bank and write the answers to a CSV file.
    """

    def __init__(self, pdf_path: str, questions_path: str, output_path: str = None):
        """
        Remember the input/output locations; nothing is loaded yet.

        When no output path is supplied, a timestamped
        'audit_results_<YYYYmmdd_HHMMSS>.csv' name is generated.
        """
        if not output_path:
            output_path = f"audit_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"

        self.pdf_path = pdf_path
        self.questions_path = questions_path
        self.output_path = output_path
        # Both are created by setup().
        self.analyzer = None
        self.sentence_model = None

    def setup(self):
        """Instantiate the sentence-embedding model, then the analyzer."""
        self.sentence_model = SentenceTransformer('multi-qa-mpnet-base-cos-v1')
        self.analyzer = EnhancedAuditAnalyzer()

    def load_questions(self) -> List[Dict]:
        """Read the question CSV and return one dict per row."""
        return pd.read_csv(self.questions_path).to_dict('records')

    def process_document(self):
        """Chunk the PDF and build the analyzer's knowledge base from it."""
        document_chunks = self.analyzer.process_document(self.pdf_path, self.sentence_model)
        self.analyzer.build_knowledge_base(document_chunks, self.pdf_path)

    def run_analysis(self):
        """
        Analyze every question and save the answers to self.output_path.

        Each row of the question bank must provide a 'Question' column.
        """
        rows = []
        for entry in tqdm(self.load_questions(), desc="Analyzing questions"):
            outcome = self.analyzer.analyze_query(entry['Question'])
            rows.append({
                'Question': entry['Question'],
                'Findings': outcome['findings'],
                'Conclusion': outcome['conclusion'],
                'Result': outcome['result'],
                'Confidence': outcome['confidence'],
            })
        pd.DataFrame(rows).to_csv(self.output_path, index=False)

    def get_summary_statistics(self):
        """
        Summarize the results file written by run_analysis.

        Returns:
            Dict with the number of questions, the count of each 'Result'
            value and the mean of the 'Confidence' column.
        """
        results = pd.read_csv(self.output_path)
        distribution = results['Result'].value_counts().to_dict()
        mean_confidence = results['Confidence'].mean()
        return {
            'total_questions': len(results),
            'results_distribution': distribution,
            'average_confidence': mean_confidence,
        }


# Main execution function
def main(
    pdf_path: str = "/content/Hong Yan textiles SA  2023.09.27 - QIMA Ethical Audit Report (RCloud) (1).pdf",
    questions_path: str = "/content/Question_Bank_New (1).csv",
    output_path: str = "audit_results_openai.csv",
):
    """
    Run the complete audit analysis and print summary statistics as JSON.

    The defaults reproduce the original hard-coded locations, so calling
    main() without arguments behaves as before; pass other paths to analyze a
    different report or question bank.

    Args:
        pdf_path: Audit report PDF to analyze.
        questions_path: CSV question bank with a 'Question' column.
        output_path: CSV file the per-question results are written to.
    """
    runner = AuditAnalysisRunner(pdf_path, questions_path, output_path)
    runner.setup()
    runner.process_document()
    runner.run_analysis()

    stats = runner.get_summary_statistics()
    print(json.dumps(stats, indent=2))

if __name__ == "__main__":
    main()