## Document Parsing

In [None]:
from langchain_core.documents import Document
from docx import Document

def read_docx(file_path):
    """Return the paragraph text of a .docx file as one newline-joined string.

    Args:
        file_path: Path of the Word document to open.

    Returns:
        The ``text`` of every entry in ``doc.paragraphs``, joined with a
        newline character.
        NOTE(review): content outside ``doc.paragraphs`` (e.g. tables) is
        presumably not included -- confirm against python-docx.
    """
    # Imported locally under an alias: this notebook binds the bare name
    # `Document` to both python-docx's loader and LangChain's document class,
    # and whichever import ran last would otherwise decide what gets called.
    from docx import Document as DocxDocument

    doc = DocxDocument(file_path)
    return "\n".join(para.text for para in doc.paragraphs)

# Example usage: locate the three source documents and extract their text.
best_practices_file_path = "./data/The CyberGov™ Framework – Optimizing Your Cybersecurity Posture v. 8.0 14 Dec 2023.docx"
board_report_file_path = "./data/Sample board of directors meeting.docx"
board_memo_file_path = "./data/Board Memo 1 March 14.docx"

best_practices_text = read_docx(best_practices_file_path)
board_report_text = read_docx(board_report_file_path)
board_memo_text = read_docx(board_memo_file_path)

# At this point the bare name `Document` refers to python-docx's loader (the
# second import in this cell shadows the LangChain class), and that loader does
# not accept a `metadata` keyword. Import the LangChain class under an explicit
# alias so the wrappers below are built with the intended type.
from langchain_core.documents import Document as LCDocument

# Wrap each extracted text in a LangChain document that records its source path.
best_practices_doc = LCDocument(page_content=best_practices_text, metadata={"source": best_practices_file_path})
board_report_doc = LCDocument(page_content=board_report_text, metadata={"source": board_report_file_path})
board_memo_doc = LCDocument(page_content=board_memo_text, metadata={"source": board_memo_file_path})

In [None]:
from langchain_core.documents import Document

# Wrap each extracted text in a LangChain Document tagged with the path it came from.
best_practices_doc = Document(
    best_practices_text,
    metadata=dict(source=best_practices_file_path),
)
board_report_doc = Document(
    board_report_text,
    metadata=dict(source=board_report_file_path),
)
board_memo_doc = Document(
    board_memo_text,
    metadata=dict(source=board_memo_file_path),
)

## First Attempt at the Baseline LLM Solution

In [None]:
import os
from docx import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document as LCDocument
from langchain.llms import OpenAI
from dotenv import load_dotenv

# Load settings from a .env file into the process environment.
load_dotenv()

# Keep the key in a variable; later cells pass it to clients explicitly.
openai_api_key = os.getenv("OPENAI_API_KEY")

# Cut the board report into chunks (chunk_size=500, chunk_overlap=50).
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=50, chunk_size=500)
chunks = text_splitter.split_text(board_report_text)

# One LangChain document per chunk; no metadata is attached.
docs = [LCDocument(page_content=piece) for piece in chunks]

# Embed the chunks and index them in an in-memory FAISS store.
embeddings = OpenAIEmbeddings()
vector_store = FAISS.from_documents(docs, embeddings)

# Round-trip the index through disk so it can be reloaded without re-embedding.
faiss_index_dir = "faiss_index"
vector_store.save_local(faiss_index_dir)

# NOTE: allow_dangerous_deserialization=True explicitly opts in to what the
# library itself labels dangerous deserialization. It is used here on an index
# this cell has just written; do not point it at files from an untrusted source.
vector_store = FAISS.load_local(
    faiss_index_dir,
    embeddings,
    allow_dangerous_deserialization=True,
)

In [None]:
from openai import OpenAI
from pprint import pprint
from pydantic import BaseModel

# Structured-output schema for the baseline compliance check; it is passed as
# `response_format` to the OpenAI client later in this cell. Field notes are
# kept as comments so the model's own schema is left untouched.
class ComplianceReport(BaseModel):
    status: str  # verdict; the prompt asks for "Pass" or "Fail"
    explanation: str  # reasoning behind the verdict
    corrective_measures: str  # suggested remediation

# OpenAI SDK client; no API key is passed explicitly here.
client = OpenAI()

# The best-practice statement being assessed.
practice = "Practice 4.1: Policies, processes, and procedures for managing cyber breaches internally are established and reviewed at least annually."

# Similarity search over the board-report index: five (document, score) pairs.
retrieved_docs = vector_store.similarity_search_with_score(practice, k=5)

# Number the retrieved chunks and stitch them into one context string
# (the scores are not included in the prompt).
context_sections = []
for position, (chunk_doc, _score) in enumerate(retrieved_docs, start=1):
    context_sections.append(f"Document {position}:\n{chunk_doc.page_content}")
context = "\n\n".join(context_sections)

# Full user prompt: instructions, the practice, and the retrieved context.
final_prompt = f"""
You are an expert compliance analyst tasked with evaluating the compliance status of the best practice based on the provided context. 
The context consists of relevant remarks from board members. Clearly state the status in your response as "Pass" or "Fail" at the top.

### Best Practice:
{practice}

### Context:
{context}

### Question:
Based on the context, does the organization comply with this best practice? Provide reasoning if it doesn't and corrective measures. Your description should be easy to comprehend. If you don't find any relevant information, you can state that as well.

### Answer:
"""

chat_messages = [
    {"role": "system", "content": "You are an expert compliance analyst."},
    {"role": "user", "content": final_prompt},
]

# Request a reply constrained to the ComplianceReport schema.
completion = client.beta.chat.completions.parse(
    model="gpt-4o-mini",
    messages=chat_messages,
    response_format=ComplianceReport,
)

# Show the message content of the first choice.
first_choice = completion.choices[0]
pprint(first_choice.message.content)

## Neo4j Graph Over Best Practices

In [None]:
from langchain_neo4j import Neo4jGraph

import os

# Connect to Neo4j. Each setting can be overridden through the environment
# (NEO4J_URI / NEO4J_USERNAME / NEO4J_PASSWORD) so real credentials never have
# to be written into the notebook; the fallbacks are the local development
# values this notebook has always used.
graph = Neo4jGraph(
    url=os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
    username=os.environ.get("NEO4J_USERNAME", "neo4j"),
    password=os.environ.get("NEO4J_PASSWORD", "password"),
)

In [None]:
import re

def parse_best_practices(text):
    """Group "Practice X.Y: ..." lines under their "Principle X.Y: ..." line.

    The text is scanned line by line after trimming surrounding whitespace.
    A principle line opens a new group; each following practice line is
    attached to the most recently opened group. Practice lines that appear
    before the first principle, and lines matching neither pattern, are
    ignored.

    Args:
        text: Framework text with one principle or practice per line.

    Returns:
        A list of dicts ``{"id", "name", "practices"}`` in document order,
        where ``practices`` is a list of ``{"id", "description"}`` dicts.
    """
    principle_re = re.compile(r"^Principle (\d+\.\d+): (.+)")
    practice_re = re.compile(r"^Practice (\d+\.\d+): (.+)")

    parsed = []
    active = None  # the principle currently collecting practices, if any

    for raw_line in text.strip().split("\n"):
        stripped = raw_line.strip()

        heading = principle_re.match(stripped)
        if heading is not None:
            # A new principle closes the previous one.
            if active:
                parsed.append(active)
            number, title = heading.groups()
            active = {"id": number, "name": title, "practices": []}
            continue

        item = practice_re.match(stripped)
        if item is not None and active:
            number, description = item.groups()
            active["practices"].append({"id": number, "description": description})

    # Flush the principle still open at end of input.
    if active:
        parsed.append(active)

    return parsed

# Parse the framework text into the principle/practice structure used below.
principles = parse_best_practices(best_practices_text)

# Workaround for an anomaly in the source data (currently disabled):
# principles[2]['practices'][1]['id'] = '3.1.1'

In [None]:
# Schema constraints, one statement per entry. Each uses IF NOT EXISTS so the
# cell can be re-run safely.
queries = [
    """
    CREATE CONSTRAINT unique_principle_id IF NOT EXISTS 
    FOR (p:Principle) REQUIRE p.id IS UNIQUE
    """,
    """
    CREATE CONSTRAINT unique_practice_id IF NOT EXISTS 
    FOR (pr:Practice) REQUIRE pr.id IS UNIQUE
    """,
    # KeyIndicator nodes are merged on their `question` property, so that is
    # the property the uniqueness constraint has to cover. The constraint gets
    # a new name because IF NOT EXISTS would otherwise skip creation when a
    # constraint with the old name is already present.
    """
    CREATE CONSTRAINT unique_keyindicator_question IF NOT EXISTS
    FOR (ki:KeyIndicator) REQUIRE ki.question IS UNIQUE
    """,
]

for query in queries:
    graph.query(query)

In [None]:
# Load the parsed framework into Neo4j: one Principle node per principle and
# one Practice node per practice, linked by HAS_PRACTICE. Both statements use
# MERGE with ON CREATE SET, so existing nodes keep their current properties.
principle_upsert = """
        MERGE (p:Principle {id: $principle_id})
        ON CREATE SET p.name = $principle_name
        """
practice_upsert = """
            MATCH (p:Principle {id: $principle_id})  // Ensure Principle exists
            MERGE (pr:Practice {id: $practice_id})  // Ensure unique Practice by ID
            ON CREATE SET pr.description = $practice_desc  // Set description only on creation
            MERGE (p)-[:HAS_PRACTICE]->(pr)  // Create relationship
            """

for principle in principles:
    graph.query(
        principle_upsert,
        params=dict(principle_id=principle["id"], principle_name=principle["name"]),
    )

    for practice in principle["practices"]:
        graph.query(
            practice_upsert,
            params=dict(
                principle_id=principle["id"],
                practice_id=practice["id"],
                practice_desc=practice["description"],
            ),
        )


In [191]:
# Sanity check: total number of nodes currently in the graph.
node_count_query = """
    MATCH (n) RETURN count(n) AS node_count;
    """
graph.query(node_count_query)

[{'node_count': 42}]

In [189]:
# WARNING: destructive reset. Deletes every node in the database together with
# its relationships.
wipe_graph_query = """
    MATCH (n) DETACH DELETE n;
    """
graph.query(wipe_graph_query)

[]

In [None]:
# Key-indicator questions keyed by the id of the practice they belong to
# (provided by Bob). Each value is the list of questions for that practice.
key_indicators = {
    "2.6": [
        "How can we guarantee that all subsidiaries fully implement cybersecurity communication channels?",
        "What barriers might delay the complete deployment of these communication frameworks, and how can they be mitigated?",
        "How do we foster greater trust among suppliers and third parties to encourage transparency in cybersecurity risk sharing?",
        "Could leveraging contractual obligations improve data-sharing practices with external partners?",
    ],
    "3.4": [
        "What mechanisms can be implemented to extend supply chain cybersecurity risk management to international vendors?",
        "What challenges might arise from merging cybersecurity risk with enterprise risk management, and how can they be resolved?",
        "How can board members be encouraged to perceive cybersecurity as a key component of corporate governance rather than a standalone function?",
    ],
    "4.5": [
        "How can we better model cyber risks to enhance response planning in unpredictable scenarios?",
        "What initiatives can be introduced to align staff and board perspectives on a unified incident response strategy?",
        "How can we ensure that real-world data collection is comprehensive and accessible across all business units?",
    ],
}

# Attach every question to its practice as a KeyIndicator node. The statement
# starts with MATCH, so a practice id with no matching node produces no rows
# and nothing is created for it.
link_indicator_query = """
            MATCH (pr:Practice {id: $practice_id})
            MERGE (ki:KeyIndicator {question: $question})
            MERGE (pr)-[:HAS_KEY_INDICATOR]->(ki)
            """

for practice_id, questions in key_indicators.items():
    for question in questions:
        graph.query(
            link_indicator_query,
            params=dict(practice_id=practice_id, question=question),
        )


In [None]:
# Fetch one principle together with its practices and their key indicators.
# The pattern requires a HAS_KEY_INDICATOR relationship, so practices without
# any key indicator do not appear in the result.
principle_id = "2.0"

indicator_query = """
    MATCH (p:Principle)-[:HAS_PRACTICE]->(pr:Practice)-[:HAS_KEY_INDICATOR]->(ki:KeyIndicator)
    WHERE p.id = $principle_id
    RETURN p, pr, ki;
    """
result = graph.query(indicator_query, params=dict(principle_id=principle_id))

print(result)

## Implementation of Hybrid Approach

In [None]:
def traverse_and_print_key_indicators(graph, principle_id):
    """Print each practice id / key-indicator question pair under a principle.

    Args:
        graph: Object exposing ``query(cypher, params=...)`` that returns an
            iterable of records indexable by ``"pr"`` and ``"ki"``.
        principle_id: Value matched against ``Principle.id``.

    Returns:
        None. For every record with a truthy ``"ki"`` entry, two lines are
        written to stdout.
    """
    rows = graph.query(
        """
        MATCH (p:Principle)-[:HAS_PRACTICE]->(pr:Practice)-[:HAS_KEY_INDICATOR]->(ki:KeyIndicator)
        WHERE p.id = $principle_id
        RETURN p, pr, ki;
        """,
        params=dict(principle_id=principle_id),
    )

    for row in rows:
        practice_node = row["pr"]
        indicator_node = row["ki"]

        # Skip records that carry no key indicator.
        if not indicator_node:
            continue

        print(f"  Practice: {practice_node['id']}")
        print(f"    Key Indicator: {indicator_node['question']}")

# Walk every parsed principle and list the key indicators attached to its practices.
for principle in principles:
    traverse_and_print_key_indicators(graph, principle_id=principle["id"])

In [None]:
import os
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document as LCDocument
from langchain.llms import OpenAI
from dotenv import load_dotenv

# Load settings from a .env file into the process environment.
load_dotenv()

# Keep the key in a variable; later cells pass it to clients explicitly.
openai_api_key = os.getenv("OPENAI_API_KEY")

# Cut the board memo into chunks (chunk_size=500, chunk_overlap=50).
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=50, chunk_size=500)
chunks = text_splitter.split_text(board_memo_text)

# One LangChain document per chunk; no metadata is attached.
docs = [LCDocument(page_content=piece) for piece in chunks]

# Embed the memo chunks and index them in a separate in-memory FAISS store.
embeddings = OpenAIEmbeddings()
vector_store_memo = FAISS.from_documents(docs, embeddings)

# Persisting / reloading the memo index is currently disabled:
# vector_store_memo.save_local("faiss_index_memo")
# vector_store_memo = FAISS.load_local("faiss_index_memo", embeddings, allow_dangerous_deserialization=True)

In [None]:
from pprint import pprint
from pydantic import BaseModel, Field
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import PydanticOutputParser
import json


# Result model for the hybrid (graph + vector) compliance check. The JSON
# returned by the LLM is loaded into this model by check_compliance. Notes are
# kept as comments so the model's own schema is left untouched.
class ComplianceReport(BaseModel):
    status: str = Field(description="The compliance status: 'Pass' or 'Fail'")
    causality: str = Field(description="A to-the-point concise reason (Cause) for the compliance status (Effect)")
    explanation: str = Field(description="Explanation of the compliance status finding")
    # Expected to be an empty string when the status is 'Pass'.
    corrective_measures: str = Field(
        description="Suggested corrective measures if status is 'Fail', or empty if 'Pass'"
    )


# Parser bound to the ComplianceReport model (not used by the chain below).
parser = PydanticOutputParser(pydantic_object=ComplianceReport)

# Prompt text with three placeholders: practice, key_indicator and context.
compliance_prompt_text = """
You are an expert compliance analyst tasked with evaluating the compliance status of the best practice based on the provided context. 
The context consists of relevant remarks from board members. Clearly state the status in your response as "Pass" or "Fail" at the top.
You will be provided with a key indicator and a practice statement. You need to evaluate the compliance status of the practice based on the key indicator.

### Best Practice:
{practice}

### Key Indicator:
{key_indicator}

### Context:
{context}

### Question:
Based on the context, does the organization comply with this best practice? Provide reasoning if it doesn't and corrective measures. Your description should be easy to comprehend. If you don't find any relevant information, you can state that as well.

Format your response as a JSON object with the following fields:
- status: "Pass" or "Fail"
- causality: A to-the-point concise reason (Cause) for the compliance status (Effect)
- explanation: A detailed explanation of why the organization passes or fails
- corrective_measures: Suggested actions if failing, or empty string if passing

### Answer:
"""

prompt_template = PromptTemplate(
    input_variables=["practice", "key_indicator", "context"],
    template=compliance_prompt_text,
)

# Chat model configured to reply with a JSON object.
json_response_format = {"type": "json_object"}
llm = ChatOpenAI(
    model_name="gpt-4o-mini",
    model_kwargs={"response_format": json_response_format},
)

# Chain: fill the prompt, call the model, expose the reply under "compliance_report".
compliance_chain = LLMChain(
    prompt=prompt_template,
    llm=llm,
    output_key="compliance_report",
)


# Function to run the compliance check with vector retrieval
def check_compliance(practice_statement, key_indicator, vector_store, k=5):
    """Assess one practice against retrieved context and return a report.

    Args:
        practice_statement: Text of the practice; also used as the
            similarity-search query.
        key_indicator: Key-indicator text inserted into the prompt.
        vector_store: Store exposing ``similarity_search_with_score``.
        k: Number of chunks to retrieve.

    Returns:
        A ``ComplianceReport`` built from the chain's JSON reply, or ``None``
        when the reply cannot be turned into one (the error and the raw reply
        are printed in that case).
    """
    scored_docs = vector_store.similarity_search_with_score(practice_statement, k=k)

    # Number the retrieved chunks and include each chunk's score in the prompt.
    context = "\n\n".join(
        f"Document {rank} (score: {score}):\n{doc.page_content}"
        for rank, (doc, score) in enumerate(scored_docs, start=1)
    )

    result = compliance_chain.invoke(
        dict(practice=practice_statement, key_indicator=key_indicator, context=context)
    )

    # Best effort: any failure while decoding or validating is reported, not raised.
    try:
        return ComplianceReport(**json.loads(result["compliance_report"]))
    except Exception as err:
        print(f"Failed to parse result into ComplianceReport model. Error: {err}")
        print("Raw result:")
        pprint(result["compliance_report"])
        return None


# Smoke test: one practice, no key indicator supplied, against the memo index.
practice = "Practice 4.1: Policies, processes, and procedures for managing cyber breaches internally are established and reviewed at least annually."
key_indicator = "Not Provided"
report = check_compliance(practice, key_indicator, vector_store_memo, k=5)

# Nothing is printed when no report came back.
if report:
    for label, value in (
        ("Status", report.status),
        ("Cause", report.causality),
        ("Explanation", report.explanation),
        ("Corrective measures", report.corrective_measures),
    ):
        print(f"{label}: {value}")

In [None]:
# For every principle, fetch its (practice, key indicator) pairs from the graph
# and run a compliance check for each pair against the board-memo index.
indicators_by_principle_query = """
        MATCH (p:Principle)-[:HAS_PRACTICE]->(pr:Practice)-[:HAS_KEY_INDICATOR]->(ki:KeyIndicator)
        WHERE p.id = $principle_id
        RETURN p, pr, ki;
        """

for principle in principles:
    result = graph.query(
        indicators_by_principle_query,
        params={"principle_id": principle["id"]},
    )

    for record in result:
        practice = record["pr"]
        key_indicator = record["ki"]

        if not key_indicator:
            continue

        print(f"**Practice:** {practice['id']}")
        print(f"**Key Indicator:** {key_indicator['question']}")

        report = check_compliance(practice["description"], key_indicator["question"], vector_store_memo)
        if not report:
            # check_compliance has already printed why no report is available.
            continue

        print(f"**Status:** {report.status}")
        print(f"**Cause:** {report.causality}")
        print(f"**Explanation:** {report.explanation}")
        print(f"**Corrective measures:** {report.corrective_measures}")
        print()

## Experimentation with Confidence Scores

In [None]:
from pprint import pprint
from pydantic import BaseModel, Field
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import PydanticOutputParser
import json


# Reduced result model for the confidence-score experiment: only the verdict
# is requested. Notes are kept as comments so the model's own schema is left
# untouched.
class ComplianceReport(BaseModel):
    status: str = Field(description="The compliance status: 'Pass' or 'Fail'")

# Parser that turns the model's JSON reply into a ComplianceReport.
parser = PydanticOutputParser(pydantic_object=ComplianceReport)

# Prompt text with three placeholders: practice, key_indicator and context.
# Only the Pass/Fail status is requested.
status_prompt_text = """
You are an expert compliance analyst tasked with evaluating the compliance status of the best practice based on the provided context. 
The context consists of relevant remarks from board members. Clearly state the status in your response as "Pass" or "Fail" at the top.
You will be provided with a key indicator and a practice statement. You need to evaluate the compliance status of the practice based on the key indicator.

### Best Practice:
{practice}

### Key Indicator:
{key_indicator}

### Context:
{context}

### Question:
Based on the context, does the organization comply with this best practice? If you don't find any relevant information, you can state that as "Fail".

Format your response as a JSON object with the following fields:
- status: "Pass" or "Fail"

### Answer:
"""

prompt_template = PromptTemplate(
    input_variables=["practice", "key_indicator", "context"],
    template=status_prompt_text,
)

# Chat model configured to reply with a JSON object and to return token log-probabilities.
json_response_format = {"type": "json_object"}
llm = ChatOpenAI(
    model_name="gpt-4o-mini",
    logprobs=True,
    model_kwargs={"response_format": json_response_format},
)

# Pipe the filled prompt straight into the model; invoking the chain yields the model's message.
compliance_chain = prompt_template | llm


# Function to run the compliance check with vector retrieval
def check_compliance(practice_statement, key_indicator, vector_store, k=5):
    """Run the Pass/Fail chain for one practice and return its raw reply.

    Args:
        practice_statement: Text of the practice; also used as the
            similarity-search query.
        key_indicator: Key-indicator text inserted into the prompt.
        vector_store: Store exposing ``similarity_search_with_score``.
        k: Number of chunks to retrieve.

    Returns:
        Whatever ``compliance_chain.invoke`` returns, unparsed. Callers in
        this notebook read its ``content`` and ``response_metadata``.
    """
    scored_docs = vector_store.similarity_search_with_score(practice_statement, k=k)

    # Number the retrieved chunks and include each chunk's score in the prompt.
    sections = []
    for rank, (doc, score) in enumerate(scored_docs, start=1):
        sections.append(f"Document {rank} (score: {score}):\n{doc.page_content}")

    chain_inputs = {
        "practice": practice_statement,
        "key_indicator": key_indicator,
        "context": "\n\n".join(sections),
    }
    return compliance_chain.invoke(chain_inputs)


# Smoke test: one practice, no key indicator supplied, against the memo index.
practice = "Practice 4.1: Policies, processes, and procedures for managing cyber breaches internally are established and reviewed at least annually."
key_indicator = "Not Provided"

# `report` holds the chain's raw reply here; the following cell parses its
# content and reads the log-probabilities from its response metadata.
report = check_compliance(
    practice_statement=practice,
    key_indicator=key_indicator,
    vector_store=vector_store_memo,
    k=5,
)

In [None]:
import numpy as np

# Check that the reply parses as a ComplianceReport (the parsed object is not kept).
parser.parse(report.content)

# Turn the log-probability of each verdict token into a plain probability.
for token_info in report.response_metadata['logprobs']['content']:
    if token_info['token'] in ('Pass', 'Fail'):
        print(f"Prob('{token_info['token']}'): {np.exp(token_info['logprob'])}")

In [None]:
import numpy as np
# For every (practice, key indicator) pair in the graph, ask the model for a
# Pass/Fail verdict and report the probability it assigned to the verdict token.
for principle in principles:
    result = graph.query(
        """
        MATCH (p:Principle)-[:HAS_PRACTICE]->(pr:Practice)-[:HAS_KEY_INDICATOR]->(ki:KeyIndicator)
        WHERE p.id = $principle_id
        RETURN p, pr, ki;
        """,
        params={"principle_id": principle["id"]},
    )

    for record in result:
        practice = record["pr"]
        key_indicator = record["ki"]

        if not key_indicator:
            continue

        print(f"**Practice:** {practice['id']}")
        print(f"**Key Indicator:** {key_indicator['question']}")

        raw_report = check_compliance(practice["description"], key_indicator["question"], vector_store_memo)

        # Start from a placeholder for every record. Without this reset, a reply
        # containing no standalone 'Pass'/'Fail' token would either raise
        # NameError (first record) or silently reuse the previous record's value.
        confidence = "Prob: n/a (no 'Pass'/'Fail' token found in logprobs)"
        for logprob in raw_report.response_metadata['logprobs']['content']:
            if logprob['token'] in ('Pass', 'Fail'):
                confidence = f"Prob('{logprob['token']}'): {np.exp(logprob['logprob']):.4f}"

        # `report` ends up holding the parsed ComplianceReport of the last record.
        report = parser.parse(raw_report.content)

        print(f"**Status:** {report.status}")
        print(confidence)
        print()

In [None]:
# Display the status of the most recently parsed report.
report.status

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage

# Chat model with log probabilities requested through model_kwargs.
llm = ChatOpenAI(model_name="gpt-4-turbo",
                 temperature=0,
                 openai_api_key=openai_api_key,
                 model_kwargs={"logprobs": True})  # Request log probabilities

# A single-message conversation used as a probe.
message = HumanMessage(content="What is the capital of France?")

# Use invoke(): calling the model object directly (`llm([message])`) is the
# deprecated entry point of LangChain chat models.
response = llm.invoke([message])

# Uncomment to inspect everything the provider returned alongside the message:
# pprint(response.response_metadata)

# Per-token log probabilities of the reply.
response_logprobs = response.response_metadata["logprobs"]["content"]
pprint(response_logprobs)

In [None]:
from openai import OpenAI
import os
from llm_confidence.logprobs_handler import LogprobsHandler

# Helper from the llm_confidence package; used below to format the API's
# log-probabilities and derive confidence scores from them.
logprobs_handler = LogprobsHandler()

def get_completion(
        messages: list[dict[str, str]],
        model: str = "gpt-4o",
        max_tokens=500,
        temperature=0,
        stop=None,
        seed=42,
        response_format=None,
        logprobs=None,
        top_logprobs=None,
):
    """Send one chat-completions request through the module-level ``client``.

    Args:
        messages: Chat messages as ``{"role": ..., "content": ...}`` dicts.
        model: Model name.
        max_tokens: Forwarded as ``max_tokens``.
        temperature: Forwarded as ``temperature``.
        stop: Forwarded as ``stop``.
        seed: Forwarded as ``seed``.
        response_format: Forwarded only when truthy.
        logprobs: Forwarded as ``logprobs``.
        top_logprobs: Forwarded as ``top_logprobs``.

    Returns:
        The object returned by ``client.chat.completions.create``.
    """
    request = dict(
        model=model,
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        stop=stop,
        seed=seed,
        logprobs=logprobs,
        top_logprobs=top_logprobs,
    )
    # Every argument above is forwarded even when None; response_format is the
    # only one left out of the request when not supplied.
    if response_format:
        request["response_format"] = response_format

    return client.chat.completions.create(**request)

# Set up your OpenAI client with your API key
client = OpenAI(api_key=openai_api_key)

# Ask for a JSON answer and request token log-probabilities with it.
response_raw = get_completion(
    [{'role': 'user', 'content': 'Tell me the name of capital of Pakistan, and return the response in JSON format.'}],
    logprobs=True,
    response_format={'type': 'json_object'}
)

# Print the output
print(response_raw.choices[0].message.content)

# Extract the log probabilities from the response. Test the value rather than
# using hasattr(): the attribute is present on a choice even when its value is
# None (e.g. when logprobs were not requested), and `.content` would then fail.
choice_logprobs = getattr(response_raw.choices[0], 'logprobs', None)
response_logprobs = (choice_logprobs.content or []) if choice_logprobs is not None else []

# Format the logprobs
logprobs_formatted = logprobs_handler.format_logprobs(response_logprobs)

# Process the log probabilities to get confidence scores
confidence = logprobs_handler.process_logprobs(
    logprobs_formatted,
)

# Print the confidence scores
print(confidence)

In [None]:
# Inspection cell: only the value of the last expression (the raw
# log-probabilities from the API response) is displayed by the notebook.
logprobs_formatted
# logprobs_handler.calculate_words_probas(logprobs_formatted)
response_logprobs

In [None]:
# Rebuild the formatted list by hand: one dict per generated token holding the
# token, its log-probability and the listed alternatives for that position.
logprobs_formatted = [
    {
        'token': entry.token,
        'logprob': entry.logprob,
        'log_topprobs': [
            {'token': alternative.token, 'logprob': alternative.logprob}
            for alternative in entry.top_logprobs
        ],
    }
    for entry in response_logprobs
]
logprobs_formatted

In [None]:
import pandas as pd
import numpy as np

# One row per generated token with its log-probability.
probas_df = pd.DataFrame({
    'token': [entry['token'] for entry in logprobs_formatted],
    'logprob': [entry['logprob'] for entry in logprobs_formatted],
})

# Endings that mark the last token of a key-value pair: the first tuple is
# checked against the raw token, the second against the token with
# surrounding whitespace removed.
raw_terminators = (',\n', ']\n', '"}', "'}", ',"', ",'", '",\n', "',\n")
stripped_terminators = (',', '}')

# Group row indices into key-value pairs, assuming the tokens of a pair are
# consecutive. Blank tokens and lone braces are not added to a pair.
key_value_pairs = []
current_pair = []
for idx, row in probas_df.iterrows():
    token = str(row['token'])
    trimmed = token.strip()
    if trimmed != '' and trimmed not in ['{', '}']:
        current_pair.append(idx)
    if token.endswith(raw_terminators) or trimmed.endswith(stripped_terminators):
        if len(current_pair) > 0:
            key_value_pairs.append(current_pair)
        current_pair = []

# Probability of a pair = exp(sum of the log-probabilities of its tokens).
pair_probs = []
for pair in key_value_pairs:
    pair_tokens = probas_df.loc[pair, 'token']
    print(pair_tokens)
    pair_logprob = probas_df.loc[pair, 'logprob'].sum()
    print(pair_logprob)
    pair_prob = np.exp(pair_logprob)
    pair_probs.append((''.join(pair_tokens), pair_prob))

In [None]:
# Display the (joined tokens, probability) tuples computed in the previous cell.
pair_probs