### Initial Setup

In [None]:
!pip install -U -q \
    pymupdf \
    sentence-transformers \
    transformers \
    langchain \
    langchain-chroma \
    langchain-community \
    langchain-google-genai \
    faiss-cpu \
    google-generativeai \
    PyPDF2 \
    anthropic \
    langchain-anthropic \
    langchain-ollama \
    langchain_huggingface \
    gradio

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m70.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m75.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m74.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m30.3 MB/s[0m eta [36m0:00:00[

In [None]:
# Standard library imports
import io
import json
import os
from typing import Dict, List
from collections import defaultdict
import re

# Third-party data processing
import numpy as np
import pandas as pd

# PDF processing
import PyPDF2

# Google Colab and Drive
from google.colab import auth, drive, userdata
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
import googleapiclient.http

# AI/ML libraries
from anthropic import Anthropic
import google.generativeai as genai
from sentence_transformers import SentenceTransformer
from transformers import pipeline

# LangChain components
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyMuPDFLoader
from langchain.schema import Document
from langchain.schema import HumanMessage
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_anthropic import ChatAnthropic
from langchain_community.llms import Ollama
from langchain_ollama import OllamaLLM
from langchain_huggingface import HuggingFaceEmbeddings

# Necessary for Gradio
import gradio as gr
from datetime import datetime
import threading
import time
import ast


### Preprocessing and Initial Chunking

In [None]:
# Retrieve files from Team Stompa on Google Drive
# Authenticate the Colab user, then build a Google Drive v3 API client.
# `service` is a notebook-level global: get_files_from_folder() reads it directly.
auth.authenticate_user()
service = build('drive', 'v3')

# Provide IDs for each folder
# (one Google Drive folder ID per document collection)
Transcripts = "1y1-7h6JbgFJ5l4lGiz2kFNhhzeUTsaQ6"
Basel = "1mPeX4pbjqEfsZuIc5HpT6dA9s_u_DLnD"
Principles = "1_a8-Rqv0dONJj65Rr4JPmpYlGgsbquQP"
Topic_Modelled_Results = "1MMtFQAiKc_EuDK8Q3dgFouPQSrXWhhHb"

# Create Folder Groups for future use
# Each group is a list of folder IDs suitable for iterate_thru_multiple_folders().
# NOTE: all_dir covers Transcripts, Basel and Principles only; it does not
# include Topic_Modelled_Results.
all_dir = [Transcripts, Basel, Principles]
transcripts_dir = [Transcripts]
principles_dir = [Principles]
topic_modelled_dir = [Topic_Modelled_Results]

# Folder, File and PDF functions
def get_files_from_folder(folder_id):
    """Retrieve every file directly inside a Google Drive folder.

    Uses the notebook-level Drive client ``service``. The Drive ``files.list``
    endpoint returns results one page at a time, so this follows
    ``nextPageToken`` until every page has been read; reading only the first
    page would silently truncate large folders.

    Args:
        folder_id: ID of the Drive folder to list.

    Returns:
        A list of file dicts with ``id``, ``name`` and ``mimeType`` keys.
        If any request fails, the error is printed and an empty list is
        returned (partial results are discarded so callers never see a
        silently incomplete listing).
    """
    collected = []
    page_token = None
    try:
        while True:
            response = service.files().list(
                q=f"'{folder_id}' in parents",
                # nextPageToken must be requested explicitly once `fields` is set.
                fields="nextPageToken, files(id, name, mimeType)",
                pageToken=page_token,
            ).execute()
            collected.extend(response.get('files', []))
            page_token = response.get('nextPageToken')
            if not page_token:
                break
    except Exception as e:
        print(f"Error accessing folder {folder_id}: {e}")
        return []
    return collected

def iterate_thru_multiple_folders(folder_ids):
    """Collect the files of several Drive folders into one flat list.

    Folders are visited in the order given, and each folder's files (as
    returned by get_files_from_folder) are appended in turn.
    """
    return [
        drive_file
        for current_folder in folder_ids
        for drive_file in get_files_from_folder(current_folder)
    ]

def _download_drive_file(drive_service, file_id, destination_path):
    """Stream one Drive file's bytes to destination_path, chunk by chunk."""
    request = drive_service.files().get_media(fileId=file_id)
    with io.FileIO(destination_path, 'wb') as target:
        downloader = MediaIoBaseDownload(target, request)
        done = False
        while not done:
            _, done = downloader.next_chunk()


def load_pdf_docs(file_list, drive_service, temp_dir="/tmp"):
    """Download every PDF in file_list from Drive and load it with PyMuPDFLoader.

    Each PDF is downloaded to a temporary file under ``temp_dir``, loaded, and
    the temporary file is removed again (even when loading fails). Per-file
    problems are collected and printed in a summary rather than raised, so
    one bad file does not stop the rest.

    Args:
        file_list: File dicts carrying ``id``, ``name`` and ``mimeType`` keys
            (the shape produced by get_files_from_folder). Entries whose
            ``mimeType`` is missing or not ``application/pdf`` are skipped.
        drive_service: Drive API client used for the downloads.
        temp_dir: Existing directory to hold the temporary downloads.

    Returns:
        A flat list of the documents returned by ``PyMuPDFLoader.load()``
        for every PDF that loaded successfully.
    """
    all_docs = []
    errors = []

    print("Loading PDFs")

    for file_info in file_list:
        # Only process PDF files; .get() also tolerates entries with no mimeType.
        if file_info.get('mimeType') != 'application/pdf':
            continue

        file_id = file_info['id']
        file_name = file_info['name']
        temp_file_path = None

        try:
            # Drive names are free text and may contain path separators.
            # Keep only the final component so the download always lands
            # directly inside temp_dir; fall back to the file ID if nothing
            # usable is left.
            local_name = os.path.basename(file_name) or f"{file_id}.pdf"
            temp_file_path = os.path.join(temp_dir, local_name)

            # Download file from Google Drive
            _download_drive_file(drive_service, file_id, temp_file_path)

            # Check if file was downloaded and has content
            if not os.path.exists(temp_file_path):
                errors.append(f"Failed to download {file_name}: File not created")
                continue

            if os.path.getsize(temp_file_path) == 0:
                errors.append(f"Skipped {file_name}: File is empty")
                continue

            # Load the downloaded PDF
            docs = PyMuPDFLoader(temp_file_path).load()

            # Check if any documents were loaded
            if not docs:
                errors.append(f"Warning: {file_name} loaded but contains no extractable content")
            else:
                all_docs.extend(docs)
                print(f"✓ Successfully loaded {file_name} ({len(docs)} pages)")

        except Exception as e:
            error_msg = f"Error processing {file_name}: {str(e)}"
            errors.append(error_msg)
            print(f"✗ {error_msg}")

        finally:
            # Clean up temporary file if it exists (also runs on `continue`).
            if temp_file_path and os.path.exists(temp_file_path):
                try:
                    os.remove(temp_file_path)
                except Exception as cleanup_error:
                    errors.append(f"Failed to clean up {temp_file_path}: {str(cleanup_error)}")

    # Summary
    print("\nProcessing complete:")
    print(f"  Successfully loaded: {len(all_docs)} total documents")
    print(f"  Errors encountered: {len(errors)}")
    print()

    if errors:
        print("\nErrors:")
        for error in errors:
            print(f"  - {error}")
    print()

    return all_docs

# Set up lists of files for further parsing later
# Each call lists the Drive folder(s) in the given group and returns the file
# entries produced by get_files_from_folder (id, name, mimeType). Nothing is
# downloaded at this point.
transcript_files = iterate_thru_multiple_folders(transcripts_dir)
principle_files = iterate_thru_multiple_folders(principles_dir)
topic_modelled_files = iterate_thru_multiple_folders(topic_modelled_dir)

In [None]:
# Read all pdfs from the list of docs
# Transcript loading is currently disabled (commented out), so
# all_transcript_docs is NOT defined by this cell.
# all_transcript_docs = load_pdf_docs(transcript_files, service)
principle_docs = load_pdf_docs(principle_files, service)
topic_modelled_docs = load_pdf_docs(topic_modelled_files, service)

Loading PDFs
✓ Successfully loaded principle_9.pdf (3 pages)
✓ Successfully loaded principle_8.pdf (2 pages)
✓ Successfully loaded principle_7.pdf (2 pages)
✓ Successfully loaded principle_6.pdf (1 pages)
✓ Successfully loaded principle_5.pdf (2 pages)
✓ Successfully loaded principle_4.pdf (1 pages)
✓ Successfully loaded principle_3.pdf (1 pages)
✓ Successfully loaded principle_29.pdf (3 pages)
✓ Successfully loaded principle_28.pdf (1 pages)
✓ Successfully loaded principle_27.pdf (2 pages)
✓ Successfully loaded principle_26.pdf (3 pages)
✓ Successfully loaded principle_25.pdf (4 pages)
✓ Successfully loaded principle_24.pdf (3 pages)
✓ Successfully loaded principle_23.pdf (2 pages)
✓ Successfully loaded principle_22.pdf (2 pages)
✓ Successfully loaded principle_21.pdf (2 pages)
✓ Successfully loaded principle_20.pdf (2 pages)
✓ Successfully loaded principle_2.pdf (2 pages)
✓ Successfully loaded principle_19.pdf (2 pages)
✓ Successfully loaded principle_18.pdf (4 pages)
✓ Successfully 

In [None]:
# Apply Basic Chunking
# Baseline strategy: one generic splitter (chunk_size=800, chunk_overlap=100,
# default separators) applied to the per-page documents as loaded.
splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
chunked_principles = splitter.split_documents(principle_docs)
# Transcript chunking stays disabled because the transcript load is commented out too.
# chunked_transcripts = splitter.split_documents(all_transcript_docs)
chunked_topic_modelled = splitter.split_documents(topic_modelled_docs)

### LLM Config

In [None]:
# Initialize various LLM models
# API keys are read from Colab's `userdata` secrets store rather than being
# hardcoded in the notebook.
# NOTE: no temperature is passed for Claude, while Gemini and Ollama are both
# created with temperature=0.7.
claude = ChatAnthropic(
    model="claude-3-5-sonnet-20241022",
    anthropic_api_key=userdata.get('ANTHROPIC_API_KEY')
    )

gemini = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    google_api_key=userdata.get('GOOGLE_API_KEY'),
    temperature=0.7
)

# The Ollama model is reached over a hardcoded ngrok URL.
# NOTE(review): presumably a tunnel to a self-hosted Ollama server that must be
# running for this client to work; confirm and consider moving it to config.
ollama = OllamaLLM(
    model="phi4:latest",
    base_url="https://humble-ladybug-actually.ngrok-free.app/",
    temperature=0.7
)

In [None]:
# Use native LLM APIs to test connection and capability
def get_responses(question, model):
    """Send one question to a configured LLM, print the reply and return it.

    Args:
        question: Prompt text to send.
        model: One of "claude", "gemini" or "ollama"; selects the
            notebook-level client of the same name.

    Returns:
        Whatever the selected client's ``invoke`` call returned (``.content``
        is read from the claude/gemini result; the ollama result is printed
        as-is), or ``None`` when ``model`` is not one of the supported names.
    """
    print(f"Question: {question}\n")

    # Bound up front so the unknown-model branch returns None instead of
    # failing with UnboundLocalError at the final return.
    response = None

    # Get responses from model
    if model == "claude":
        response = claude.invoke([HumanMessage(content=question)])
        print(f"Claude's Response:\n{response.content}\n")

    elif model == "gemini":
        response = gemini.invoke([HumanMessage(content=question)])
        print(f"Gemini's Response:\n{response.content}\n")

    elif model == "ollama":
        response = ollama.invoke(question)
        print(f"Ollama's Response:\n{response}\n")

    else:
        print("Invalid model. Choose from 'claude', 'gemini', or 'ollama'.")

    return response

In [None]:
# Create Prompt to ask the LLM to identify particular questions that may be asked
# The prompt requests a Python list literal of ten questions. The reply is not
# guaranteed to be the bare list: the Gemini test output further down wraps it
# in a ```python fence, so any parser must strip such wrappers first.
question_prompt = """
You are a financial risk specialist working with large banks such as JP Morgan and Citi. Your job is to come up with questions about the financial risk of the bank and any things that could concern central banks or regulators such as the Bank of England.
You will provide ten questions to the bank in the form of a list as shown below:
["How has the liquidity of the bank been assessed for the next twelve months?", "What is the outlook for cash generation?", ...]

Requirements:
- Your output must be a valid python list containing the questions that you have generated.
- The questions should consider specific financial risk of the banks in question as well as any changes in the regulatory environment.
"""

In [None]:
# Test prompt with claude
get_responses(question_prompt, "claude")

Question: 
You are a financial risk specialist working with large banks such as JP Morgan and Citi. Your job is to come up with questions about the financial risk of the bank and any things that could concern central banks or regulators such as the Bank of England.
You will provide ten questions to the bank in the form of a list as shown below:
["How has the liquidity of the bank been assessed for the next twelve months?", "What is the outlook for cash generation?", ...]

Requirements:
- Your output must be a valid python list containing the questions that you have generated.
- The questions should consider specific financial risk of the banks in question as well as any changes in the regulatory environment.


Claude's Response:
[
    "How has the bank stress-tested its capital adequacy ratios against potential economic downturns in 2024-2025?",
    "What measures are in place to manage exposure to commercial real estate given current market vulnerabilities?",
    "How is the bank addr

AIMessage(content='[\n    "How has the bank stress-tested its capital adequacy ratios against potential economic downturns in 2024-2025?",\n    "What measures are in place to manage exposure to commercial real estate given current market vulnerabilities?",\n    "How is the bank addressing potential risks from rising interest rates on its loan portfolio quality?",\n    "What is the bank\'s exposure to emerging market debt and how is this risk being mitigated?",\n    "How does the bank plan to comply with new Basel III endgame requirements while maintaining profitability?",\n    "What contingency plans are in place for potential cyber security breaches affecting critical payment systems?",\n    "How is the bank managing climate-related transition risks in its investment and lending portfolios?",\n    "What is the current level of exposure to non-performing loans and how is this being provisioned?",\n    "How is the bank preparing for upcoming ESG disclosure requirements and regulatory st

In [None]:
# Test prompt with Gemini
get_responses(question_prompt, "gemini")

Question: 
You are a financial risk specialist working with large banks such as JP Morgan and Citi. Your job is to come up with questions about the financial risk of the bank and any things that could concern central banks or regulators such as the Bank of England.
You will provide ten questions to the bank in the form of a list as shown below:
["How has the liquidity of the bank been assessed for the next twelve months?", "What is the outlook for cash generation?", ...]

Requirements:
- Your output must be a valid python list containing the questions that you have generated.
- The questions should consider specific financial risk of the banks in question as well as any changes in the regulatory environment.


Gemini's Response:
```python
[
"How has the liquidity of the bank been assessed for the next twelve months, considering potential deposit outflows under stressed scenarios and the availability of contingent funding?",
"What is the outlook for cash generation, factoring in anticip

AIMessage(content='```python\n[\n"How has the liquidity of the bank been assessed for the next twelve months, considering potential deposit outflows under stressed scenarios and the availability of contingent funding?",\n"What is the outlook for cash generation, factoring in anticipated changes in interest rates, loan growth, and potential economic slowdowns impacting asset quality?",\n"What are the bank\'s projections for credit losses across different loan portfolios (e.g., commercial real estate, consumer loans) under various macroeconomic scenarios, and how do these projections compare to historical performance and industry benchmarks?",\n"How is the bank managing its exposure to interest rate risk, particularly in light of recent and anticipated future interest rate hikes by central banks, and what is the potential impact on net interest margin and asset values?",\n"What is the bank\'s strategy for managing operational risk, including cybersecurity threats, fraud, and model risk, 

In [None]:
# Non RAG LLM Query
get_responses("What is principle 9 of the BIS framework?", "claude")

Question: What is principle 9 of the BIS framework?

Claude's Response:
Principle 9 of the Basel Committee on Banking Supervision's Core Principles (BIS framework) deals with "Supervisory techniques and tools." This principle states that supervisors should use an appropriate range of techniques and tools to implement their supervisory approach and deploy supervisory resources on a proportionate basis, taking into account the risk profile and systemic importance of banks. It emphasizes the importance of having effective supervisory methodologies and both on-site and off-site supervision capabilities to assess banks' conditions, risks, internal control systems, and corrective measures when necessary.



AIMessage(content='Principle 9 of the Basel Committee on Banking Supervision\'s Core Principles (BIS framework) deals with "Supervisory techniques and tools." This principle states that supervisors should use an appropriate range of techniques and tools to implement their supervisory approach and deploy supervisory resources on a proportionate basis, taking into account the risk profile and systemic importance of banks. It emphasizes the importance of having effective supervisory methodologies and both on-site and off-site supervision capabilities to assess banks\' conditions, risks, internal control systems, and corrective measures when necessary.', additional_kwargs={}, response_metadata={'id': 'msg_01E1CHJRK9qSmStBu69GGCww', 'model': 'claude-3-5-sonnet-20241022', 'stop_reason': 'end_turn', 'stop_sequence': None, 'usage': {'cache_creation_input_tokens': 0, 'cache_read_input_tokens': 0, 'input_tokens': 19, 'output_tokens': 122, 'server_tool_use': None, 'service_tier': 'standard'}, 'mo

### RAG Setup

In [None]:
# Basic RAG Setup

# Set the embedding function and initialise the vector store
# NOTE: HuggingFaceEmbeddings is imported twice in the import cell (from
# langchain.embeddings and from langchain_huggingface); the later import wins,
# so this is the langchain_huggingface class.
embedding_function = HuggingFaceEmbeddings(model_name='all-mpnet-base-v2')
# Index the baseline chunks produced by the basic chunking cell in an in-memory FAISS store.
vectorstore = FAISS.from_documents(chunked_principles, embedding_function)

# Basic Retriever Settings
# Default retriever settings, answered by Claude. return_source_documents=True
# makes the chain output carry "source_documents" next to "result".
qa_chain = RetrievalQA.from_chain_type(
    llm=claude,
    retriever=vectorstore.as_retriever(),
    return_source_documents=True
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Test the Basic RAG
query = "What is principle 9 of the BIS framework?"
# Calling the chain object directly (qa_chain(query)) is deprecated in
# LangChain and emits a deprecation warning; invoke() is the supported entry
# point. RetrievalQA takes the question under the "query" input key.
result = qa_chain.invoke({"query": query})

# Show the answer, then the start of each retrieved chunk so retrieval
# quality can be judged by eye.
print("\nAnswer:\n", result["result"])
for doc in result["source_documents"]:
    print(f"\nSource: {doc.metadata['source']}")
    print(doc.page_content[:300], "...")

  result = qa_chain(query)



Answer:
 From the provided context, I cannot determine what Principle 9 of the BIS framework is. While the context includes information about several principles (including Principles 3, 12, 13, 27, and 29), it does not contain specific information about Principle 9. To accurately state what Principle 9 is, I would need context that directly addresses that principle.

Source: /tmp/principle_3.pdf
Principle 3 - Cooperation and collaboration
40.8 Principle 3: Laws, regulations or other arrangements provide a framework for
cooperation and collaboration with relevant domestic authorities and foreign
supervisors. These arrangements reflect the need to protect confidential information.6
Footnotes  ...

Source: /tmp/principle_26.pdf
investments (including measures for the prevention and early detection and reporting of
misuse, such as fraud, embezzlement, unauthorised trading and computer intrusion).
More specifically, these controls address: (a) organisational structure: definitions of
dutie

### RAG Tuning

In [None]:
# Combine principles back to one doc per principle

def combine_pages_to_principles(page_docs):
    """Merge per-page documents into one document per source file.

    Pages are grouped by their ``source`` metadata (pages without one share
    the key ``'unknown'``), ordered by their ``page`` metadata, and their text
    is joined with blank lines.

    Args:
        page_docs: Iterable of documents exposing ``page_content`` and a
            ``metadata`` dict.

    Returns:
        A list with one ``Document`` per distinct source, in first-seen order.
        Its metadata is copied from the first page (after sorting) with
        ``page`` removed and ``combined_pages`` and ``page_range`` added.
        ``principle_number`` is added when the source file name contains
        ``principle_``.
    """
    # Group pages by their source (principle file)
    principle_groups = defaultdict(list)
    for doc in page_docs:
        principle_groups[doc.metadata.get('source', 'unknown')].append(doc)

    combined_principles = []

    for source, pages in principle_groups.items():
        # Sort pages by page number to ensure correct order
        ordered_pages = sorted(pages, key=lambda page: page.metadata.get('page', 0))

        # Combine all page content with double newlines for separation
        combined_content = '\n\n'.join(page.page_content for page in ordered_pages)

        # Start from the first page's metadata and swap page-level fields
        # for document-level ones.
        combined_metadata = ordered_pages[0].metadata.copy()
        combined_metadata.pop('page', None)
        combined_metadata['combined_pages'] = len(ordered_pages)
        # Describes positions 0..n-1 within the combined document.
        combined_metadata['page_range'] = f"0-{len(ordered_pages)-1}"

        # Extract the principle number from the FILE NAME only. Parsing the
        # full path would pick up a directory that happens to contain
        # 'principle_' (e.g. /data/principle_docs/principle_7.pdf).
        file_name = os.path.basename(source)
        if 'principle_' in file_name:
            principle_num = file_name.split('principle_')[1].split('.')[0]
            combined_metadata['principle_number'] = principle_num

        combined_principles.append(
            Document(page_content=combined_content, metadata=combined_metadata)
        )

    return combined_principles

# Sort by principle number for better organization
def extract_principle_number(doc):
    """Sort key giving a document's principle number as an int.

    Documents whose ``principle_number`` metadata is missing, empty or not an
    integer string get +infinity, so they sort after all numbered ones.
    """
    raw_number = doc.metadata.get('principle_number')
    if not raw_number:
        return float('inf')
    try:
        return int(raw_number)
    except ValueError:
        # Non-numeric label: push to the end.
        return float('inf')

In [None]:
# Update Principles Docs with the upgraded strategy
combined_principles = combine_pages_to_principles(principle_docs)
combined_principles.sort(key=extract_principle_number)

In [None]:
# Retrieve sizes of principle docs - really just for info
for doc in combined_principles:
    principle_num = doc.metadata.get('principle_number', 'Unknown')
    pages = doc.metadata.get('combined_pages', 0)
    length = len(doc.page_content)
    # NOTE: this is a whitespace-separated word count, not a model tokenizer
    # count; the "tokens" label in the output is only a rough proxy.
    tokens = len(doc.page_content.split())
    print(f"Principle {principle_num}: {pages} pages, {length} characters, {tokens} tokens")

Principle 1: 2 pages, 3970 characters, 598 tokens
Principle 2: 2 pages, 4913 characters, 719 tokens
Principle 3: 1 pages, 2907 characters, 403 tokens
Principle 4: 1 pages, 1442 characters, 224 tokens
Principle 5: 2 pages, 5389 characters, 805 tokens
Principle 6: 1 pages, 2384 characters, 353 tokens
Principle 7: 2 pages, 2803 characters, 407 tokens
Principle 8: 2 pages, 5187 characters, 726 tokens
Principle 9: 3 pages, 6658 characters, 961 tokens
Principle 10: 2 pages, 4483 characters, 638 tokens
Principle 11: 2 pages, 4899 characters, 728 tokens
Principle 12: 2 pages, 4223 characters, 611 tokens
Principle 13: 3 pages, 5187 characters, 736 tokens
Principle 14: 3 pages, 6222 characters, 891 tokens
Principle 15: 4 pages, 12100 characters, 1751 tokens
Principle 16: 3 pages, 6764 characters, 1014 tokens
Principle 17: 2 pages, 6099 characters, 843 tokens
Principle 18: 4 pages, 8544 characters, 1194 tokens
Principle 19: 2 pages, 5443 characters, 770 tokens
Principle 20: 2 pages, 5030 characte

In [None]:
# Upgraded chunking strategy with greater markup and metadata
def create_principle_aware_chunks(principle_docs, chunk_size=500, chunk_overlap=50):
    """Split principle documents into chunks annotated with descriptive metadata.

    Each document's text is first passed through enhance_content_structure,
    then split with a RecursiveCharacterTextSplitter. Every resulting chunk
    becomes a Document whose metadata is the parent document's metadata plus
    ``chunk_id``, ``total_chunks``, ``chunk_type``, ``has_criteria``,
    ``has_footnotes`` and ``principle_section``. The added fields are
    computed on the raw chunk; the stored text is the cleaned chunk.

    Args:
        principle_docs: Documents exposing ``page_content`` and ``metadata``.
        chunk_size: Multiplied by 4 before being handed to the splitter.
            NOTE(review): presumably a tokens-to-characters approximation; confirm.
        chunk_overlap: Passed to the splitter unchanged.
            NOTE(review): not scaled by 4 like chunk_size; confirm intended.

    Returns:
        A flat list of chunk Documents, in document order then chunk order.
    """
    # Ordered from coarsest to finest split point.
    # NOTE(review): enhance_content_structure rewrites the section headings to
    # "\n\n### <heading>\n", so the heading-specific separators below may no
    # longer match the enhanced text; verify.
    splitter = RecursiveCharacterTextSplitter(
        separators=[
            "\n\n# ",                   # Major sections
            "\nPrinciple ",             # New principles
            "\nEssential criteria:",    # Essential criteria sections
            "\nAdditional criterion:",  # Additional criteria
            "\nFootnotes",              # Footnotes sections
            "\n\n",                     # Paragraph breaks
            "\n",                       # Line breaks
            ". ",                       # Sentence endings
            " ",                        # Word boundaries
            "",                         # Character level (last resort)
        ],
        chunk_size=chunk_size * 4,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )

    result = []
    for source_doc in principle_docs:
        pieces = splitter.split_text(enhance_content_structure(source_doc.page_content))
        piece_count = len(pieces)

        for position, piece in enumerate(pieces):
            lowered = piece.lower()
            mentions_criteria = (
                'essential criteria' in lowered or 'additional criterion' in lowered
            )
            metadata = {
                **source_doc.metadata,
                'chunk_id': position,
                'total_chunks': piece_count,
                'chunk_type': identify_chunk_type(piece),
                'has_criteria': mentions_criteria,
                'has_footnotes': 'footnote' in lowered,
                'principle_section': extract_principle_section(piece),
            }
            result.append(
                Document(page_content=clean_chunk_content(piece), metadata=metadata)
            )

    return result

def enhance_content_structure(content):
    """Insert structural markers into principle text ahead of chunking.

    Section headings ("Essential criteria:", "Additional criterion:",
    "Footnote(s)" plus any trailing number) are rewritten as "### " headings
    on their own line, and every parenthesised number such as "(3)" is moved
    onto a new line.
    """
    # (pattern, replacement) pairs; applied strictly in this order.
    rewrites = (
        (r'(Essential criteria:)', r'\n\n### \1\n'),
        (r'(Additional criterion:)', r'\n\n### \1\n'),
        (r'(Footnotes?\s*\d*)', r'\n\n### \1\n'),
        (r'\((\d+)\)', r'\n(\1)'),  # Separate numbered criteria
    )
    for pattern, replacement in rewrites:
        content = re.sub(pattern, replacement, content)
    return content

def identify_chunk_type(chunk):
    """Classify a chunk by the first matching keyword rule.

    Rules are checked in priority order and the first hit wins: a chunk that
    mentions "principle" and contains a colon is a principle definition even
    if it also mentions criteria or footnotes.
    """
    lowered = chunk.lower()

    if ':' in chunk and 'principle' in lowered:
        return 'principle_definition'

    keyword_labels = (
        ('essential criteria', 'essential_criteria'),
        ('additional criterion', 'additional_criteria'),
        ('footnote', 'footnote'),
    )
    for keyword, label in keyword_labels:
        if keyword in lowered:
            return label

    # A parenthesised number such as "(2)" marks a numbered criterion.
    if re.search(r'\(\d+\)', chunk):
        return 'numbered_criteria'
    return 'general_content'

def extract_principle_section(chunk):
    """Label the section a chunk belongs to.

    The first parenthesised number in the chunk takes priority and yields
    "criteria_<n>". Otherwise the first matching topic phrase
    (case-insensitive) decides the label; "general" is the fallback.
    """
    numbered = re.search(r'\((\d+)\)', chunk)
    if numbered is not None:
        return f"criteria_{numbered.group(1)}"

    lowered = chunk.lower()
    # Checked in this order; first phrase found wins.
    topic_labels = (
        ('supervisory approach', 'supervisory_approach'),
        ('risk assessment', 'risk_assessment'),
        ('internal audit', 'internal_audit'),
        ('stress test', 'stress_testing'),
    )
    return next(
        (label for phrase, label in topic_labels if phrase in lowered),
        'general',
    )

def clean_chunk_content(chunk):
    """Normalise whitespace in a chunk before it is stored.

    Runs of three or more line breaks (optionally separated by whitespace)
    collapse to one blank line, runs of spaces/tabs collapse to a single
    space, and leading/trailing whitespace is stripped.
    """
    collapsed_blank_lines = re.sub(r'\n\s*\n\s*\n', '\n\n', chunk)
    single_spaced = re.sub(r'[ \t]+', ' ', collapsed_blank_lines)
    return single_spaced.strip()

# Add cross-references between chunks
def add_cross_references(chunks):
    """Link each chunk to its neighbours within the same principle.

    Chunks are grouped by their ``principle_number`` metadata ('unknown' when
    absent), keeping input order. Each chunk's metadata is updated IN PLACE
    with ``prev_chunk_id`` / ``next_chunk_id`` (the ``chunk_id`` of the
    adjacent chunk in its group, where one exists) and ``related_chunks``
    (the size of the group).

    Returns:
        The same list object that was passed in.
    """
    by_principle = {}
    for item in chunks:
        key = item.metadata.get('principle_number', 'unknown')
        by_principle.setdefault(key, []).append(item)

    for siblings in by_principle.values():
        sibling_count = len(siblings)
        last_index = sibling_count - 1
        for position, item in enumerate(siblings):
            if position > 0:
                item.metadata['prev_chunk_id'] = siblings[position - 1].metadata['chunk_id']
            if position < last_index:
                item.metadata['next_chunk_id'] = siblings[position + 1].metadata['chunk_id']
            item.metadata['related_chunks'] = sibling_count

    return chunks

In [None]:
# Re-chunk docs
# Uses the one-document-per-principle inputs rather than the per-page docs.
re_chunked_docs = create_principle_aware_chunks(
    combined_principles,
    chunk_size=500,  # Adjust based on your needs (multiplied by 4 inside the function)
    chunk_overlap=50
)

# Apply cross-references
# add_cross_references mutates chunk metadata in place and returns the same
# list, so final_chunks and re_chunked_docs refer to the same objects.
final_chunks = add_cross_references(re_chunked_docs)

print(f"Created {len(final_chunks)} chunks from {len(combined_principles)} principles")

# Analyze chunk distribution
# Tally how many chunks received each chunk_type label.
chunk_types = {}
for chunk in final_chunks:
    chunk_type = chunk.metadata['chunk_type']
    chunk_types[chunk_type] = chunk_types.get(chunk_type, 0) + 1

print("Chunk type distribution:")
for chunk_type, count in chunk_types.items():
    print(f"  {chunk_type}: {count}")

Created 133 chunks from 29 principles
Chunk type distribution:
  principle_definition: 40
  essential_criteria: 19
  footnote: 18
  numbered_criteria: 38
  general_content: 16
  additional_criteria: 2


In [None]:
# Save docs in case I want to pick up at this point
# NOTE: this import sits mid-notebook rather than in the import cell, so this
# cell must run before anything else that uses pickle.
import pickle

# Save documents
# Written to the current working directory. Pickle files can execute code when
# loaded, so only ever reload a file this notebook produced.
with open('re_chunked_docs.pkl', 'wb') as f:
    pickle.dump(re_chunked_docs, f)

In [None]:
# # Load documents (optionally)
# with open('re_chunked_docs.pkl', 'rb') as f:
#     re_chunked_docs = pickle.load(f)

In [None]:
# Upgraded RAG Setup
# NOTE: this rebinds `embedding_function` and (below) `qa_chain`, replacing
# the objects created in the basic RAG setup cell. The basic `vectorstore`
# is left untouched; the new index is `vectorstore2`.
embedding_function = HuggingFaceEmbeddings(model_name='all-mpnet-base-v2',
                                           encode_kwargs={'normalize_embeddings': True})
# Index the principle-aware chunks instead of the baseline per-page chunks.
vectorstore2 = FAISS.from_documents(re_chunked_docs, embedding_function)

# MMR retrieval settings; see the LangChain vector store retriever docs for
# the exact meaning of k, fetch_k and lambda_mult.
retriever = vectorstore2.as_retriever(
    search_type="mmr",  # Maximum Marginal Relevance for diversity
    search_kwargs={
        "k": 15,
        "fetch_k": 40,
        "lambda_mult": 0.5
    }
)

# Langchain flow
# This chain answers with Gemini, whereas the basic chain used Claude, so the
# two test answers differ in both retrieval strategy and LLM.
qa_chain = RetrievalQA.from_chain_type(
    llm=gemini,
    retriever=retriever,
    return_source_documents=True
)

In [None]:
# Test the Upgraded RAG
query = "What is principle 9 of the BIS framework?"
# Calling the chain object directly (qa_chain(query)) is deprecated in
# LangChain; invoke() is the supported entry point. RetrievalQA takes the
# question under the "query" input key.
result = qa_chain.invoke({"query": query})

# Show the answer, then the start of each retrieved chunk so retrieval
# quality can be judged by eye.
print("\nAnswer:\n", result["result"])
for doc in result["source_documents"]:
    print(f"\nSource: {doc.metadata['source']}")
    print(doc.page_content[:300], "...")


Answer:
 Principle 9 of the BIS framework is about Supervisory techniques and tools. The supervisor uses an appropriate range of techniques and tools to implement the supervisory approach and deploys supervisory resources on a proportionate basis, considering the risk profile and systemic importance of banks.

Source: /tmp/principle_3.pdf
Principle 3 - Cooperation and collaboration
40.8 Principle 3: Laws, regulations or other arrangements provide a framework for
cooperation and collaboration with relevant domestic authorities and foreign
supervisors. These arrangements reflect the need to protect confidential information.6

### Footn ...

Source: /tmp/principle_9.pdf
statements and accounts; (b) business model analysis; (c) horizontal peer reviews; (d)
analysis of corporate governance, including risk management and internal control
systems; (e) reviews of the outcome of stress tests undertaken by the banks; and (f)
assessments of the adequacy of banks' capital a ...

Source: /tmp/prin

### Add Topic Modelled Results to the Vector Store

In [None]:
class ClaudePDFChunker:
    """Split PDF-derived documents into sections and annotate each one with Claude.

    Workflow: pages are grouped by source file, merged into one text, split with
    a strategy chosen from keywords found in the text, and every resulting
    section is sent to Claude, whose JSON reply is stored under
    ``chunk['claude_analysis']``.
    """

    # Anthropic model and response budget used for the per-section analysis.
    MODEL = "claude-3-5-sonnet-20241022"
    MAX_TOKENS = 1500
    # Only this many leading characters of a section are embedded in the prompt.
    MAX_PROMPT_CHARS = 3000

    def __init__(self, api_key: str):
        """Create the Anthropic client used by ``analyze_chunk_with_claude``."""
        self.client = Anthropic(api_key=api_key)

    def combine_pdf_pages(self, docs: List) -> str:
        """Join the text of ``docs`` with blank lines and strip the result.

        Each item contributes ``page_content`` if present, otherwise ``content``,
        otherwise its ``str()`` representation.
        """
        parts = []
        for doc in docs:
            if hasattr(doc, 'page_content'):
                parts.append(doc.page_content)
            elif hasattr(doc, 'content'):
                parts.append(doc.content)
            else:
                parts.append(str(doc))
        return "\n\n".join(parts).strip()

    def group_docs_by_source(self, all_docs: List) -> Dict[str, List]:
        """Group documents by the basename of their source file.

        The name is read from ``metadata['source']``, falling back to
        ``metadata['file_path']``; documents with neither (or without usable
        metadata) are collected under ``"unknown"``.
        """
        docs_by_source: Dict[str, List] = {}

        for doc in all_docs:
            # Tolerate objects with no metadata attribute or metadata=None.
            metadata = getattr(doc, 'metadata', None) or {}
            if 'source' in metadata:
                source = os.path.basename(metadata['source'])
            elif 'file_path' in metadata:
                source = os.path.basename(metadata['file_path'])
            else:
                source = "unknown"

            docs_by_source.setdefault(source, []).append(doc)

        return docs_by_source

    def split_into_sections(self, document_text: str) -> List[Dict]:
        """Pick a splitting strategy from keywords in the text and apply it.

        Text mentioning "transcript" or "earnings" uses the transcript splitter;
        text mentioning both "summary" and "risk" uses the heading splitter;
        everything else uses the generic paragraph splitter.
        """
        lowered = document_text.lower()

        if "transcript" in lowered or "earnings" in lowered:
            return self.split_earnings_transcript(document_text)
        if "summary" in lowered and "risk" in lowered:
            return self.split_risk_summary(document_text)
        return self.split_generic_document(document_text)

    def split_earnings_transcript(self, text: str) -> List[Dict]:
        """Split transcript-style text into summary, per-transcript and consolidated chunks.

        Matches that are empty after stripping are skipped, so a text that
        starts directly with a transcript marker does not yield a blank chunk.
        """
        chunks = []

        # Summary section: everything before the first results/transcript marker
        summary_match = re.search(r'^(.*?)(?=Full results|--- Running|Transcript \d+)', text, re.DOTALL)
        if summary_match and summary_match.group(1).strip():
            chunks.append({
                'content': summary_match.group(1).strip(),
                'section_type': 'summary'
            })

        # Transcript sections: "(Analyzing )Transcript N ... --- <body>" up to the next marker
        transcript_sections = re.findall(
            r'(?:Transcript|Analyzing Transcript) (\d+).*?---(.*?)(?=(?:Transcript|Analyzing Transcript) \d+|\-{20,}|All Topics|$)',
            text, re.DOTALL
        )

        for transcript_num, content in transcript_sections:
            if not content.strip():
                continue
            chunks.append({
                'content': content.strip(),
                'section_type': 'transcript_analysis',
                'transcript_num': transcript_num
            })

        # Consolidated section: from the first "All Topics" to the end of the text
        consolidated_match = re.search(r'(All Topics.*?)$', text, re.DOTALL)
        if consolidated_match:
            chunks.append({
                'content': consolidated_match.group(1).strip(),
                'section_type': 'consolidated'
            })

        return chunks

    def split_risk_summary(self, text: str) -> List[Dict]:
        """Split on markdown headings (``#`` to ``###`` followed by a capital letter).

        A section is typed ``summary`` or ``conclusion`` when one of those
        keywords appears in its first 100 characters, else ``risk_section``.
        """
        chunks = []

        # Split by major headings
        sections = re.split(r'\n(?=#{1,3}\s+[A-Z])', text)

        for i, section in enumerate(sections):
            if section.strip():
                # Determine section type from heading
                section_type = "risk_section"
                if re.search(r'summary|overview', section[:100], re.IGNORECASE):
                    section_type = "summary"
                elif re.search(r'conclusion|recommendation', section[:100], re.IGNORECASE):
                    section_type = "conclusion"

                chunks.append({
                    'content': section.strip(),
                    'section_type': section_type,
                    'section_num': i + 1
                })

        return chunks

    def split_generic_document(self, text: str) -> List[Dict]:
        """Split on blank lines followed by a capital letter, keeping sections over 100 characters."""
        # Split by double line breaks or major headings
        sections = re.split(r'\n\s*\n\s*(?=[A-Z])', text)

        chunks = []
        for i, section in enumerate(sections):
            if len(section.strip()) > 100:  # Only include substantial sections
                chunks.append({
                    'content': section.strip(),
                    'section_type': 'section',
                    'section_num': i + 1
                })

        return chunks

    def get_analysis_prompt(self, chunk_content: str) -> str:
        """Build the Claude prompt for one section (content truncated to MAX_PROMPT_CHARS)."""
        return f"""
        Analyze this banking/financial document section and extract:

        1. Risk categories (credit, operational, liquidity, market, reputational, compliance)
        2. Key topics and themes
        3. Risk severity if mentioned
        4. Important metrics, ratios, or numbers
        5. Main concerns or issues
        6. Regulatory mentions

        Return as JSON:
        {{
            "risk_categories": ["category1", "category2"],
            "topics": ["topic1", "topic2"],
            "risk_level": "high/medium/low or null",
            "main_concerns": ["concern1", "concern2"],
            "metrics": ["metric1: value", "metric2: value"],
            "regulatory_mentions": ["regulation1", "regulation2"],
            "summary": "Brief summary of this section"
        }}

        Document section:
        {chunk_content[:self.MAX_PROMPT_CHARS]}
        """

    def analyze_chunk_with_claude(self, chunk: Dict, doc_name: str) -> Dict:
        """Ask Claude to analyse ``chunk['content']`` and attach the result in place.

        On success ``chunk['claude_analysis']`` holds the parsed JSON object.
        When no JSON object is found, or it does not parse, it holds an
        ``error`` message plus the ``raw_response`` so the reply is not lost.
        Any other failure (API error, empty response) is printed and recorded
        as ``{"error": <message>}``. The same ``chunk`` dict is returned.
        """
        prompt = self.get_analysis_prompt(chunk['content'])

        try:
            response = self.client.messages.create(
                model=self.MODEL,
                max_tokens=self.MAX_TOKENS,
                temperature=0,
                messages=[{"role": "user", "content": prompt}]
            )

            response_text = response.content[0].text
            # Greedy match: from the first "{" to the last "}" of the reply.
            json_match = re.search(r'\{.*\}', response_text, re.DOTALL)

            if json_match:
                try:
                    chunk['claude_analysis'] = json.loads(json_match.group())
                except json.JSONDecodeError as e:
                    # Keep the raw reply for inspection instead of only the parser message.
                    print(f"Claude analysis failed for chunk in {doc_name}: {e}")
                    chunk['claude_analysis'] = {"error": "Could not parse JSON", "raw_response": response_text}
            else:
                chunk['claude_analysis'] = {"error": "Could not parse JSON", "raw_response": response_text}

        except Exception as e:
            print(f"Claude analysis failed for chunk in {doc_name}: {e}")
            chunk['claude_analysis'] = {"error": str(e)}

        return chunk

    @staticmethod
    def _unique_chunk_id(base_id: str, used_ids: set) -> str:
        """Return ``base_id``, or ``base_id_2``, ``base_id_3``... if already taken, and record it."""
        candidate = base_id
        suffix = 2
        while candidate in used_ids:
            candidate = f"{base_id}_{suffix}"
            suffix += 1
        used_ids.add(candidate)
        return candidate

    def process_single_document(self, document_text: str, doc_name: str, doc_id: str) -> List[Dict]:
        """Split one document into chunks, give each a unique id and analyse it with Claude.

        Args:
            document_text: Full text of the document.
            doc_name: Display name stored on every chunk.
            doc_id: Identifier used as the prefix of every ``chunk_id``.

        Returns:
            One dict per section with ``chunk_id``, ``doc_name``, ``doc_id``,
            ``content``, ``section_type``, ``char_count``, any extra keys the
            splitter produced, and ``claude_analysis``.
        """
        print(f"Processing document: {doc_name}")

        # Split into sections
        sections = self.split_into_sections(document_text)
        print(f"  Found {len(sections)} sections")

        # Create chunks with IDs
        chunks = []
        used_ids = set()
        for i, section in enumerate(sections):
            if 'transcript_num' in section:
                base_id = f"{doc_id}_transcript_{section['transcript_num']}"
            elif section['section_type'] == 'summary':
                base_id = f"{doc_id}_summary"
            elif section['section_type'] == 'consolidated':
                base_id = f"{doc_id}_consolidated"
            else:
                base_id = f"{doc_id}_section_{i+1}"

            chunk = {
                # Several sections can be typed 'summary' (or repeat a transcript
                # number); de-duplicate so ids stay usable as keys.
                'chunk_id': self._unique_chunk_id(base_id, used_ids),
                'doc_name': doc_name,
                'doc_id': doc_id,
                'content': section['content'],
                'section_type': section['section_type'],
                'char_count': len(section['content'])
            }

            # Add any additional metadata
            for key, value in section.items():
                if key not in ['content', 'section_type']:
                    chunk[key] = value

            chunks.append(chunk)

        # Analyze each chunk with Claude
        print(f"  Analyzing {len(chunks)} chunks with Claude...")
        for i, chunk in enumerate(chunks):
            print(f"    Chunk {i+1}/{len(chunks)}: {chunk['chunk_id']}")
            chunks[i] = self.analyze_chunk_with_claude(chunk, doc_name)

        return chunks

def _write_json(path: str, payload) -> None:
    """Serialise ``payload`` to ``path`` as indented UTF-8 JSON."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(payload, f, indent=2, ensure_ascii=False)


def process_all_documents(all_docs: List, api_key: str, output_dir: str = "chunked_outputs"):
    """Chunk and analyse every source document, writing the results as JSON.

    Documents are grouped by source file; each group is merged, split and
    analysed by ``ClaudePDFChunker``. Three kinds of file are written to
    ``output_dir``: ``<doc_id>_chunks.json`` per document,
    ``all_chunks_combined.json`` and ``processing_summary.json``.

    Args:
        all_docs: Page-level documents to process.
        api_key: Anthropic API key passed to ``ClaudePDFChunker``.
        output_dir: Directory for the JSON outputs (created if missing).

    Returns:
        The list of all chunk dicts across every document.

    Note:
        Relies on ``datetime`` being imported elsewhere in the notebook.
    """
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # Initialize chunker
    chunker = ClaudePDFChunker(api_key)

    # Group documents by source file
    docs_by_source = chunker.group_docs_by_source(all_docs)

    print(f"Found {len(docs_by_source)} unique documents to process")
    print("Documents found:")
    for source, pages in docs_by_source.items():
        print(f"  - {source}: {len(pages)} pages")
    print()

    all_chunks = []

    # Process each document
    for doc_name, doc_pages in docs_by_source.items():
        # Combine all pages into single text
        document_text = chunker.combine_pdf_pages(doc_pages)

        # Create document ID from filename
        doc_id = re.sub(r'[^a-zA-Z0-9_]', '_', doc_name.replace('.pdf', ''))

        # Process the document
        doc_chunks = chunker.process_single_document(document_text, doc_name, doc_id)

        # Save individual document chunks
        doc_output_file = os.path.join(output_dir, f"{doc_id}_chunks.json")
        _write_json(doc_output_file, doc_chunks)

        print(f"  ✓ Saved {len(doc_chunks)} chunks to {doc_output_file}")
        all_chunks.extend(doc_chunks)
        print()

    # Save all chunks together
    all_chunks_file = os.path.join(output_dir, "all_chunks_combined.json")
    _write_json(all_chunks_file, all_chunks)

    # Tally chunks per document in one pass; keys follow processing order, so
    # the summary file is stable between runs (iterating a set is not).
    chunks_per_document: Dict[str, int] = {}
    for chunk in all_chunks:
        chunks_per_document[chunk['doc_id']] = chunks_per_document.get(chunk['doc_id'], 0) + 1

    # Create summary
    summary = {
        "processing_date": datetime.now().isoformat(),
        "total_documents": len(docs_by_source),
        "total_chunks": len(all_chunks),
        "documents_processed": list(docs_by_source.keys()),
        "chunks_per_document": chunks_per_document
    }

    summary_file = os.path.join(output_dir, "processing_summary.json")
    _write_json(summary_file, summary)

    print("🎉 Processing complete!")
    print(f"  Total documents processed: {len(docs_by_source)}")
    print(f"  Total chunks created: {len(all_chunks)}")
    print(f"  Output directory: {output_dir}")
    print(f"  Combined chunks file: {all_chunks_file}")
    print(f"  Processing summary: {summary_file}")

    return all_chunks

In [None]:
# Chunk and analyse the topic-modelling PDFs. This sends one Claude request per chunk
# and writes the JSON outputs under "topic_modelled_chunks/".
# The API key is read from Colab's secret store rather than hard-coded.
topic_chunks = process_all_documents(topic_modelled_docs, userdata.get('ANTHROPIC_API_KEY'), "topic_modelled_chunks")

Found 4 unique documents to process
Documents found:
  - UBS Topic Modelling Results.pdf: 24 pages
  - JPMorgan Topic Modelling Results.pdf: 24 pages
  - Citi Bank Topic Modelling Results.pdf: 24 pages
  - Silicon Vallet Topic Modelling Results.pdf: 24 pages

Processing document: UBS Topic Modelling Results.pdf
  Found 11 sections
  Analyzing 11 chunks with Claude...
    Chunk 1/11: UBS_Topic_Modelling_Results_summary
    Chunk 2/11: UBS_Topic_Modelling_Results_transcript_1
    Chunk 3/11: UBS_Topic_Modelling_Results_transcript_2
    Chunk 4/11: UBS_Topic_Modelling_Results_transcript_3
    Chunk 5/11: UBS_Topic_Modelling_Results_transcript_4
    Chunk 6/11: UBS_Topic_Modelling_Results_transcript_5
    Chunk 7/11: UBS_Topic_Modelling_Results_transcript_6
    Chunk 8/11: UBS_Topic_Modelling_Results_transcript_7
    Chunk 9/11: UBS_Topic_Modelling_Results_transcript_8
    Chunk 10/11: UBS_Topic_Modelling_Results_transcript_9
    Chunk 11/11: UBS_Topic_Modelling_Results_consolidated
  ✓ Sa

In [None]:
# Inspection only: prints every chunk dict in full (content plus Claude metadata),
# so the output is very long.
for chunk in topic_chunks:
    print(chunk)

{'chunk_id': 'UBS_Topic_Modelling_Results_summary', 'doc_name': 'UBS Topic Modelling Results.pdf', 'doc_id': 'UBS_Topic_Modelling_Results', 'content': "Summary \n## Supervisory-Level Summary of Earnings Call Transcripts \n \n### Emerging Prudential Risks \n**Summary:** Over the past 2-3 years, the transcripts reveal significant concerns regarding \nemerging prudential risks, particularly in the context of regulatory scrutiny and operational \nresilience. The bank's exposure to low-interest-rate environments, particularly in the context \nof its net interest margin, has been a recurring theme. Additionally, the bank's operational \nrisk framework is under pressure due to increasing reliance on digital transformation and \npotential vulnerabilities in cybersecurity. \n \n**Topic Summaries:** \n1. **Low-Interest Rate Environment** \n   - **Keywords:** Interest rates, margin pressure, earnings impact \n   - **Risk Salience Score:** 4 \n   - **Supervisory Summary:** The bank faces ongoing c

In [None]:
# Add Topic Modelled Docs to the vectorstore
# Each analysed chunk becomes one LangChain Document that carries its origin metadata.
topic_docs_for_vectorstore = []

for chunk in topic_chunks:
    doc = Document(
        page_content=chunk['content'],
        metadata=dict(
            source=chunk.get('doc_name', ''),
            chunk_id=chunk.get('chunk_id', ''),
            section_type=chunk.get('section_type', ''),
        ),
    )
    topic_docs_for_vectorstore += [doc]

# Add to existing vectorstore (the returned ids are displayed as the cell output)
vectorstore2.add_documents(topic_docs_for_vectorstore)

['d59c36a4-7b6e-4723-b5b8-4f328fdedaaf',
 '53991be0-ecec-46a7-b037-234d946c0b16',
 '79fb3c13-0246-4b77-8599-1942f43b051a',
 '94034b26-d06b-4dc5-a8c8-858e350b09da',
 '2fa73363-d253-4f11-966c-033f95e714d1',
 '92033866-b2c1-4edc-ab39-5f246694e753',
 '3e105037-9aca-4da8-bfea-7aa29c5101c5',
 '6bc4f909-d661-4a9b-9d62-b03a104ccc5c',
 '6d821118-66a7-4f27-994a-6474f8962914',
 '79c2f788-2eef-40a8-b9ac-40b09063ef07',
 'b7098b0e-4b78-4820-80c5-84d0a2cae94a',
 '043b9594-302d-4814-bfe8-1c41f03595cf',
 'fb7ea491-96c1-4539-a658-d1a0d7ac279b',
 '227ebd83-ab85-4ddf-82af-a6a694f0519f',
 '55c40c97-2620-49ae-8f57-8cf68adac836',
 'f13879d9-1db1-48e7-99d8-c91bc4adeeee',
 'c2749407-7c9b-4bc8-9240-952d03822505',
 '89126250-fa0c-4f9d-acc9-7340c254548f',
 'e8bdca50-d762-495c-ac1c-a449145f4646',
 'cf4d1b60-cacf-4723-9736-74a34a06449f',
 'da337108-e2da-463d-ac74-28671e78af95',
 'ae9919de-c39d-4e85-9106-3f53b1ddfb7b',
 '7516581c-1d2f-4b17-925b-08d3e593c7f3',
 '14c2b012-01f4-41e6-823e-f6b263654dc4',
 '5a8bbb3c-ce26-

### ChatBot

In [None]:
# Prompt used to generate the suggested questions shown in the chatbot UI.
# It asks for a Python-style list literal of five short questions; the next cell
# parses the reply with ast.literal_eval, so the "valid list" requirement matters.
get_questions_for_chatbot = """
You are a financial risk specialist working with large banks such as JP Morgan and Citi. Your job is to come up with questions about the financial risk of the bank and any things that could concern central banks or regulators such as the Bank of England.
You will provide five short questions to the bank in the form of a list as shown below:
["How has the liquidity of the bank been assessed for the coming year?", "What is the outlook for cash generation?", ...]

Requirements:
- Your output must be a valid list, with each question contained in quotes with no other quotes within the question.
- Ask 1 basic questions about the Basel Principles
- Your output must be no longer than 20 words per question.
- The questions should consider specific financial risk of the banks in question as well as any changes in the regulatory environment.
"""

In [None]:
# Ensure questions are in a format suitable for the Chatbot
# The model may wrap the list in prose or a code fence, so isolate the outermost
# [...] before parsing, and fail with a readable message if it is missing or malformed.
_questions_reply = get_responses(get_questions_for_chatbot, "claude").content
_questions_match = re.search(r'\[.*\]', _questions_reply, re.DOTALL)
if _questions_match is None:
    raise ValueError(f"No list found in Claude's reply: {_questions_reply!r}")

questions_for_interface = ast.literal_eval(_questions_match.group())
if not (isinstance(questions_for_interface, list)
        and all(isinstance(q, str) for q in questions_for_interface)):
    raise ValueError(f"Expected a list of question strings, got: {questions_for_interface!r}")

Question: 
You are a financial risk specialist working with large banks such as JP Morgan and Citi. Your job is to come up with questions about the financial risk of the bank and any things that could concern central banks or regulators such as the Bank of England.
You will provide five short questions to the bank in the form of a list as shown below:
["How has the liquidity of the bank been assessed for the coming year?", "What is the outlook for cash generation?", ...]

Requirements:
- Your output must be a valid list, with each question contained in quotes with no other quotes within the question.
- Ask 1 basic questions about the Basel Principles
- Your output must be no longer than 20 words per question.
- The questions should consider specific financial risk of the banks in question as well as any changes in the regulatory environment.


Claude's Response:
[
"How does the bank plan to meet Basel III capital requirements over the next 12 months?",
"What impact could rising interes

In [None]:
# Enhanced RAG Chatbot with Banking Risk Questions

class BankingRiskChatbot:
    """Answer questions through a retrieval QA chain and format replies for a Gradio chat.

    Attributes:
        qa_chain: Chain whose ``invoke({"query": ...})`` returns a mapping with
            ``result`` and ``source_documents``.
        vectorstore: Vector store backing the chain (stored, not used by the methods here).
        pre_generated_questions: Suggested questions for the UI.
        conversation_history: Starts empty; not updated by ``get_response``.
    """

    def __init__(self, qa_chain, vectorstore, pre_generated_questions):
        """Store the chain, vector store and suggested questions."""
        self.qa_chain = qa_chain
        self.vectorstore = vectorstore
        self.pre_generated_questions = pre_generated_questions
        self.conversation_history = []

    def get_response(self, query, chat_history):
        """Process user query and return response with sources.

        Args:
            query: The user's question.
            chat_history: List of ``[user, bot]`` pairs; extended in place.
                ``None`` is treated as an empty history.

        Returns:
            ``(chat_history, "")`` — the updated history and an empty string
            that clears the input textbox. Failures in the chain are reported
            as a chat message instead of being raised.
        """
        if chat_history is None:
            chat_history = []

        try:
            # Use the QA chain to get response
            result = self.qa_chain.invoke({"query": query})

            # Extract answer and sources
            answer = result.get("result", "I'm sorry, I couldn't find an answer to your question.")
            source_docs = result.get("source_documents", [])

            # Combine answer with formatted sources
            sources_text = self.format_sources(source_docs)
            reply = f"{answer}\n\n**Sources:**\n{sources_text}"
        except Exception as e:
            # Deliberate catch-all: surface the problem in the chat rather than crash the UI.
            reply = f"Sorry, I encountered an error: {str(e)}"

        chat_history.append([query, reply])
        return chat_history, ""

    def format_sources(self, source_docs, max_sources=3):
        """Format source documents for display.

        Returns a markdown string listing up to ``max_sources`` documents, each
        with its source name, chunk or section type (if any) and a snippet of
        at most 200 characters, or ``"No sources found."`` for an empty input.
        """
        if not source_docs:
            return "No sources found."

        sources = []
        for position, doc in enumerate(source_docs[:max_sources], start=1):
            metadata = doc.metadata or {}
            source_name = metadata.get('source', 'Unknown source')
            label = metadata.get('chunk_type', '') or metadata.get('section_type', '')

            # Drop only the leading temp-directory prefix, not later occurrences.
            if source_name.startswith('/tmp/'):
                source_name = source_name[len('/tmp/'):]

            source_info = f"**{position}. {source_name}**"
            if label:
                source_info += f" ({label})"

            # Collapse whitespace so blank lines and line-leading '#' in the
            # source text cannot break out of the italic wrapper below.
            snippet = " ".join(doc.page_content.split())
            if len(snippet) > 200:
                snippet = snippet[:200] + "..."
            source_info += f"\n*{snippet}*"

            sources.append(source_info)

        return "\n\n".join(sources)

# Initialize the chatbot: one shared instance used by every Gradio callback below.
# Answers come from qa_chain (the RetrievalQA chain built over vectorstore2);
# questions_for_interface supplies the suggested-question buttons.
chatbot = BankingRiskChatbot(qa_chain, vectorstore2, questions_for_interface)

def chat_response(message, history):
    """Send a typed message to the shared chatbot.

    Returns the updated chat history and the value for the input textbox,
    exactly as produced by ``chatbot.get_response``.
    """
    updated_history, textbox_value = chatbot.get_response(message, history)
    return updated_history, textbox_value

def use_suggested_question(question, history):
    """Answer a suggested question as if the user had typed it.

    Returns the updated chat history and the value for the input textbox,
    exactly as produced by ``chatbot.get_response``.
    """
    updated_history, textbox_value = chatbot.get_response(question, history)
    return updated_history, textbox_value

# Create Gradio interface
# Gradio UI: input box, chat window, suggested-question buttons and controls.
with gr.Blocks(
    title="Banking Risk Analysis Chatbot",
    theme=gr.themes.Soft(),
    css="""
    .suggested-questions {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        border-radius: 10px;
        padding: 15px;
        margin: 10px 0;
    }
    .question-btn {
        margin: 3px;
        background: rgba(255,255,255,0.2);
        border: 1px solid rgba(255,255,255,0.3);
        color: black;
        border-radius: 8px;
        padding: 8px 12px;
        transition: all 0.3s ease;
    }
    .question-btn:hover {
        background: rgba(255,255,255,0.3);
        transform: translateY(-2px);
    }
    """
) as demo:

    gr.Markdown(
        """
        # 🏦 Team Stompa - Banking Risk Analysis Chatbot

        Ask questions about banking regulations, risk management, or use the suggested questions below.
        """
    )

    # Message input
    msg = gr.Textbox(
        label="Ask a question about banking risk...",
        placeholder="e.g., What is Principle 25 about operational risk?",
        lines=2,
        max_lines=5
    )

    # Chat interface
    chatbot_interface = gr.Chatbot(
        label="Banking Risk Assistant",
        height=400,
        show_label=True,
        show_copy_button=True,
        bubble_full_width=False
    )

    # Suggested questions section
    with gr.Group(elem_classes="suggested-questions"):
        gr.Markdown("### 💡 Suggested Questions")
        gr.Markdown("Click any question below to get started:")

        # Create buttons for each pre-generated question
        question_buttons = []
        for question in questions_for_interface:
            btn = gr.Button(
                question,
                elem_classes="question-btn",
                size="sm"
            )
            question_buttons.append(btn)

    # Control buttons
    with gr.Row():
        clear_btn = gr.Button("🗑️ Clear Chat", variant="secondary")
        submit_btn = gr.Button("📤 Send", variant="primary")

    # Event handlers
    def respond(message, chat_history):
        """Answer a typed message; blank input leaves the chat unchanged."""
        if message.strip():
            return chatbot.get_response(message, chat_history)
        return chat_history, ""

    def clear_chat():
        """Reset the stored history and empty both the chat window and the textbox."""
        chatbot.conversation_history = []
        return [], ""

    # Wire up the interface
    msg.submit(respond, [msg, chatbot_interface], [chatbot_interface, msg])
    submit_btn.click(respond, [msg, chatbot_interface], [chatbot_interface, msg])
    clear_btn.click(clear_chat, [], [chatbot_interface, msg])

    # Wire up suggested question buttons.
    # The current chat is passed in so a click appends to the conversation
    # instead of replacing it; q=btn.value binds each button's own question.
    for btn in question_buttons:
        btn.click(
            lambda history, q=btn.value: use_suggested_question(q, history or []),
            [chatbot_interface],
            [chatbot_interface, msg]
        )


    # Footer (answers are generated by the Gemini-backed QA chain; Claude is
    # used for document analysis and for generating the suggested questions)
    gr.Markdown(
        """
        ---
        **Data Sources:** Basel Principles, Banking Transcripts, Topic Modeling Results
        **Powered by:** Gemini (answers), Claude 3.5 Sonnet (document analysis & suggested questions), FAISS Vector Search, LangChain RAG
        """
    )

  chatbot_interface = gr.Chatbot(
  chatbot_interface = gr.Chatbot(


In [None]:
# Launch with public sharing for Colab
# Note: with debug=True this cell keeps running until interrupted, so run it last.
demo.launch(
    share=True,  # Creates public URL for sharing
    debug=True,  # stream errors and logs into the cell output
    server_name="0.0.0.0",  # listen on all network interfaces
    server_port=7860,  # fixed port
    show_error=True
)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://7726f95db0d052b869.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
