<a href="https://colab.research.google.com/github/zamanmiraz/NIW-NP-RAG/blob/main/NIW_NP_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Fetch the project repository, change into its directory, and install the
# packages listed in its requirements.txt.
! git clone https://github.com/zamanmiraz/NIW-NP-RAG.git
%cd NIW-NP-RAG
! pip install -r requirements.txt

In [None]:
# Pin torch and transformers to fixed versions, then install/upgrade the
# LangChain packages, pypdf, and faiss-cpu used by the cells below.
# NOTE(review): fitz (PyMuPDF), sentence_transformers and rich_theme_manager are
# imported later but are not installed in this cell — presumably they come from
# requirements.txt; verify.
! pip install --force-reinstall torch==2.3.1 torchvision torchaudio
! pip install --force-reinstall transformers==4.41.2
! pip install --upgrade langchain langchain-community
! pip install --upgrade langchain-experimental
! pip install pypdf
! pip install -qU langchain-huggingface
! pip install faiss-cpu
! pip install -qU "langchain[google-genai]"

In [None]:
from rich.console import Console
from rich.style import Style
import pathlib
from rich_theme_manager import Theme, ThemeManager

# Theme definitions handed to the ThemeManager further down in this cell.
# Each Theme maps style names (the "repr.*" keys plus "default") to either a
# style string or a rich Style object. Only the dark theme sets `tags`.
THEMES = [
    Theme(
        name="dark",
        description="Dark mode theme",
        tags=["dark"],
        styles={
            "repr.own": Style(color="#e87d3e", bold=True),      # Class names
            "repr.tag_name": "dim cyan",                        # Tag names
            "repr.call": "bright_yellow",                       # Function calls and other symbols
            "repr.str": "bright_green",                         # String representation
            "repr.number": "bright_red",                        # Numbers
            "repr.none": "dim white",                           # None
            "repr.attrib_name": Style(color="#e87d3e", bold=True),    # Attribute names
            "repr.attrib_value": "bright_blue",                 # Attribute values
            "default": "bright_white on black"                  # Default text and background
        },
    ),
    Theme(
        name="light",
        description="Light mode theme",
        styles={
            "repr.own": Style(color="#22863a", bold=True),          # Class names
            "repr.tag_name": Style(color="#00bfff", bold=True),     # Tag names
            "repr.call": Style(color="#ffff00", bold=True),         # Function calls and other symbols
            "repr.str": Style(color="#008080", bold=True),          # String representation
            "repr.number": Style(color="#ff6347", bold=True),       # Numbers
            "repr.none": Style(color="#808080", bold=True),         # None
            "repr.attrib_name": Style(color="#ffff00", bold=True),  # Attribute names
            "repr.attrib_value": Style(color="#008080", bold=True), # Attribute values
            "default": Style(color="#000000", bgcolor="#ffffff"),   # Default text and background
        },
    ),
]

# Directory passed to ThemeManager as its theme_dir; create it if it is missing.
theme_dir = pathlib.Path("themes").expanduser()
theme_dir.mkdir(parents=True, exist_ok=True)

# Register the themes defined above and print the list of available themes.
theme_manager = ThemeManager(theme_dir=theme_dir, themes=THEMES)
theme_manager.list_themes()

# Look up both themes by name; only the dark one is applied to the console here.
dark, light = (theme_manager.get(theme_name) for theme_name in ("dark", "light"))
# theme_manager.preview_theme(dark)

console = Console(theme=dark)

In [None]:
import google.generativeai as genai
from google.colab import userdata
from langchain.chat_models import init_chat_model

# Read the Gemini API key from the Colab user-data store (google.colab.userdata).
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
# Configure the google.generativeai client with the same key.
genai.configure(api_key=GOOGLE_API_KEY)

# Chat model that answers the final query at the end of this notebook.
llm = init_chat_model("gemini-2.5-flash", model_provider="google_genai", google_api_key=GOOGLE_API_KEY)

In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
# from langchain_google_genai.embeddings import GoogleGenerativeAIEmbeddings
from sentence_transformers import SentenceTransformer
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
import glob # Import the glob module
import fitz # Import fitz for read_pdf_to_string

In [None]:
# Load the "all-MiniLM-L6-v2" sentence-transformers model.
# NOTE(review): the encode_pdf* helpers below build their own
# HuggingFaceEmbeddings from the same model name rather than using this object.
model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
# SemanticChunker calls embed_documents() on its embeddings argument. The
# LangChain HuggingFaceEmbeddings wrapper provides that method; a raw
# SentenceTransformer does not, so wrap the model name instead of passing the
# SentenceTransformer instance directly.
chunker = SemanticChunker(
    HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2"),
    breakpoint_threshold_type="percentile",
)

In [None]:
def read_pdf_to_string(path):
    """
    Read a PDF document from the specified path and return its content as a string.

    Args:
        path (str): The file path to the PDF document.

    Returns:
        str: The concatenated text content of all pages in the PDF document,
        in page order. An empty string is returned for a document with no pages.

    The function uses the 'fitz' library (PyMuPDF) to open the PDF document and
    extract the text of each page.
    """
    # Open the document in a context manager so the underlying file handle is
    # released even if text extraction raises part-way through.
    with fitz.open(path) as doc:
        # Join once instead of growing a string with += on every page.
        return "".join(page.get_text() for page in doc)

def replace_t_with_space(list_of_documents):
    """
    Swap every tab character ('\t') for a single space in each document's text.

    The documents are modified in place: each object's 'page_content'
    attribute is reassigned to the cleaned string.

    Args:
        list_of_documents: A list of document objects, each with a 'page_content' attribute.

    Returns:
        The same list object that was passed in, with its documents updated.
    """
    tab_to_space = str.maketrans("\t", " ")  # one-to-one character mapping
    for document in list_of_documents:
        document.page_content = document.page_content.translate(tab_to_space)
    return list_of_documents

def encode_pdf(directory_path, chunk_size=1000, chunk_overlap=200):
    """
    Encodes all PDF files in a directory into a vector store using Hugging Face (sentence transformer) embeddings.

    Args:
        directory_path: The path to the directory containing PDF files.
        chunk_size: The desired size of each text chunk, measured with len().
        chunk_overlap: The amount of overlap between consecutive chunks.

    Returns:
        A FAISS vector store containing the encoded PDF content.

    Raises:
        ValueError: If the directory contains no '*.pdf' files, or if the PDFs
            yield no text chunks to embed.
    """

    # Sort the matches so documents are indexed in the same order on every run
    # (glob makes no ordering guarantee).
    pdf_files = sorted(glob.glob(f"{directory_path}/*.pdf"))
    if not pdf_files:
        # Fail early with a clear message instead of an obscure error from the
        # vector store when it is handed an empty document list.
        raise ValueError(f"No PDF files found in directory: {directory_path!r}")

    all_documents = []  # Documents loaded from every PDF
    for pdf_file in pdf_files:
        all_documents.extend(PyPDFLoader(pdf_file).load())

    # Split documents into fixed-size, overlapping character chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len
    )
    cleaned_texts = replace_t_with_space(text_splitter.split_documents(all_documents))
    if not cleaned_texts:
        raise ValueError(
            f"No text could be extracted from the PDF files in: {directory_path!r}"
        )

    # Create embeddings and vector store
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    return FAISS.from_documents(cleaned_texts, embeddings)

def encode_pdf_semantic(directory_path, chunk_size=1000, chunk_overlap=200):
    """
    Encodes all PDF files in a directory into a vector store using Semantic Chunking and Hugging Face embeddings.

    Args:
        directory_path: The path to the directory containing PDF files.
        chunk_size: Accepted for signature parity with encode_pdf; not used,
            because SemanticChunker chooses its own chunk boundaries.
        chunk_overlap: Accepted for signature parity with encode_pdf; not used.

    Returns:
        A FAISS vector store containing the encoded PDF content.

    Raises:
        ValueError: If the directory contains no '*.pdf' files, or if the PDFs
            yield no text chunks to embed.
    """

    # Sort the matches so documents are indexed in the same order on every run
    # (glob makes no ordering guarantee).
    pdf_files = sorted(glob.glob(f"{directory_path}/*.pdf"))
    if not pdf_files:
        # Fail early (before loading the embedding model) with a clear message
        # instead of an obscure error from the vector store on empty input.
        raise ValueError(f"No PDF files found in directory: {directory_path!r}")

    all_documents = []  # Documents loaded from every PDF
    for pdf_file in pdf_files:
        all_documents.extend(PyPDFLoader(pdf_file).load())

    # The same embeddings object drives both the semantic splitting and the index.
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    text_splitter = SemanticChunker(embeddings, breakpoint_threshold_type='percentile', breakpoint_threshold_amount=90)

    cleaned_texts = replace_t_with_space(text_splitter.split_documents(all_documents))
    if not cleaned_texts:
        raise ValueError(
            f"No text could be extracted from the PDF files in: {directory_path!r}"
        )

    # Create the vector store from the cleaned chunks
    return FAISS.from_documents(cleaned_texts, embeddings)

def retrieve_context_per_question(question, chunks_query_retriever):
    """
    Retrieves the text of the chunks that the retriever returns for a question.

    Args:
        question: The question for which to retrieve context.
        chunks_query_retriever: A retriever object whose 'invoke(question)'
            method returns an iterable of documents, each with a
            'page_content' attribute.

    Returns:
        A list with the 'page_content' string of each retrieved document, in
        the order the retriever returned them (empty if nothing was retrieved).
    """

    # Retrieve relevant documents for the given question
    docs = chunks_query_retriever.invoke(question)

    # Keep the chunks separate (one list entry per document) rather than
    # concatenating them, so callers can display or join them as they need.
    return [doc.page_content for doc in docs]

def show_context(context):
    """
    Print every item of a context list under a numbered heading.

    Args:
        context (list): The context items to display.

    For each item, a "Context N:" heading (N counted from 1) is printed,
    followed by the item itself and then blank lines as a separator.
    """
    for position, item in enumerate(context, start=1):
        # One print call emitting heading, item and the trailing "\n" line,
        # each separated by a newline.
        print(f"Context {position}:", item, "\n", sep="\n")

In [None]:
# Directory holding the source PDFs.
# NOTE(review): the path lives under /content/drive, but no Google Drive mount
# call is visible in this notebook — confirm Drive is mounted before this runs.
path = "/content/drive/MyDrive/Data/uscis_aao_pdfs"
# Build the FAISS vector store from every PDF in that directory (semantic chunking)
chunks_vector_store = encode_pdf_semantic(path)

In [None]:
# Sanity check: number of vectors stored in the underlying FAISS index.
# (The original cell ended in a dangling "." which is a SyntaxError.)
chunks_vector_store.index.ntotal

In [None]:
# Wrap the vector store in a retriever; search_kwargs asks for k=3 results per query.
chunks_query_retriever = chunks_vector_store.as_retriever(search_kwargs={"k": 3})

In [None]:
from langchain_core.prompts import ChatPromptTemplate

# Question to answer, and the retrieved chunks (a list of strings) to ground it.
test_query = "how many petitioner are total?"
context = retrieve_context_per_question(test_query, chunks_query_retriever)
# show_context(context)

# Set up system prompt; "{context}" is filled with the retrieved text below.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Keep the answer concise."
    "\n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])

# `context` is a list of chunk strings. Formatting the list directly would put
# its Python repr (brackets, quotes, escaped newlines) into the prompt, so join
# the chunks into plain text separated by blank lines first.
context_text = "\n\n".join(context)

result = llm.invoke(prompt.format_prompt(context=context_text, input=test_query).to_messages())
console.print(result)