<a href="https://colab.research.google.com/github/zamanmiraz/NIW-NP-RAG/blob/main/NIW_NP_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! git clone https://github.com/zamanmiraz/NIW-NP-RAG.git
%cd NIW-NP-RAG
! pip install -r requirements.txt

In [None]:
! pip install torchvision torchaudio
! pip install --upgrade langchain langchain-community
! pip install --upgrade langchain-experimental
! pip install pypdf
! pip install -qU langchain-huggingface
! pip install faiss-cpu
! pip install -qU "langchain[google-genai]"

In [None]:
from rich.console import Console
from rich.style import Style
import pathlib
from rich_theme_manager import Theme, ThemeManager

def _bold(hex_color):
    """Build a bold rich `Style` with the given hex foreground color."""
    return Style(color=hex_color, bold=True)


# Two rich themes (dark and light) that recolor the `repr.*` highlighting styles.
THEMES = [
    Theme(
        name="dark",
        description="Dark mode theme",
        tags=["dark"],
        styles={
            # Class names
            "repr.own": _bold("#e87d3e"),
            # Tag names
            "repr.tag_name": "cyan",
            # Function calls
            "repr.call": "bright_magenta",
            # Strings
            "repr.str": "bright_green",
            # Numbers
            "repr.number": "bright_red",
            # None
            "repr.none": "dim white",
            "repr.attrib_name": _bold("#e87d3e"),
            "repr.attrib_value": "bright_blue",
            # No yellow background
            "default": "bright_white on black",
        },
    ),
    Theme(
        name="light",
        description="Light mode theme",
        styles={
            "repr.own": _bold("#e87d3e"),
            "repr.tag_name": _bold("#0077cc"),
            # Purple instead of yellow
            "repr.call": _bold("#800080"),
            "repr.str": _bold("#008080"),
            "repr.number": _bold("#ff4500"),
            "repr.none": _bold("#808080"),
            "repr.attrib_name": _bold("#e87d3e"),
            "repr.attrib_value": "bright_blue",
            # White background, no yellow
            "default": Style(color="#000000", bgcolor="#ffffff"),
        },
    ),
]

# Directory handed to the ThemeManager as its `theme_dir` (relative to the current working directory).
theme_dir = pathlib.Path("themes").expanduser()
# Create the directory if it is missing; no error when it already exists.
theme_dir.mkdir(parents=True, exist_ok=True)

# Register the two themes defined in THEMES with the manager.
theme_manager = ThemeManager(theme_dir=theme_dir, themes=THEMES)
# NOTE(review): presumably prints the available themes as cell output — confirm against rich_theme_manager.
theme_manager.list_themes()

# Look both themes up by name so either one can be handed to the console.
dark = theme_manager.get("dark")
light = theme_manager.get("light")

# Console used to print the final LLM result; the light theme is the one in effect.
console = Console(theme=light)


In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
# from langchain_google_genai.embeddings import GoogleGenerativeAIEmbeddings
from sentence_transformers import SentenceTransformer
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
import glob # Import the glob module
import fitz # Import fitz for read_pdf_to_string
from transformers import pipeline
import torch # Import torch to check for CUDA availability

In [None]:
import google.generativeai as genai
from google.colab import userdata
from langchain.chat_models import init_chat_model

# Read the Gemini API key through `google.colab.userdata` instead of hard-coding it.
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
# At this point `genai` is the `google.generativeai` module imported in this cell.
genai.configure(api_key=GOOGLE_API_KEY)

# Chat model used for the final answer generation at the end of the notebook.
llm = init_chat_model("gemini-2.5-flash", model_provider="google_genai", google_api_key=GOOGLE_API_KEY)

In [None]:
from google import genai
from google.genai import types
from google.colab import userdata

# --- PROMPTS ---
# Template that wraps the full document text in <document> tags.
DOCUMENT_CONTEXT_PROMPT = "\n<document>\n{doc_content}\n</document>\n"

# Template that asks for a short retrieval-oriented context for a single chunk.
CHUNK_CONTEXT_PROMPT = (
    "\n"
    "Here is the chunk we want to situate within the whole document\n"
    "<chunk>\n"
    "{chunk_content}\n"
    "</chunk>\n"
    "\n"
    "Please give a short succinct context to situate this chunk within the overall document\n"
    "for the purposes of improving search retrieval of the chunk.\n"
    "Answer only with the succinct context and nothing else.\n"
)
# Client for the `google.genai` SDK. This relies on the `from google import genai` import at the
# top of this cell having rebound `genai`; the previous cell bound the same name to `google.generativeai`.
client = genai.Client(api_key=GOOGLE_API_KEY)

In [None]:
# Single-PDF experiment: load one decision and split it with the semantic chunker.
# NOTE(review): the path points into /content/drive — presumably Google Drive must be mounted first; no mount call is visible here.
pdf_file = "/content/drive/MyDrive/Data/uscis_aao_pdfs/MAR052025_03B5203.pdf"
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
# Same splitter settings as in `encode_pdf_semantic` (percentile breakpoints, amount 90).
text_splitter = SemanticChunker(embeddings, breakpoint_threshold_type='percentile', breakpoint_threshold_amount=90)
loader = PyPDFLoader(pdf_file)
documents = loader.load()
# `texts` holds the chunks produced from the loaded documents of this one file.
texts = text_splitter.split_documents(documents)

In [None]:
# Persist the FAISS index to Drive. This cell sits above the cell that assigns
# `chunks_vector_store`, so on a fresh kernel ("Restart & Run All") the name does not
# exist yet; guard against that instead of aborting the run with a NameError.
if "chunks_vector_store" in globals():
    chunks_vector_store.save_local("/content/drive/MyDrive/Data/chunks_vector_store")
else:
    print("[WARN] chunks_vector_store is not defined yet; run the encoding cell first, then re-run this cell.")

In [None]:


# --- PROMPTS ---
# Template that wraps the full document text in <document> tags.
DOCUMENT_CONTEXT_PROMPT = """
<document>
{doc_content}
</document>
"""

# --- GLOBAL PROMPT TEMPLATE ---
# Chunk prompt in effect after this cell runs. It is assigned exactly once here so the
# cell does not carry a second, immediately-overwritten definition of the same name.
# Note: an earlier cell binds a longer "situate this chunk" wording to this name; this
# assignment replaces it.
CHUNK_CONTEXT_PROMPT = (
    "Given the following text chunk:\n\n"
    "{chunk_content}\n\n"
    "Describe its broader context in one or two sentences."
)


# --- FUNCTION: situate_context ---

# Summarization pipeline used by `situate_context`.
# Device index handed to the pipeline: 0 when CUDA is available, otherwise -1.
if torch.cuda.is_available():
    device = 0
else:
    device = -1

# Model choice: "facebook/bart-large-cnn" (alternative noted by the author: "google/pegasus-xsum").
summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=device)

def situate_context(doc: str, chunk: str, max_total_chars: int = 3500, max_chunk_chars: int = 1500) -> str:
    """
    Generate a short context for a chunk using the local summarization pipeline.

    The chunk and the document share one character budget so that the combined
    prompt stays near the size the summarization model is expected to accept
    (the original note estimates ~1024 tokens, roughly 4000 characters).

    Args:
        doc: Full text of the document the chunk came from.
        chunk: Text of the chunk to situate.
        max_total_chars: Combined character budget for document plus chunk.
        max_chunk_chars: Maximum number of chunk characters to include.

    Returns:
        The stripped summary text, or a fixed fallback sentence when the
        summarizer raises.
    """
    # The chunk is the focus, so it is cut first; the document only gets what is
    # left of the budget. Previously both were cut independently (3500 + 1500),
    # which could exceed the stated limit.
    truncated_chunk = chunk[:max_chunk_chars]
    doc_budget = max(0, max_total_chars - len(truncated_chunk))
    truncated_doc = doc[:doc_budget]

    prompt = f"""
Here is the document:
<document>
{truncated_doc}
</document>

Focus on this chunk:
<chunk>
{truncated_chunk}
</chunk>

Briefly describe how this chunk fits into the document (1–2 sentences).
"""

    try:
        # truncation=True is a safety net: the character budget is only an
        # estimate of the token count, so let the pipeline cut over-length input.
        summary = summarizer(
            prompt,
            max_length=60,
            min_length=10,
            do_sample=False,
            truncation=True,
        )
        return summary[0]["summary_text"].strip()
    except Exception as e:
        # Deliberate best-effort: one failing chunk must not abort a whole encoding run.
        print(f"[WARN] Summarization failed for chunk: {e}")
        return "Context unavailable due to summarization limit."


# def situate_context(doc: str, chunk: str) -> str:
#     """
#     Use Gemini to describe the context of a given text chunk.
#     """
#     model = "models/gemini-2.0-flash-001"  # Include version suffix

#     response = client.models.generate_content(
#         model=model,
#         contents=[
#             {
#                 "role": "user",
#                 "parts": [{"text": CHUNK_CONTEXT_PROMPT.format(chunk_content=chunk)}],
#             },
#         ],
#     )

#     return response.text.strip()


# --- FUNCTION: read_pdf_to_string ---
def read_pdf_to_string(path):
    """
    Read a PDF document and return its text content.

    Args:
        path: Location of the PDF file, passed straight to `fitz.open`.

    Returns:
        The text of all pages concatenated in page order (empty string for a
        document with no pages).
    """
    doc = fitz.open(path)
    try:
        # Join once instead of growing a string page by page.
        return "".join(doc[page_num].get_text() for page_num in range(len(doc)))
    finally:
        # Close the document even when text extraction raises.
        doc.close()


# --- FUNCTION: replace_t_with_space ---
def replace_t_with_space(list_of_documents):
    """
    Swap every tab character in each document's `page_content` for a single space.

    The documents are modified in place and the same list object is returned.
    """
    for document in list_of_documents:
        pieces = document.page_content.split('\t')
        document.page_content = ' '.join(pieces)
    return list_of_documents


# --- FUNCTION: encode_pdf ---
def encode_pdf(directory_path, chunk_size=1000, chunk_overlap=200):
    """
    Encodes all PDF files into a FAISS vector store using Hugging Face embeddings.

    Args:
        directory_path: Directory searched (non-recursively) for `*.pdf` files.
        chunk_size: Maximum chunk length in characters for the recursive splitter.
        chunk_overlap: Number of characters shared between consecutive chunks.

    Returns:
        The FAISS vector store built from the tab-cleaned chunks.

    Raises:
        ValueError: If the directory contains no PDF files.
    """
    # Sort so the documents are indexed in the same order on every run.
    pdf_files = sorted(glob.glob(f"{directory_path}/*.pdf"))
    if not pdf_files:
        # Fail early with a clear message; presumably an empty document list would
        # otherwise only fail later, while building the index.
        raise ValueError(f"No PDF files found in {directory_path!r}")

    all_documents = []
    for pdf_file in pdf_files:
        all_documents.extend(PyPDFLoader(pdf_file).load())

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len
    )
    texts = text_splitter.split_documents(all_documents)
    cleaned_texts = replace_t_with_space(texts)

    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    return FAISS.from_documents(cleaned_texts, embeddings)


# --- FUNCTION: encode_pdf_semantic ---
def encode_pdf_semantic(directory_path):
    """
    Encodes all PDF files using Semantic Chunking and Hugging Face embeddings.

    Per-chunk contextual summaries are currently disabled (see the commented
    block in the loop), so chunks are indexed without a `context` metadata entry.

    Args:
        directory_path: Directory searched (non-recursively) for `*.pdf` files.

    Returns:
        The FAISS vector store built from the tab-cleaned semantic chunks.

    Raises:
        ValueError: If the directory contains no PDF files.
    """
    # Sort so the documents are indexed in the same order on every run.
    pdf_files = sorted(glob.glob(f"{directory_path}/*.pdf"))
    if not pdf_files:
        # Fail before loading the embedding model; presumably an empty document
        # list would otherwise only fail later, while building the index.
        raise ValueError(f"No PDF files found in {directory_path!r}")

    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    text_splitter = SemanticChunker(
        embeddings,
        breakpoint_threshold_type='percentile',
        breakpoint_threshold_amount=90,
    )

    all_documents = []
    # TODO: wrap this loop in tqdm for progress reporting.
    for pdf_file in pdf_files:
        documents = PyPDFLoader(pdf_file).load()
        texts = text_splitter.split_documents(documents)

        # Contextual summaries are switched off. The full-text read below is kept
        # inside the disabled block so each PDF is not parsed a second time for
        # a value nothing uses.
        # pdf_text = read_pdf_to_string(pdf_file)
        # for chunk in texts:
        #     chunk.metadata['context'] = situate_context(
        #         doc=pdf_text,
        #         chunk=chunk.page_content
        #     )

        all_documents.extend(texts)

    cleaned_texts = replace_t_with_space(all_documents)
    return FAISS.from_documents(cleaned_texts, embeddings)


# --- FUNCTION: retrieve_context_per_question ---
def retrieve_context_per_question(question, chunks_query_retriever):
    """
    Retrieves relevant document chunks for a question.

    Args:
        question: Query text handed to the retriever.
        chunks_query_retriever: Object exposing `invoke(question)` that returns
            documents with `page_content` and a `metadata` mapping.

    Returns:
        A `(context, urls)` tuple: `context` is the list of chunk texts in the
        order the retriever returned them; `urls` is the list of distinct,
        non-empty `metadata["source"]` values in first-seen order.
    """
    docs = chunks_query_retriever.invoke(question)
    context = [doc.page_content for doc in docs]
    # dict.fromkeys de-duplicates while keeping retrieval order; a set would
    # return the sources in an arbitrary order that can change between runs.
    sources = (doc.metadata.get("source") for doc in docs)
    urls = list(dict.fromkeys(source for source in sources if source))
    return context, urls


# --- FUNCTION: show_context ---
def show_context(context):
    """
    Write each retrieved chunk to stdout under a 1-based "Context N:" heading,
    followed by blank lines as a separator.
    """
    for position, chunk_text in enumerate(context, start=1):
        heading = f"Context {position}:"
        # Heading, chunk and a trailing "\n" each end up on their own line.
        print(heading, chunk_text, "\n", sep="\n")

In [None]:
# Directory containing the PDFs to index.
# NOTE(review): the path points into /content/drive — presumably Google Drive must be mounted first; no mount call is visible here.
path = "/content/drive/MyDrive/Data/uscis_aao_pdfs"
# load the pdf document
# Builds the FAISS store from every *.pdf in `path` using semantic chunking; later cells read `chunks_vector_store`.
chunks_vector_store = encode_pdf_semantic(path)

In [None]:
# Placeholder: paste the personal statement text between the triple quotes.
# It is interpolated into `test_query` in the next cell.
personal_statement = """

"""

# Placeholder: paste the supporting-document text here; also interpolated into `test_query`.
supporting_pos = """


"""

# Placeholder: paste the resume text here; also interpolated into `test_query`.
resume = """


"""

In [None]:
from langchain_core.prompts import ChatPromptTemplate

# --- Retrieve Top Context Chunks ---
# Retriever over the FAISS store that returns the 20 best-matching chunks per query.
chunks_query_retriever = chunks_vector_store.as_retriever(search_kwargs={"k": 20})

# # --- Define Personal Statement ---
# personal_statement = """
# I am the petitioner and beneficiary for this I-140 petition. I would like to take this opportunity
# to explain how I intend to continue my work in the United States. My primary focus is on projects
# with the Department of Energy (DoE) and initiatives integrating Web3 infrastructure for secure
# energy data management. Through these efforts, I aim to improve the efficiency, transparency, and
# sustainability of national energy systems.
# """

# --- Define Query with Source Awareness ---
# The supporting documents and resume get their own labelled sections; previously the
# supporting text was appended directly after the "(Source: ...)" format line with no separator.
test_query = (
    "Based on the following personal statement and other supporting document, identify specific areas that need improvement "
    "to strengthen my National Interest Waiver (NIW) case. Provide feedback for each NIW prong "
    "and connect your analysis to evidence or examples drawn from the retrieved context.\n\n"
    "Be explicit in citing where each piece of information comes from — for example, mention the "
    "document title, case type, or link if available.\n\n"
    "Personal Statement:\n"
    f"{personal_statement}\n\n"
    "When applicable, include the following format for evidence reference:\n"
    "- (Source: [Document Title or Link])\n\n"
    "Supporting Documents:\n"
    f"{supporting_pos}\n\n"
    "Resume:\n"
    f"{resume} \n \n"
)

# --- Retrieve Context ---
# `retrieve_context_per_question` returns a (chunks, sources) tuple; unpack it so that
# `show_context` prints one entry per chunk instead of treating the two lists as chunks.
context_chunks, source_urls = retrieve_context_per_question(test_query, chunks_query_retriever)
show_context(context_chunks)

# Text substituted for `{context}` in the system prompt: the chunks separated by blank
# lines, followed by the list of source files so the model can still cite them.
context = "\n\n".join(context_chunks)
if source_urls:
    context += "\n\nSources:\n" + "\n".join(f"- {url}" for url in source_urls)

# --- SYSTEM PROMPT FOR RAG (NIW) ---
# System message template for the chat prompt. The trailing `{context}` placeholder is
# filled when the prompt is formatted in the "Generate Response" step below.
# NOTE(review): this prompt tells the model not to offer advice or judgments, while
# `test_query` asks it to identify areas that need improvement — confirm which is intended.
system_prompt = (
    "You are an expert immigration Q&A assistant specializing in National Interest Waiver (NIW) petitions. "
    "Your role is to describe what occurred in the retrieved context, focusing only on the information provided. "
    "You must not add external knowledge, interpretation, or personal evaluation.\n\n"

    "Your task is to summarize what the context shows — such as outcomes, reasoning, or findings — "
    "that relate to NIW petitions and their approval or denial patterns. "
    "Do not make predictions, judgments, or offer advice. "
    "Base your response strictly and exclusively on the retrieved context.\n\n"

    "Always specify your sources clearly by mentioning the document title, case identifier, or link "
    "each time you reference contextual evidence.\n\n"

    "Structure your description according to the three NIW prongs, using only details that appear in the context:\n"
    "1️⃣ **Substantial Merit and National Importance** — Describe how the applicant’s field or work was treated "
    "in the context (e.g., what was considered nationally important or lacking merit). Include examples of outcomes if mentioned.\n"
    "2️⃣ **Well-Positioned to Advance the Proposed Endeavor** — Describe what the context reveals about how petitioners "
    "demonstrated their qualifications, achievements, or future plans, and how USCIS or AAO evaluated those aspects.\n"
    "3️⃣ **Beneficial to the United States** — Describe what the retrieved materials say about how granting the waiver "
    "benefits the U.S., or what reasons were given when such benefit was found insufficient.\n\n"

    "⚠️ Important: You are only describing and summarizing what the retrieved context states. "
    "Do not analyze, speculate, or generate new conclusions beyond it. "
    "If a detail is missing, explicitly say that the context does not contain that information.\n\n"

    "--- Retrieved Context ---\n{context}"
)


# --- Compose Final Prompt ---
# Two-message chat template: the system prompt (with its `{context}` slot) and the
# human turn, which is filled from `{input}`.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])

# --- Generate Response ---
# Fill both placeholders, convert to a message list and send it to the chat model.
result = llm.invoke(
    prompt.format_prompt(context=context, input=test_query).to_messages()
)
# Print the returned object through the themed rich console.
console.print(result)
