# Simple RAG Pipeline for STEM OPT Document
This notebook demonstrates a simple Retrieval-Augmented Generation (RAG) workflow using LangChain, FAISS, and LLMs to answer questions about the STEM OPT extension process. Each section is annotated for clarity.

In [None]:
import os
from getpass import getpass

from bs4 import BeautifulSoup
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.llms.ollama import Ollama
from langchain.chat_models import ChatOpenAI


## Load and Parse the STEM OPT HTML Document
Read the USCIS STEM OPT extension HTML file and extract its content using BeautifulSoup for further processing.

In [None]:

# Path to the saved USCIS STEM OPT page, relative to this notebook.
html_path = "../data/documents/Optional Practical Training Extension for STEM Students (STEM OPT) _ USCIS.html"

# Read the whole file as UTF-8 text; the context manager closes the handle.
with open(html_path, "r", encoding="utf-8") as html_file:
    html = html_file.read()

# Build a navigable parse tree from the raw markup.
soup = BeautifulSoup(html, "html.parser")

### Extract Relevant Content
Identify and extract the main content panels from the HTML using their CSS class.

In [None]:
# CSS class carried by the content panels we want to extract.
panel_class = "accordion__panel"
# Collect every element in the parsed page that has that class.
panels = soup.find_all(class_=panel_class)

In [None]:
# One plain-text string per panel: nested text nodes are stripped and
# joined with single newlines.
texts = [
    section.get_text(separator="\n", strip=True)
    for section in panels
]
# Separate panels with a blank line; chunk_text later splits on this marker.
combined_text = "\n\n".join(texts)

### Chunk the Extracted Text
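Split the combined text into smaller chunks so each piece fits within the input limits of the embedding model and LLM. Note that `chunk_text` measures size in characters (via `len`), not model tokens, so `max_tokens` is effectively a character budget.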

In [None]:
def chunk_text(text: str, max_tokens: int = 500, separator: str = "\n\n") -> list:
    """Greedily pack paragraphs of ``text`` into chunks of bounded size.

    The text is split on ``separator`` and consecutive paragraphs are
    accumulated into a chunk until adding the next one would reach the limit,
    at which point the chunk is emitted and a new one is started.

    Args:
        text: The full text to split.
        max_tokens: Size budget per chunk. Despite the name, this is compared
            against ``len()`` of the accumulated string, i.e. it is a count of
            characters (including the separators between paragraphs), not of
            model tokens.
        separator: Paragraph delimiter used both to split ``text`` and to
            re-join paragraphs inside a chunk.

    Returns:
        A list of non-empty chunk strings with surrounding whitespace
        stripped. A single paragraph longer than ``max_tokens`` is kept whole
        as its own (oversized) chunk rather than being cut mid-paragraph.
        Empty or whitespace-only input yields an empty list.
    """
    paragraphs = text.split(separator)

    chunks = []
    current_chunk = ""

    for para in paragraphs:
        if len(current_chunk) + len(para) < max_tokens:
            current_chunk += para + separator
            continue

        # The buffer is full: emit it, but never emit an empty chunk (this
        # happens when the very first paragraph already exceeds the budget).
        finished = current_chunk.strip()
        if finished:
            chunks.append(finished)
        current_chunk = para + separator

    # Flush whatever is left; the buffer may contain only separators, so test
    # the stripped value rather than the raw buffer.
    finished = current_chunk.strip()
    if finished:
        chunks.append(finished)

    return chunks

In [None]:
# Size budget handed to chunk_text as max_tokens; adjust size as needed.
chunk_size = 200
chunks = chunk_text(combined_text, max_tokens=chunk_size)
# Quick sanity check on how many pieces the text was split into.
print(f"Created {len(chunks)} chunks.")

### Create Document Objects
Convert each text chunk into a LangChain `Document` object for downstream processing.

In [None]:
# Wrap each chunk string in a LangChain Document, preserving chunk order.
documents = [
    Document(page_content=chunk_body)
    for chunk_body in chunks
]

### Embed Documents and Build Vector Store
Generate embeddings for each document chunk and store them in a FAISS vector database for efficient retrieval.

In [None]:
# Embedding model used to turn text into vectors; the same object is handed
# to FAISS when the index is built.
# NOTE(review): presumably fetches the model weights on first use -- confirm.
embedding_model = HuggingFaceEmbeddings(model_name="multi-qa-MiniLM-L6-cos-v1")

In [None]:
# Embed every Document with embedding_model and build a FAISS index over the
# resulting vectors.
vectorstore = FAISS.from_documents(documents, embedding_model)

### Retrieve Relevant Chunks for a Query
Set up a retriever from the vector store and use it to fetch the most relevant document chunks for a sample user query.

In [None]:
# Sample user question reused by the retrieval and generation cells.
query = "what's the process for applying OPT?"
# Expose the vector store through the retriever interface, using its default
# search settings (no arguments are passed).
retriever = vectorstore.as_retriever()


In [None]:
# Ask the retriever for the Documents it considers relevant to the query.
retrieved_docs = retriever.get_relevant_documents(query)

In [None]:
# Bare last expression: the notebook displays the retrieved Documents for inspection.
retrieved_docs

### Prepare Retrieved Content for LLM
Combine the retrieved document chunks into a single context string to be used as input for the language model.

In [None]:
# Pull the raw text out of each retrieved Document, in retrieval order...
retrieved_texts = [hit.page_content for hit in retrieved_docs]
# ...and separate the pieces with a blank line to form one context string.
combined_docs_text = "\n\n".join(retrieved_texts)

In [None]:
# Bare last expression: show the context string that will be sent to the LLM.
combined_docs_text

### Construct the Prompt for the LLM
Create a prompt template that provides the retrieved context and user query to the language model for answer generation.

In [None]:
# Prompt text with two placeholders: {documents} receives the retrieved
# context and {query} receives the user's question. The instruction line asks
# the model to answer directly without showing its reasoning.
prompt_template = """
You are an expert assistant. Use the context below to answer the user's question.
Do NOT include any internal thoughts or explanations.

Context:
{documents}

User Question:
{query}

Answer:
"""

In [None]:
# Bind the template text to the two placeholder names it contains so the
# chains can fill them in by keyword.
prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["documents", "query"],
)

### Set Up LLMs and Chains
Instantiate both a local (Ollama) and an online (Together API, accessed through its OpenAI-compatible endpoint) language model. The LLM chains that use these models are created in the next section.

In [None]:
# Local model handle: the "llama3" model served through Ollama.
# NOTE(review): presumably requires a running Ollama server with this model
# already pulled -- confirm before running.
llama3 = Ollama(model="llama3")

In [None]:
# Read the Together API key from the environment so the secret never lives in
# the notebook; fall back to an interactive (non-echoing) prompt when the
# TOGETHER_API_KEY variable is unset or empty.
# NOTE: a key was previously hardcoded in this cell -- treat it as leaked and
# revoke/rotate it.
api_key = os.environ.get("TOGETHER_API_KEY") or getpass("Together API key: ")

# The ChatOpenAI client is pointed at Together's endpoint via openai_api_base,
# so the hosted Llama model is reached through the OpenAI-style API.
# temperature=0 asks for the least random sampling.
online_llm = ChatOpenAI(
    model="meta-llama/Llama-Vision-Free",
    openai_api_key=api_key,
    openai_api_base="https://api.together.xyz/v1",
    temperature=0
)

### Generate Answers Using LLMs
Run both the online and local LLM chains to generate answers to the user query based on the retrieved context.

In [None]:
# Pair the hosted model with the shared prompt template.
online_llm_chain = LLMChain(prompt=prompt, llm=online_llm)

# Fill the prompt with the retrieved context and the question, then call the
# model. Author's timing note: this took 2 s to generate summary.
online_llm_summary = online_llm_chain.run(
    documents=combined_docs_text,
    query=query,
)

In [None]:
# Pair the local Ollama model with the same prompt template.
llama3_chain = LLMChain(prompt=prompt, llm=llama3)

# Same context and question as the online run, for a like-for-like comparison.
# Author's timing note (local model): this took 7m to generate summary.
llama3_summary = llama3_chain.run(
    documents=combined_docs_text,
    query=query,
)

### Output and Compare Results
Display the answers generated by both the online and local LLMs for comparison.

In [None]:
# Answer from the hosted model; print() renders the newlines in the text.
print(online_llm_summary)

In [None]:
# Answer from the local Ollama model, for comparison with the hosted one.
print(llama3_summary)