<a href="https://colab.research.google.com/github/vandana10/rag-app/blob/main/rag_app.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from dotenv import load_dotenv
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.chains import RetrievalQA
from PyPDF2 import PdfReader
from dotenv import load_dotenv
from google.colab import files


def load_file(file_path: str) -> str:
    """Read a document from disk and return its contents as one string.

    The format is chosen from the file extension, compared
    case-insensitively.

    Args:
        file_path: Path to a ``.txt`` or ``.pdf`` file.

    Returns:
        The UTF-8 decoded contents of a text file, or the text of every PDF
        page concatenated in page order with no separator.

    Raises:
        ValueError: If the extension is neither ``.txt`` nor ``.pdf``.
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    if suffix == ".txt":
        with open(file_path, "r", encoding="utf-8") as handle:
            return handle.read()

    if suffix == ".pdf":
        pdf = PdfReader(file_path)
        # A page whose extraction yields nothing contributes an empty string.
        return "".join(page.extract_text() or "" for page in pdf.pages)

    raise ValueError(f"Unsupported file type: {suffix}")

# --------- RAG pipeline ----------
def build_qa(file_path: str, api_key: str):
    """Build a retrieval-augmented QA chain over a single document.

    The document is read with ``load_file``, split into overlapping chunks,
    embedded with ``OpenAIEmbeddings``, and stored in a FAISS vector store
    whose retriever backs a ``RetrievalQA`` chain.

    Args:
        file_path: Path to the document; must be a type ``load_file`` accepts.
        api_key: OpenAI API key passed to both the embeddings and the chat
            model.

    Returns:
        A ``RetrievalQA`` chain created with ``return_source_documents=True``.

    Raises:
        ValueError: If ``load_file`` rejects the file type, or if the
            document produces no chunks to index.
    """
    # Load the document as a single string.
    document_text = load_file(file_path)

    # Split into chunks (chunk_size=1000, chunk_overlap=100).
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = splitter.split_text(document_text)
    if not chunks:
        # An empty chunk list (e.g. a PDF with no extractable text) leaves
        # the vector store nothing to index; report that clearly here
        # rather than letting index construction fail obscurely.
        raise ValueError(
            f"No text could be extracted from {file_path!r}; nothing to index."
        )

    # Create embeddings
    embeddings = OpenAIEmbeddings(openai_api_key=api_key)

    # Store in FAISS
    vectorstore = FAISS.from_texts(chunks, embedding=embeddings)

    # Setup LLM; temperature=0 is kept from the original configuration.
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0, openai_api_key=api_key)

    # Create RetrievalQA chain
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        return_source_documents=True,
    )
    return qa_chain

# Load OPENAI_API_KEY from the environment (optionally via a .env file).
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    # Stop before prompting for an upload; the pipeline cannot run without
    # a key.
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to the environment or a .env file."
    )

# Prompt for an upload and use the uploaded file instead of a fixed name.
# files.upload() returns a mapping keyed by uploaded file name.
# NOTE(review): this assumes Colab saves the upload into the current working
# directory under that same name; re-uploading an existing name may be saved
# under a de-duplicated name instead -- confirm.
uploaded = files.upload()
file_path = "/content/TestFile.txt"  # fallback when nothing was uploaded
if uploaded:
    file_path = os.path.abspath(next(iter(uploaded)))

qa = build_qa(file_path, api_key)
query = "Summarize the document in 3 bullet points."
response = qa.invoke({"query": query})
print("Answer:", response["result"])
print("\nSource chunks used:")
for doc in response["source_documents"]:
    # Show only the first 200 characters of each retrieved chunk.
    print("-", doc.page_content[:200], "...")