In [1]:
!pip install -U langchain langchain-community openai faiss-cpu tiktoken pypdf



In [2]:
# Step 0: Import dependencies
import os
from langchain_community.document_loaders import PyPDFLoader 
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI


In [3]:
# Step 1: Load API Key from .env file & load API key
import os
from dotenv import load_dotenv

# Load the contents of the .env file into system environment variables
load_dotenv()

# Retrieve the key from environment variables
api_key = os.getenv("OPENAI_API_KEY")

if not api_key:
    raise ValueError("OPENAI_API_KEY not found in environment variables.")

print("✅ API Key loaded successfully (will not be displayed)")

# Windows-specific: avoid MKL/OpenMP conflicts
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

✅ API Key loaded successfully (will not be displayed)


In [4]:
# Step 2: Load PDF file
pdf_path = "Sample.pdf"  # ← Replace with the path to your PDF file
loader = PyPDFLoader(pdf_path)
documents = loader.load()


In [5]:
# Step 3: Split text
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
docs = text_splitter.split_documents(documents)


In [6]:
# Step 4: Generate vector database
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(docs, embeddings)


In [7]:
# Step 5: Build the RAG retrieval-based Q&A system
from langchain_openai import OpenAI  # Updated OpenAI class in the new version
from langchain.chains import RetrievalQA

retriever = vectorstore.as_retriever()
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(temperature=0),
    retriever=retriever
)


In [8]:
# Step 6: Ask a question!
query = "Please answer in English: What are the main research conclusions of this PDF?"
result = qa.invoke(query)
print("Answer:", result)


Answer: {'query': 'Please answer in English: What are the main research conclusions of this PDF?', 'result': ' The main research conclusions of this PDF include the identification of critical regions in the brain related to schizophrenia, the successful classification of MRI scans using KNN, the improvement of community detection accuracy and robustness, the design of efficient algorithms for identifying structural patterns in graphs, the improvement of model accuracy in random dot product graphs, the identification of critical factors for predicting loan repayment abilities, the development of a YouTube advertising strategy resulting in increased views, the development and fitting of dynamic system models for gene expression, the recovery of latent positions from multiple MRI scans, and the use of various techniques for outlier removal, data transformation, and classification in different research projects.'}
