In [1]:
!pip install -U langchain langchain-community openai faiss-cpu tiktoken pypdf



In [1]:
# Step 0: Import dependencies
import os
from langchain_community.document_loaders import PyPDFLoader 
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI


In [2]:
# Step 1: Read the OpenAI API key from a .env file
import os
from dotenv import load_dotenv

# Pull the entries of the .env file into the process environment
load_dotenv()

# Look the key up in the environment and fail fast when it is missing or empty
api_key = os.environ.get("OPENAI_API_KEY")
if api_key is None or api_key == "":
    raise ValueError("OPENAI_API_KEY not found in environment variables.")

# Confirm success without echoing the secret itself
print("✅ API Key loaded successfully (will not be displayed)")

# Windows-specific: avoid MKL/OpenMP conflicts
os.environ.update(
    {
        "OMP_NUM_THREADS": "1",
        "KMP_DUPLICATE_LIB_OK": "TRUE",
    }
)

✅ API Key loaded successfully (will not be displayed)


In [3]:
# Step 2: Select multiple PDFs via system dialog (tkinter)
from tkinter import Tk, filedialog
from langchain_community.document_loaders import PyPDFLoader
import os

# Open the dialog on a hidden Tk root window. The root is destroyed in a
# `finally` block so that a failing or interrupted dialog does not leave a
# stray Tk instance alive in the kernel.
root = Tk()
try:
    root.withdraw()
    pdf_paths = filedialog.askopenfilenames(
        title="Select PDF files",
        filetypes=[("PDF files", "*.pdf")]
    )
finally:
    root.destroy()

# Normalise the dialog result to a list; an empty selection (e.g. the dialog
# was cancelled) stops the cell.
pdf_paths = list(pdf_paths)
if not pdf_paths:
    raise SystemExit("No PDF selected. Exiting.")

print("The following files will be loaded:")
for p in pdf_paths:
    print(" -", p)

# Load all files, keeping filename + page metadata. The per-file list is
# named `pages` (not `docs`) so it cannot be mistaken for the chunk list that
# the splitting step later stores in `docs`.
documents = []
for path in pdf_paths:
    pages = PyPDFLoader(path).load()
    for page in pages:
        # Reduce the recorded source to the bare file name, falling back to
        # the selected path when the loader did not set one.
        page.metadata["source"] = os.path.basename(page.metadata.get("source", path))
    documents.extend(pages)
print(f"Total pages loaded: {len(documents)}")



The following files will be loaded:
 - C:/Users/syk_5/main_SS.pdf
 - C:/Users/syk_5/Resume.pdf
Total pages loaded: 34


In [4]:
# Step 3: Split text
# Chunking parameters, named so they are easy to locate and tune
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 100

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

# Break the loaded documents into chunks for the embedding step
docs = text_splitter.split_documents(documents)


In [5]:
# Step 4: Generate vector database
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

# Fail early with a readable message when there is nothing to index (for
# example, PDFs that yielded no extractable text) instead of letting the
# index build fail on an empty chunk list.
if not docs:
    raise ValueError(
        "No text chunks to index: the selected PDFs produced no extractable text."
    )

# Embed every chunk and build the FAISS vector store from the results.
# NOTE(review): OpenAIEmbeddings() is created without arguments, so it
# presumably picks up OPENAI_API_KEY from the environment - confirm.
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(docs, embeddings)


In [7]:
# Step 5: Build the RAG retrieval-based Q&A system
from langchain_openai import OpenAI  # Updated OpenAI class in the new version
from langchain.chains import RetrievalQA

# Expose the vector store through the retriever interface
retriever = vectorstore.as_retriever()

# Language model used to answer questions, created with temperature 0
llm = OpenAI(temperature=0)

# Wire the model and the retriever together into a single Q&A chain
qa = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)


In [11]:
# Step 6: Ask questions!
# Interactive loop: answers one question per iteration and stops as soon as
# an empty (or whitespace-only) line is entered.
while True:
    # Strip surrounding whitespace so that a stray space or tab still counts
    # as "blank" and ends the session instead of being sent to the chain.
    query = input("Please ask your question（exit when blank）：").strip()
    if not query:
        break
    response = qa.invoke(query)
    # The chain's answer text is read from the "result" key of the response
    print("Response：", response["result"])



Please ask your question（exit when blank）： What is the correlation between two graphs?


Response：  The correlation between two graphs is determined by the presence of corresponding edges in the two graphs. If all edges in the first graph are independent of the corresponding edges in the second graph, then the graphs are considered independent. If there is a correlation between the edges, then the graphs are considered correlated. The measure of correlation between graphs is of both theoretical and practical interest, and can be used in applications such as social networks, neural networks, and knowledge graphs. 


Please ask your question（exit when blank）： How many classes are there for Wikipedia data?


Response：  There are six classes for Wikipedia data: people, places, dates, math things, things, and categories.


Please ask your question（exit when blank）： 
