In [9]:
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

# Step 1: Load PDF
loader = PyMuPDFLoader("data/Huy_Bui_Resume.pdf")  # Replace with your PDF path
documents = loader.load()
print("Number of words:", len(documents[0].page_content.split(" ")))

# Step 2: Split into chunks
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = splitter.split_documents(documents)

# Step 3: Use HuggingFace Embeddings (open source)
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False} #False Euclidean, True cosine similarity
hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

# Step 4: Create FAISS vector index
vector_store = FAISS.from_documents(chunks, hf)

# Step 5: Save the index locally
vector_store.save_local("faiss_index_open")
print("Indexing complete and saved to 'faiss_index_open'")


Number of words: 360


  from .autonotebook import tqdm as notebook_tqdm


ImportError: Could not import faiss python package. Please install it with `pip install faiss-gpu` (for CUDA supported GPU) or `pip install faiss-cpu` (depending on Python version).

In [8]:
pip install pymupdf

Defaulting to user installation because normal site-packages is not writeable
Collecting pymupdf
  Using cached pymupdf-1.25.5-cp39-abi3-macosx_11_0_arm64.whl (18.6 MB)
Installing collected packages: pymupdf
Successfully installed pymupdf-1.25.5
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [37]:
# Extract raw texts
texts = [chunk.page_content for chunk in chunks[:3]]  # First 3 chunks

# Get embeddings
vectors = hf.embed_documents(texts)

for i, (text, vec) in enumerate(zip(texts, vectors)):
    print(f"\n🔢 Embedding {i+1}:")
    print(f"Text: {text[:100]}...")
    print(f"Vector (first 5 dims): {vec[:5]}")
    print(f"Vector length: {len(vec)}")



🔢 Embedding 1:
Text: Huy Bui
williamhuybui@gmail.com | linkedin.com/in/huy-bui-ds
Experience
Publicis Groupe
Remote
Senio...
Vector (first 5 dims): [-0.00920974463224411, 0.09017596393823624, -0.02930494025349617, -0.06725069880485535, 0.003979883622378111]
Vector length: 768

🔢 Embedding 2:
Text: – Built an automated data pipeline leveraging Python, SQL, BigQuery, AWS Lambda, S3, and EventBridge...
Vector (first 5 dims): [0.034452714025974274, 0.08983542025089264, -0.04970097914338112, -0.06849124282598495, 0.0050932131707668304]
Vector length: 768

🔢 Embedding 3:
Text: Research Data Scientist
April 2020 – June 2022
– Lead author of the research Machine Learning Applic...
Vector (first 5 dims): [-0.06988286226987839, 0.02098994143307209, -0.006211360916495323, -0.03602774068713188, -0.02026708610355854]
Vector length: 768


In [1]:
results = vector_store.similarity_search_with_score(
    "Mentor ?", k=4
)
for res, score in results:
    print(f"* [SIM={score:3f}] {res.page_content} [{res.metadata}]")

NameError: name 'vector_store' is not defined