In [None]:
# Import document loaders for loading PDF files from directories
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Configure a loader over ../data/pdf_files: every file matching the glob is
# handed to PyPDFLoader, and a progress bar is shown while loading.
directory = DirectoryLoader(
    "../data/pdf_files",
    glob="**/*.pdf",  # PDFs in the folder and any nested sub-folders
    loader_cls=PyPDFLoader,
    show_progress=True,
)

In [None]:
# Run the configured loader and keep the resulting list of Document objects.
# NOTE: in the recorded run 3 files were processed and each Document carries
# 'page' / 'total_pages' metadata, so the list appears to hold one entry per
# PDF page rather than one per file.
documents = directory.load()

100%|██████████| 3/3 [00:06<00:00,  2.32s/it]


In [None]:
# Display the first document to inspect its structure: a Document exposes a
# `metadata` dict (source path, title, page numbers, ...) and the extracted
# text in `page_content`.
documents[0]

Document(metadata={'producer': 'iText 4.2.0 by 1T3XT', 'creator': 'PyPDF', 'creationdate': '2026-01-21T22:02:52-08:00', 'moddate': '2026-01-21T22:02:53-08:00', 'subject': 'ACM Trans. Softw. Eng. Methodol. 0.0', 'title': 'Large Language Models for Constructing and Optimizing Machine Learning Workflows: A Survey', 'source': '..\\data\\pdf_files\\Constructing and Optimizing Machine.pdf', 'total_pages': 45, 'page': 0, 'page_label': '1'}, page_content='. \n. \nLatest updates: h\ue03cps://dl.acm.org/doi/10.1145/3773084\n. \n. \nRESEARCH-ARTICLE\nLarge Language Models for Constructing and Optimizing Machine\nLearning Workflows: A Survey\nYANG GU, Shanghai Jiao Tong University, Shanghai, China\n. \nHENGYU YOU, Shanghai Jiao Tong University, Shanghai, China\n. \nJIAN CAO, Shanghai Jiao Tong University, Shanghai, China\n. \nMURAN YU, Stanford University, Stanford, CA, United States\n. \nHAORAN FAN, Shanghai Jiao Tong University, Shanghai, China\n. \nSHIYOU QIAN, Shanghai Jiao Tong University, Sh

In [None]:
# Summarise the first document: a few metadata fields followed by the start of
# its extracted text. A single print call joins the lines with newlines.
doc = documents[0]

print(
    f"Source : {doc.metadata.get('source')}",  # File path of the document
    f"Subject : {doc.metadata.get('subject')}",  # PDF subject metadata
    f"Total page : {doc.metadata.get('total_pages')}",  # Total pages in PDF
    f"\nContent preview : {doc.page_content[:500]}",  # First 500 characters
    sep="\n",
)

Source : ..\data\pdf_files\Constructing and Optimizing Machine.pdf
Subject : ACM Trans. Softw. Eng. Methodol. 0.0
Total page : 45

Content preview : . 
. 
Latest updates: hps://dl.acm.org/doi/10.1145/3773084
. 
. 
RESEARCH-ARTICLE
Large Language Models for Constructing and Optimizing Machine
Learning Workflows: A Survey
YANG GU, Shanghai Jiao Tong University, Shanghai, China
. 
HENGYU YOU, Shanghai Jiao Tong University, Shanghai, China
. 
JIAN CAO, Shanghai Jiao Tong University, Shanghai, China
. 
MURAN YU, Stanford University, Stanford, CA, United States
. 
HAORAN FAN, Shanghai Jiao Tong University, Shanghai, China
. 
SHIYOU QIAN, Shanghai


In [None]:
# Import required libraries for embeddings generation
import numpy as np
from typing import List, Dict, Any, Tuple
from sentence_transformers import SentenceTransformer

In [None]:
# Define EmbeddingManager class to handle loading model and generating embeddings
class EmbeddingManager:
    """Load a SentenceTransformer model once and use it to embed texts.

    Instance attributes (set in ``__init__``):
        model_name: Name passed to ``SentenceTransformer``.
        model: The loaded model, or ``None`` until loading has succeeded.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2") -> None:
        """Remember ``model_name`` and load the model immediately.

        Args:
            model_name: Name passed to ``SentenceTransformer``.

        Raises:
            Exception: Any error raised while loading is re-raised by
                ``_load_model``.
        """
        self.model_name = model_name
        self.model = None
        self._load_model()  # Initialize model on instantiation

    def _load_model(self) -> None:
        """Instantiate the model named by ``self.model_name``.

        Prints a message before loading and, on success, the embedding
        dimension reported by the model. On failure the error is printed and
        then re-raised so the caller still sees the original exception.
        """
        try:
            print(f"Loading Embedding Model : {self.model_name}")
            self.model = SentenceTransformer(self.model_name)  # Downloads model if not cached
            print(f"Embedding Model Loaded Successfully. Embedding Dimension : {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name} : {e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """Encode ``texts`` with the loaded model.

        Args:
            texts: The strings to embed.

        Returns:
            Whatever ``self.model.encode`` returns for ``texts``; its ``shape``
            is printed before returning.

        Raises:
            ValueError: If no model has been loaded (``self.model`` is ``None``).
        """
        # Compare against None explicitly. A truthiness test would also reject
        # a loaded model whose class defines __len__/__bool__ and happens to
        # evaluate as falsy, which is not the same as "not loaded".
        if self.model is None:
            raise ValueError("Model not loaded")

        print(f"Generating embedding for {len(texts)} texts...")
        embeddings = self.model.encode(texts, show_progress_bar=True)  # Convert texts to vectors
        print(f"Generated embeddings with shape : {embeddings.shape}")
        return embeddings

# Embed every loaded document: build a manager with the default model, collect
# the text of each Document, then encode the whole list in one call.
embedding_manager = EmbeddingManager()
page_texts = [document.page_content for document in documents]  # one string per Document
embeddings = embedding_manager.generate_embeddings(page_texts)
embeddings

Loading Embedding Model : all-MiniLM-L6-v2


Loading weights: 100%|██████████| 103/103 [00:00<00:00, 823.56it/s, Materializing param=pooler.dense.weight]                             
BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Embedding Model Loaded Successfully. Embedding Dimension : 384
Generating embedding for 170 texts...


Batches: 100%|██████████| 6/6 [00:05<00:00,  1.00it/s]

Generated embeddings with shape : (170, 384)





array([[-0.04726338,  0.00032324,  0.0190388 , ...,  0.00690563,
         0.00175409,  0.05870934],
       [-0.04271643, -0.00851609,  0.00653164, ...,  0.03548166,
         0.03997409,  0.03671024],
       [-0.02085486, -0.04865111,  0.01053316, ...,  0.04732303,
         0.08584597, -0.00469031],
       ...,
       [-0.07741836, -0.07637152,  0.0221991 , ...,  0.08771045,
         0.04401476,  0.09021638],
       [-0.13966283,  0.00127   , -0.03442016, ..., -0.01609609,
        -0.00326633,  0.03288275],
       [ 0.02199661, -0.0140572 , -0.01843296, ...,  0.07923552,
         0.07359981,  0.02747879]], shape=(170, 384), dtype=float32)