In [42]:
from sentence_transformers import SentenceTransformer
import os
class EmbeddingService:
    """Thin wrapper around a ``SentenceTransformer`` model.

    The loaded model is kept on ``self.model``.
    """

    def __init__(self, model_name, device):
        """Load the sentence-transformer model.

        Args:
            model_name: Model identifier passed to ``SentenceTransformer``.
            device: Passed to ``SentenceTransformer`` as its ``device`` keyword.
        """
        encoder = SentenceTransformer(model_name, device=device)
        self.model = encoder

    def generate_embedding(self, text):
        """Encode a single text as a one-item batch.

        Args:
            text: The text to embed.

        Returns:
            Whatever ``self.model.encode`` returns for the list ``[text]``.
            Because the text is wrapped in a list, the result is batch-shaped
            (one entry per input text) rather than a bare vector.
        """
        single_item_batch = [text]
        return self.model.encode(single_item_batch)

In [43]:
# Model identifier and device handed to EmbeddingService below.
sentence_transformer_model, sentence_transformer_device = (
    'sentence-transformers/paraphrase-multilingual-mpnet-base-v2',
    'cpu',
)

# Notebook-level embedding service built from the two settings above.
embedding_service = EmbeddingService(
    model_name=sentence_transformer_model,
    device=sentence_transformer_device,
)

In [44]:
# Folder that holds the PDF files.
pdf_folder = "/Users/yeargun/Downloads/TESTPDFS"

# Text splitter configured with chunk_size=1000 and chunk_overlap=200, with
# lengths measured by the built-in len.
# NOTE(review): RecursiveCharacterTextSplitter and HuggingFaceEmbeddings are
# not imported in the cells visible here -- confirm an earlier cell imports them.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, length_function=len)

# Embedding model created with HuggingFaceEmbeddings' default arguments.
embeddings = HuggingFaceEmbeddings()


In [57]:
from pymilvus import connections, db, IndexType, Milvus, MilvusClient, DataType, Collection
from pymilvus.client.types import MetricType
from pymilvus.milvus_client import IndexParams

# Names of the Milvus database, collection and vector index used by this notebook.
db_name = "pdf_db"
collection_name = "pdf_embeddings"
embedding_index = "pdf_embedding_index"

# The `db` helpers below operate on a pymilvus ORM connection. No visible cell
# opens one, so connect explicitly; otherwise this cell only works when a
# connection is left over from earlier kernel state.
# NOTE(review): assumes the same server URI as the MilvusClient below.
connections.connect(uri="http://localhost:19530")

# Create the database on first run; later runs find it in the listing and skip.
databases = db.list_database()
if db_name not in databases:
    database = db.create_database(db_name)

# Client bound to the database above; used by the rest of the notebook.
client = MilvusClient(
    uri="http://localhost:19530",
    db_name=db_name
)

def create_indices(client):
    """Create the vector index on the ``embedding`` field.

    Reads two notebook-level globals: ``collection_name`` (the collection the
    index is created on) and ``embedding_index`` (the name given to the index).

    Args:
        client: Object whose ``create_index(collection_name=..., index_params=...)``
            method is called; a ``MilvusClient`` in this notebook.
    """
    params = IndexParams()

    # IVF_FLAT index with COSINE metric and nlist=1024 on the "embedding" field.
    embedding_index_spec = {
        "field_name": "embedding",
        "index_type": 'IVF_FLAT',
        "metric_type": 'COSINE',
        "params": {"nlist": 1024},
        "index_name": embedding_index,
    }
    params.add_index(**embedding_index_spec)

    client.create_index(collection_name=collection_name, index_params=params)

# Bootstrap the collection: reuse it if present (adding the vector index when
# it is missing), otherwise create the collection with an explicit schema and
# then create the index.
if collection_name in client.list_collections():
    print(f"Collection '{collection_name}' already exists.")
    # Fetch the index list once so the printed list and the membership test
    # below use the same server response (the original issued two calls).
    existing_indexes = client.list_indexes(collection_name=collection_name)
    print(existing_indexes)
    if embedding_index not in existing_indexes:
        create_indices(client)
        print(f"Indices for '{collection_name}' created successfully.")
else:
    schema = MilvusClient.create_schema(
        auto_id=True,
        enable_dynamic_field=True,
        description="PDF embeddings and metadata",
    )

    # Fields: INT64 primary key, 768-dim float vector, chunk text, page
    # number and file name.
    # NOTE(review): dim=768 presumably matches the embedding model's output
    # size -- confirm if the model is changed.
    schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)
    schema.add_field(field_name="embedding", datatype=DataType.FLOAT_VECTOR, dim=768)
    schema.add_field(field_name="content", datatype=DataType.VARCHAR, max_length=65535)
    schema.add_field(field_name="page_number", datatype=DataType.INT64)
    schema.add_field(field_name="file_name", datatype=DataType.VARCHAR, max_length=1024)

    client.create_collection(
        auto_id=True,
        enable_dynamic_field=True,
        collection_name=collection_name,
        schema=schema,
    )
    create_indices(client)

    print(f"Collection '{collection_name}' created successfully.")

print("Script execution completed.")

Collection 'pdf_embeddings' created successfully.
Script execution completed.


In [58]:
def ensure_index(client, collection_name):
    """Create the ``pdf_embedding_index`` vector index on a collection.

    The index is created unconditionally; checking whether it already exists
    is left to the caller.

    Args:
        client: Object whose ``create_index(collection_name=..., index_params=...)``
            method is called; a ``MilvusClient`` in this notebook.
        collection_name: Name of the collection to index.
    """
    params = IndexParams()

    # IVF_FLAT index with COSINE metric and nlist=1024 on the "embedding" field.
    index_spec = dict(
        field_name="embedding",
        index_type='IVF_FLAT',
        metric_type='COSINE',
        params={"nlist": 1024},
        index_name="pdf_embedding_index",
    )
    params.add_index(**index_spec)

    client.create_index(collection_name=collection_name, index_params=params)
    print(f"Index created for collection '{collection_name}'")

In [71]:
def process_pdfs_and_upload_to_milvus(pdf_directory, client, collection_name):
    """Embed every PDF under ``pdf_directory`` and insert its chunks into Milvus.

    Steps:
      1. Build an ``EmbeddingService`` and a ``RecursiveCharacterTextSplitter``
         (chunk_size=1000, chunk_overlap=200).
      2. Call ``ensure_index`` when ``"pdf_embedding_index"`` is not among the
         collection's indexes, then load the collection.
      3. Walk ``pdf_directory`` recursively. For each PDF, load it with
         ``PyPDFLoader``, split it, embed each chunk and insert one record
         per chunk.

    Each record has the keys ``embedding``, ``content``, ``page_number``
    (1-based) and ``file_name`` (base name of the PDF, not its full path).

    Insert errors are reported per file and do not stop the run; the final
    message states whether any file failed.

    Args:
        pdf_directory: Root directory searched recursively for PDF files.
        client: Object providing ``list_indexes``, ``load_collection`` and
            ``insert``; a ``MilvusClient`` in this notebook.
        collection_name: Name of the target collection.

    Returns:
        None.
    """
    sentence_transformer_model = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
    sentence_transformer_device = 'cpu'
    embedding_service = EmbeddingService(sentence_transformer_model, sentence_transformer_device)

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

    if "pdf_embedding_index" not in client.list_indexes(collection_name):
        ensure_index(client, collection_name)

    client.load_collection(collection_name)

    failed_files = []
    for root, _, files in os.walk(pdf_directory):
        for file in files:
            # Case-insensitive so that e.g. "REPORT.PDF" is picked up too.
            if not file.lower().endswith('.pdf'):
                continue

            pdf_path = os.path.join(root, file)
            print(f"Processing {pdf_path}")

            documents = PyPDFLoader(pdf_path).load()
            splits = text_splitter.split_documents(documents)

            # Store the base name in "file_name". The original computed it
            # but inserted the full path instead.
            file_name = os.path.basename(pdf_path)

            records = [
                {
                    # generate_embedding returns a one-item batch; keep its single row.
                    "embedding": embedding_service.generate_embedding(split.page_content).tolist()[0],
                    "content": split.page_content,
                    # PyPDFLoader uses 0-based indexing
                    "page_number": split.metadata.get('page', 0) + 1,
                    "file_name": file_name,
                }
                for split in splits
            ]

            if not records:
                # Nothing to insert for this file; avoid sending an empty batch.
                print(f"No text chunks extracted from {pdf_path}; skipping insert.")
                continue

            try:
                client.insert(
                    collection_name=collection_name,
                    data=records
                )
            except Exception as e:
                # Best effort: remember the failure and continue with the next file.
                failed_files.append(pdf_path)
                print(f"Error inserting records for {pdf_path}: {e}")

    # client.flush(collection_name=collection_name)
    if failed_files:
        print(f"Finished with insert errors for {len(failed_files)} file(s): {failed_files}")
    else:
        print("All documents processed and uploaded to Milvus.")

In [72]:
# Inputs for the ingestion run: source folder, database and target collection.
pdf_directory = "/Users/yeargun/Downloads/TESTPDFS"
db_name, collection_name = "pdf_db", "pdf_embeddings"

# Client bound to the database named above.
client = MilvusClient(uri="http://localhost:19530", db_name=db_name)

# Embed every PDF under pdf_directory and insert the chunks into the collection.
process_pdfs_and_upload_to_milvus(
    pdf_directory=pdf_directory,
    client=client,
    collection_name=collection_name,
)

Processing /Users/yeargun/Downloads/TESTPDFS/KONUANLATIMI-TYT-COG-AYDINYAYINLARI-MATEMATIKCOGRAFYA.pdf
Processing /Users/yeargun/Downloads/TESTPDFS/KONUANLATIMI-TYT-FIZ-AYDINYAYINLARI-OPTIK.pdf
All documents processed and uploaded to Milvus.


In [70]:
# Fetch the collection's description and print it as plain text.
collection_info = client.describe_collection(collection_name=collection_name)
print(collection_info)

{'collection_name': 'pdf_embeddings', 'auto_id': True, 'num_shards': 1, 'description': 'PDF embeddings and metadata', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'auto_id': True, 'is_primary': True}, {'field_id': 101, 'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 768}}, {'field_id': 102, 'name': 'content', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 65535}}, {'field_id': 103, 'name': 'page_number', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}}, {'field_id': 104, 'name': 'file_name', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 1024}}], 'aliases': [], 'collection_id': 450664324001316588, 'consistency_level': 2, 'properties': {}, 'num_partitions': 1, 'enable_dynamic_field': True}
