# Small to Slide

It's very difficult for a LLM to correctly embed a PPT slide based on text alone. Luckily we have multi-modal LLMs!

#### Goal
Ingest PPT slides using GPT 4o's embedding + Create a scheme to retrieve relevant slides

## Outline
- Setup (API keys, Milvus connection, data loading)
- Pre-processing (Embeddings, Schema creation)
- Ingestion
- Retrieval
- Retrieval Evaluation

### Setup

In [1]:
import os
import re
import asyncio
from typing import Iterator, Tuple
from openai import AsyncOpenAI
openai_api_key = os.environ.get("OPENAI_API_KEY")
zilliz_uri = os.environ.get("ZILLIZ_URI")
zilliz_api_key = os.environ.get("ZILLIZ_API_KEY")

### Pre-Processing

In [4]:
# Load text data, put them into a list
text_path = "climate_youth_llamaparse.md"#NVDA md file: './llamaparse.md'
def markdown_to_string(file_path: str) -> str:
    try:
        with open(file_path, 'r') as file:
            markdown_string = file.read()
        return markdown_string
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return None
    except IOError:
        print(f"Error reading file: {file_path}")
        return None

markdown_string = markdown_to_string(text_path)
regex = r"\n---\n"#NVDA regex: r"<!---\s*Page\s*\d+\s*--->"
parts = re.split(regex, markdown_string)

# Do just 5 for checking whether this works
# TODO: Delete or Comment this out if you don't want to embed too many things
parts = parts#[:5]

In [5]:
len(parts)

32

In [6]:
# Type alias for readability
Embedding = list[float]

# Get OpenAI client
client = AsyncOpenAI(api_key=openai_api_key)
async def get_embedding(input_text: str) -> Embedding:
    response = await client.embeddings.create(
        input=input_text,
        model="text-embedding-3-small"
    )
    return response.data[0].embedding

In [7]:
# Run embedding concurrently.
# WARNING: There are ~40 pages = ~40 concurrent calls
tasks = [asyncio.create_task(get_embedding(page)) for page in parts]
results: list[Embedding] = await asyncio.gather(*tasks)
text_embedding_pairs: list[Tuple[str, Embedding]] = list(zip(parts, results))

#### PDF to Images

In [6]:
!ls

NVDA-Company-Overview.pdf llamaparse.md             [1m[36moutput_images[m[m
README.md                 milvus-ingest.ipynb       [1m[36mve[m[m


**Warning:**

Installing Poppler can be tedious. Make sure to
- Install
- Add `poppler` to PATH for your machine

In [5]:
from pdf2image import convert_from_path
import os
from tqdm import tqdm


# Specify the path to your PDF file
pdf_path = "NVDA-Company-Overview.pdf"

# Specify the output folder for images
output_folder = "output_images"

# Create the output folder if it doesn't exist
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Convert PDF to images
images = convert_from_path(pdf_path)

# Save each page as an image
for i, image in tqdm(enumerate(images)):
    image.save(f"{output_folder}/page_{i+1}.jpg", "JPEG")

42it [00:03, 11.68it/s]


#### Image Embedding

We'll be embedding each powerpoint slide into image embeddings.

In [10]:
import uuid
from tqdm import tqdm

ids = []
doc_names = []
page_nums = []
content = []
embeddings = []
for idx, i in tqdm(enumerate(text_embedding_pairs)):
    doc_string = f"Climate Youth Magazine, page {idx + 1}"#f"NVDA demo PDF, page {idx+1}"
    ids.append(f"{uuid.uuid5(uuid.NAMESPACE_DNS, doc_string)}")
    doc_names.append("Climate Youth Magazine PDF")#doc_names.append("NVDA demo PDF")
    page_nums.append(idx)
    content.append(i[0])
    embeddings.append(i[1])
data = [ids, doc_names, page_nums, content, embeddings]

32it [00:00, 72393.60it/s]


## Ingestion

### Image referencing in Collection

In [13]:
!pip install pymilvus

Collecting pymilvus
  Downloading pymilvus-2.4.5-py3-none-any.whl.metadata (5.6 kB)
Collecting grpcio<=1.63.0,>=1.49.1 (from pymilvus)
  Downloading grpcio-1.63.0-cp311-cp311-macosx_10_9_universal2.whl.metadata (3.2 kB)
Collecting environs<=9.5.0 (from pymilvus)
  Using cached environs-9.5.0-py2.py3-none-any.whl.metadata (14 kB)
Collecting milvus-lite<2.5.0,>=2.4.0 (from pymilvus)
  Downloading milvus_lite-2.4.9-py3-none-macosx_11_0_arm64.whl.metadata (7.7 kB)
Downloading pymilvus-2.4.5-py3-none-any.whl (197 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m197.3/197.3 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hUsing cached environs-9.5.0-py2.py3-none-any.whl (12 kB)
Downloading grpcio-1.63.0-cp311-cp311-macosx_10_9_universal2.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hDownloading milvus_lite-2.4.9-py3-none-macosx_11_0_arm64.whl (19.9 MB)
[

### Create Collection Schema

In [15]:
# Upload to collection
# TODO: Create references to slides in frontend app

# Create schema
from pymilvus import CollectionSchema, FieldSchema, DataType, Collection, connections, utility

# Connect to Zilliz Cloud
connections.connect(
    alias="default",
    uri=zilliz_uri,
    token=zilliz_api_key
)

collection_name = "nvda_demo_collection"

# Check if the collection exists. If it does, drop it.
if utility.has_collection(collection_name):
    print(f"Collection '{collection_name}' exists. Dropping it...")
    # utility.drop_collection(collection_name)
    # print(f"Collection '{collection_name}' has been dropped.")
else:
    print(f"Collection '{collection_name}' does not exist.")

# Schema
uuid = FieldSchema(
  name="uuid",
  dtype=DataType.VARCHAR,
  max_length=256,
  is_primary=True,
)
doc_name = FieldSchema(
  name="doc_name",
  dtype=DataType.VARCHAR,
  max_length=256,
)
page_number = FieldSchema(
  name="page_number",
  dtype=DataType.INT64,
  max_length=32,
)
text_embedding = FieldSchema(
  name="text_embedding",
  dtype=DataType.FLOAT_VECTOR,
  # Number of dimensions of text-embedding-3-small
  dim=1536
)
content = FieldSchema(
  name="content",
  dtype=DataType.VARCHAR,
  max_length=65535
)
schema = CollectionSchema(
  fields=[uuid, doc_name, page_number, content, text_embedding],
  description="NVDA PDF demo collection",
  enable_dynamic_field=True
)

# Create Collection
collection = Collection(
    name=collection_name,
    schema=schema,
    using='default',
    shards_num=2
    )

# Convert data to Milvus Collection format
result = collection.insert(data)

# Create Index
index_params = {
    "metric_type": "L2",  # Distance metric
    "index_type": "IVF_FLAT", # Index type 
    "params": {"nlist": 1024}  # Index-specific parameters
}

collection.create_index(
    field_name="text_embedding",  # Name of vector field
    index_params=index_params
)

# Finish ingestion
collection.flush()
print(f"Collection {collection_name} created.")
print(f"Added {result.insert_count} items into {collection_name}.")


# Connection clean-up
connections.disconnect("default")

Collection 'nvda_demo_collection' exists. Dropping it...
Collection nvda_demo_collection created.
Added 32 items into nvda_demo_collection.
