# Embedding

This notebook demonstrates how to create vector embeddings for a collection of source documents and how to use them to retrieve the documents most relevant to a query.



In [1]:
# pip install sentence_transformers
import dspy
from sentence_transformers import SentenceTransformer

# Use a small, efficient retrieval model that runs locally on the CPU
model_id = "sentence-transformers/static-retrieval-mrl-en-v1"
model = SentenceTransformer(model_id, device="cpu")

# Hand the model's encode function to DSPy so it can be used as an embedder
embedder = dspy.Embedder(model.encode)

# Smoke test: two input strings should come back as two 1024-dimensional vectors
sample_texts = ["hello", "world"]
embeddings = embedder(sample_texts, batch_size=1)
expected_shape = (len(sample_texts), 1024)
assert embeddings.shape == expected_shape

modules.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/226 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

0_StaticEmbedding/model.safetensors:   0%|          | 0.00/125M [00:00<?, ?B/s]

In [None]:
# Read every .html file directly inside a directory (non-recursive) and extract its plain text
import os
from bs4 import BeautifulSoup
def read_html_files(directory):
    """Extract the plain text of every ``.html`` file directly inside ``directory``.

    Only the top level of ``directory`` is scanned (no recursion). Files are
    processed in sorted filename order so the returned list is reproducible
    across runs and machines; ``os.listdir`` on its own returns entries in
    arbitrary order.

    Args:
        directory: Path of the folder containing the HTML files.

    Returns:
        A list of strings, one per HTML file, each holding the text that
        BeautifulSoup's ``get_text()`` extracts from that file.

    Raises:
        OSError: If ``directory`` cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file's content is not valid UTF-8.
    """
    texts = []
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".html"):
            continue
        path = os.path.join(directory, filename)
        # Skip sub-directories that merely happen to be named "*.html".
        if not os.path.isfile(path):
            continue
        with open(path, "r", encoding="utf-8") as file:
            soup = BeautifulSoup(file, "html.parser")
            texts.append(soup.get_text())
    return texts


In [4]:
# Load the plain text of every HTML page in the Zelda source collection.
zelda = read_html_files("../PragmatiCQA-sources/The Legend of Zelda")

# An f-string avoids the doubled space that print()'s separator plus the
# literal's leading space produced ("406  documents ...").
print(f"{len(zelda)} documents loaded from The Legend of Zelda")

406  documents loaded from The Legend of Zelda


In [None]:
# Peek at the beginning of the first document to see what the extracted text looks like.
preview_chars = 10_000  # upper bound on how many characters to print
first_doc = zelda[0]
print(first_doc[:preview_chars])









      Malon
     






         Artwork
        

         Render
        

         Model
        

         Sprite
        






            OoT
           

            OoS
           







            OoT3D
           







            OoT
           

            OoT3D
           







            OoS
           

            FSA
           

            TMC
           









      Race
     


       Hylian
       Human
      




      Gender
     


       Female
      




      Main appearance(s)
     




         Ocarina of Time
        



         Oracle of Seasons
        


        Four Swords Adventures
       


         The Minish Cap
        






      Other appearance(s)
     



        Ocarina of Time
       
       (Himekawa)
       

         Super Smash Bros. for Nintendo 3DS
        


        Super Smash Bros. Ultimate
       





      Era(s)
     


       Force Era
       Era of the Hero of Time
       Era of Light and Dark
       Sh

In [8]:
max_characters = 10000  # for truncating >99th percentile of documents
topk_docs_to_retrieve = 5  # number of documents to retrieve per search query

# Apply the max_characters cap to every document; the limit was previously
# defined but never used, so the full text of every page was being embedded.
corpus = [
    doc[:max_characters]
    for doc in read_html_files("../PragmatiCQA-sources/The Legend of Zelda")
]
print(f"Loaded {len(corpus)} documents. Will encode them below.")

# Alternative (disabled): use a hosted embedding model instead of the local one.
# embedder = dspy.Embedder('openai/text-embedding-3-small', dimensions=512)

# Build the retriever over the corpus using the embedder defined earlier in the notebook.
search = dspy.retrievers.Embeddings(embedder=embedder, corpus=corpus, k=topk_docs_to_retrieve)


Loaded 406 documents. Will encode them below.


In [9]:
# Example query; change the question to probe other aspects of the corpus.
query = "What is the main quest in The Legend of Zelda?"

# The call is the cell's last expression, so the notebook displays the returned
# Prediction holding the top 5 documents related to the query.
search(query)


Prediction(
    passages=['\n\n\n\n\n\n\n\n\n       Second Quest\n      \n\n\n\n\n\n\n        The Second Quest from\n        \n         The Wind Waker HD\n        \n\n\n\n\n\n\n      Game(s)\n     \n\n\n\n        The Legend of Zelda\n       \n\n\n\n        The Adventure of Link\n       \n\n\n\n        The Wind Waker\n       \n\n\n\n\n\n      Other media\n     \n\n\n\n        Zelda\n       \n       (Game & Watch)\n      \n\n\n\n\n      Feature(s)\n     \n\n      Increased difficulty\n      Alternate Dungeons\n     \n\n\n\n\n   The\n   \n    Second Quest\n   \n   ,\n   also known as\n   \n    Second Round\n   \n   ,\n   is a recurring mode in\n   \n\n     The Legend of Zelda\n    \n    series\n   \n   .\n   The Second Quest goes unnamed in\n   \n\n     The Adventure of Link\n    \n\n   .\n  \n\n\n\n     Contents\n    \n\n\n\n\n\n       1\n      \n\n       Overview\n      \n\n\n\n\n\n         1.1\n        \n\n\n          The Legend of Zelda\n         \n\n\n\n\n\n\n         1.2\n        \n