# Embedding

This notebook demonstrates how to create vector-embedding representations of a collection of source documents and how to search them by semantic similarity.



In [1]:
# pip install sentence_transformers
import dspy
from sentence_transformers import SentenceTransformer

# Local retrieval model, run on CPU (described upstream as extremely efficient).
EMBEDDING_MODEL_NAME = "sentence-transformers/static-retrieval-mrl-en-v1"
model = SentenceTransformer(EMBEDDING_MODEL_NAME, device="cpu")

# Wrap the model's encode function so DSPy can call it as an embedder.
embedder = dspy.Embedder(model.encode)

# Smoke test: two input strings should yield one 1024-wide vector each.
sample_texts = ["hello", "world"]
embeddings = embedder(sample_texts, batch_size=1)

assert embeddings.shape == (len(sample_texts), 1024)

modules.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/226 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

0_StaticEmbedding/model.safetensors:   0%|          | 0.00/125M [00:00<?, ?B/s]

In [4]:
# Read the .html files located directly inside a directory (no recursion) and extract their text
import os
from bs4 import BeautifulSoup
def read_html_files(directory):
    """Extract the text of every ``.html`` file directly inside ``directory``.

    Only the top level of ``directory`` is scanned; sub-directories are not
    entered. Entries are visited in sorted filename order so the returned
    list (and any index into it, e.g. ``docs[0]``) is the same on every run:
    ``os.listdir`` on its own returns entries in arbitrary order.

    Args:
        directory: Path of the folder that holds the HTML files.

    Returns:
        A list of strings, one per HTML file, each being the result of
        ``BeautifulSoup(...).get_text()`` for that file. Empty if the folder
        contains no ``.html`` files.
    """
    texts = []
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".html"):
            continue
        path = os.path.join(directory, filename)
        # Skip anything that merely looks like an HTML file (e.g. a folder
        # named "foo.html"), which open() could not read.
        if not os.path.isfile(path):
            continue
        with open(path, 'r', encoding='utf-8') as file:
            soup = BeautifulSoup(file, 'html.parser')
            texts.append(soup.get_text())
    return texts


In [5]:
# Load the text of every HTML page in the Zelda source folder.
zelda_dir = "../PragmatiCQA-sources/The Legend of Zelda"
zelda = read_html_files(zelda_dir)

# Report how many documents were read.
zelda_doc_count = len(zelda)
print(zelda_doc_count, " documents loaded from The Legend of Zelda")

406  documents loaded from The Legend of Zelda


In [8]:
# Preview the first document, capped at its first 10000 characters.
first_doc_preview = zelda[0][:10000]
print(first_doc_preview)










       Non-Canon Information
      








      The Legend of Zelda: The Minish Cap (Himekawa)
     



      The cover of The Minish Cap manga
     



      Author(s)
     


       Akira Himekawa
      




      Illustrator(s)
     


       Akira Himekawa
      




      Publisher(s)
     



      Release date(s)
     


       December 1, 2009
       January 2006
       April 28, 2010
       
        Legendary Edition:
       
       May 9, 2017
       2017
       June 2018
      




      ISBN
     


       ISBN-10:
       ISBN-10:
       ISBN-13:
       ISBN-13:
       ISBN-13:
      






    The Minish Cap Manga
   
   is a
   manga
   by
   Akira Himekawa
   loosely based on
   

     The Minish Cap
    

   . It was released in Japan in January 2006 and made available in English through VIZ Media on December 1, 2009.
  

   A Legendary Edition also containing the
   
    Phantom Hourglass
   
   manga
   by
   Akira Himekawa
   was released in North America 

In [11]:
# Upper bound on document length, in characters. The original note says this
# truncates documents above the 99th percentile; that percentile has not been
# verified for this corpus.
max_characters = 10000
topk_docs_to_retrieve = 5  # number of documents to retrieve per search query

# Apply the length cap while loading: without the slice, max_characters would
# be defined but never used and every document would be embedded in full.
corpus = [doc[:max_characters] for doc in read_html_files("../PragmatiCQA-sources/The Matrix")]
print(f"Loaded {len(corpus)} documents. Will encode them below.")

# Alternative (hosted) embedder, kept for reference:
# embedder = dspy.Embedder('openai/text-embedding-3-small', dimensions=512)
# Build the retriever over the corpus using the embedder defined earlier in the notebook.
search = dspy.retrievers.Embeddings(embedder=embedder, corpus=corpus, k=topk_docs_to_retrieve)


Loaded 487 documents. Will encode them below.


In [12]:
# Ask the retriever for the passages from The Matrix corpus that best match
# a question about the main character (top 5 results).
# Change the question to probe other aspects of the corpus.
question = "Whats the name of main character?"
search(question)


Prediction(
    passages=['\n\n\n\n\n\n\n\n\n\n        “\n       \n\n\n\n\n         Billions of people living out their lives...oblivious\n        \n\n         ”\n        \n\n\n\n\n\n\n        ―\n        \n         Agent Smith\n        \n\n\n\n\n\n\n\n   The\n   \n    Mega City\n   \n   (called the\n   \n    Metro World\n   \n   by the\n   \n    redpills\n   \n   who have studied its architecture) is the name of the large, sprawling metropolis in the\n   \n    Matrix\n   \n   where most of the events of\n   \n\n     The Matrix\n    \n    franchise\n   \n   take place. It is the home of many\n   \n    programs\n   \n   , redpills and\n   \n    bluepills\n   \n   .\n  \n\n\n\n     Contents\n    \n\n\n\n\n\n       1\n      \n\n       Purpose\n      \n\n\n\n\n\n       2\n      \n\n       Features\n      \n\n\n\n\n\n       3\n      \n\n       Notes\n      \n\n\n\n\n\n       4\n      \n\n       Behind the Scenes\n      \n\n\n\n\n\n\n    Purpose\n   \n\n\n   Mega City was created by the\n   \