# Extract Embedding

## OpenAI API

In [1]:
%pip install -qU langchain-openai

Note: you may need to restart the kernel to use updated packages.


In [2]:
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings

load_dotenv()

True

### Text Embedding

In [3]:
# Sample sentence used as the embedding input.
text = (
    "Avengers: Endgame is a 2019 American superhero film "
    "based on the Marvel Comics superhero team the Avengers."
)

In [4]:
# Client for OpenAI's "text-embedding-3-large" model; credentials are presumably
# read from the environment populated by load_dotenv() — TODO confirm.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# Embed the single sample sentence as a query; the result is a flat list of floats.
query_result = embeddings.embed_query(text)

In [5]:
# Show the first five components followed by the vector length.
print(query_result[:5], len(query_result))

[-0.07305790808685256, -0.0026802903766939366, -0.012657153051423061, 0.005067741723606172, -0.0007931056668536663] 3072


### Text Embedding with Fewer Dimensions

In [6]:
# Same model, but request 512-dimensional vectors through the `dimensions` argument.
embeddings_512 = OpenAIEmbeddings(model="text-embedding-3-large", dimensions=512)

# Embed the same sample sentence with the reduced-dimension client.
query_result_512 = embeddings_512.embed_query(text)

In [7]:
# Show the first five components followed by the (reduced) vector length.
print(query_result_512[:5], len(query_result_512))

[-0.13733914196779293, -0.005058522692183418, -0.023718511171539485, 0.009521025957888597, -0.0015015101200422743] 512


### Document Embedding

In [8]:
# Two sentences about the film; each list item is embedded as a separate document.
doc = [
    "Avengers: Endgame is a 2019 American superhero film "
    "based on the Marvel Comics superhero team the Avengers.",
    "Produced by Marvel Studios and distributed by Walt Disney Studios Motion Pictures, "
    "it is the direct sequel to Avengers: Infinity War (2018) "
    "and the 22nd film in the Marvel Cinematic Universe (MCU).",
]

In [9]:
# Full-dimension client again (same configuration as the one used for the query embedding).
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# Embed every string in `doc`; doc_result[i] is the vector for doc[i].
doc_result = embeddings.embed_documents(doc)

In [10]:
# First five components of the first document's vector.
doc_result[0][:5]

[-0.07305790808685256,
 -0.0026802903766939366,
 -0.012657153051423061,
 0.005067741723606172,
 -0.0007931056668536663]

## LLaMA Embedding

### Hugging Face

In [4]:
import torch
from transformers import AutoModel, AutoTokenizer

# Hugging Face Hub id of the checkpoint to load.
# NOTE(review): this appears to be a gated repository, so the download presumably needs an
# authenticated Hugging Face account with access granted — confirm.
model_name = "meta-llama/Meta-Llama-3-8B"
# Tokenizer and base model for that checkpoint; the model's `last_hidden_state` output
# is what get_text_embedding pools over.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def get_text_embedding(text, model, tokenizer):
    """Return a mean-pooled embedding of ``text`` from the model's last hidden layer.

    Padded positions are excluded from the average by weighting each token with
    the tokenizer's attention mask, so batched inputs of different lengths get
    the same vectors they would get when embedded one at a time.

    Args:
        text: A string, or a list of strings to embed as one batch.
        model: Callable model whose output exposes ``last_hidden_state``
            (e.g. a Hugging Face ``AutoModel``).
        tokenizer: Tokenizer matching ``model``. If it has no padding token,
            its EOS token is assigned as the padding token (this mutates the
            tokenizer) so that ``padding=True`` does not fail.

    Returns:
        numpy.ndarray: The pooled vector(s) with size-1 axes squeezed out:
        1-D for a single text, 2-D ``(batch, hidden)`` for several texts.
    """
    # LLaMA tokenizers ship without a pad token, and asking for padding without
    # one raises a ValueError; reuse EOS as the pad token in that case.
    if getattr(tokenizer, "pad_token", None) is None:
        eos_token = getattr(tokenizer, "eos_token", None)
        if eos_token is not None:
            tokenizer.pad_token = eos_token

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    hidden_states = outputs.last_hidden_state

    attention_mask = inputs.get("attention_mask")
    if attention_mask is None:
        # No mask available: fall back to a plain average over all positions.
        embeddings = hidden_states.mean(dim=1)
    else:
        # Zero out padded positions, then divide by the number of real tokens.
        mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)  # guard against all-pad rows
        embeddings = (hidden_states * mask).sum(dim=1) / token_counts
    return embeddings.squeeze().numpy()

# Same sample sentence as in the OpenAI section (assigned again here).
text = "Avengers: Endgame is a 2019 American superhero film based on the Marvel Comics superhero team the Avengers."
# Mean-pooled embedding of the sentence from the LLaMA model loaded above.
embedding = get_text_embedding(text, model, tokenizer)
# Vector length, then the first five components.
print(len(embedding))
print(embedding[:5])

### LlamaIndex (BERT)

In [None]:
%pip install llama-index-embeddings-huggingface
%pip install llama-index-embeddings-instructor

In [None]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Load the BAAI/bge-small-en-v1.5 embedding model through LlamaIndex's Hugging Face wrapper.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

# NOTE(review): this rebinds `embeddings`, which earlier cells use for the OpenAIEmbeddings
# client; from here on the name refers to the vector returned below.
embeddings = embed_model.get_text_embedding("Hello World!")
# Vector length, then the first five components.
print(len(embeddings))
print(embeddings[:5])

### LlamaIndex (llama.cpp)

In [None]:
# For Mac(apple silicon) users

!CMAKE_ARGS="-DLLAMA_METAL=on" pip install llama-cpp-python
!pip install llama-cpp-python \
  --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/metal