### 임베딩 생성

In [None]:
!pip install numpy scikit-learn

In [1]:
from google import genai

# Build the API client; no arguments are passed explicitly here.
# NOTE(review): presumably credentials are picked up from the environment — confirm.
client = genai.Client()

# Request an embedding for a single string from the gemini-embedding-001 model.
result = client.models.embed_content(
    contents="What is the meaning of life?",
    model="gemini-embedding-001",
)

# Show the embeddings carried by the response.
print(result.embeddings)

[ContentEmbedding(
  values=[
    -0.022374554,
    -0.004560777,
    0.013309286,
    -0.0545072,
    -0.02090443,
    <... 3067 more items ...>,
  ]
)]


In [2]:
from google import genai

client = genai.Client()

# Embed several strings with one request by passing them as a list.
sentences = [
    "What is the meaning of life?",
    "What is the purpose of existence?",
    "How do I bake a cake?",
]
result = client.models.embed_content(model="gemini-embedding-001", contents=sentences)

# Print each returned embedding on its own line.
for item in result.embeddings:
    print(item)

values=[-0.022374554, -0.004560777, 0.013309286, -0.0545072, -0.02090443, 0.012355714, 0.015772128, 0.0054723006, 0.031729158, 0.0058553913, 0.027073925, -0.0045324513, -0.01544016, 0.031618375, 0.121548004, 0.01925409, 0.0008599909, 0.0061733276, -0.009662611, -0.015545654, 0.017062597, -0.008637558, -0.017125048, 0.0077396077, -0.0153139075, 0.011430326, 0.020329107, -0.00451, 0.024133444, 0.0070407446, 0.020197608, 0.0015623937, -0.008911156, 0.028138846, -0.017435355, -0.012656962, 0.009481721, -0.016410641, -0.015019126, 0.0144167375, -0.023614116, -0.010397569, -0.0024164703, -0.019404082, 0.019276941, -0.011112846, 0.014422737, -0.042639293, -0.014391114, 0.008184219, -0.012202394, 0.012318022, -0.010061107, -0.15826157, 0.015428178, 0.01053043, -0.0069031497, -0.010121202, -0.025695775, -0.028000489, -0.0070861652, -0.014084083, -0.008487853, -0.022122845, 0.0084621115, -0.008990799, -0.020783192, 0.010838266, 0.0013916494, 0.011982546, -0.015935048, 0.015096186, -0.0058905873,

In [6]:
# Number of values in the first embedding held by `result`; shown as the cell's output.
len(result.embeddings[0].values)

3072

다음 예시에서는 `SEMANTIC_SIMILARITY` 작업 유형(`task_type`)을 사용하여 텍스트 문자열 간의 의미가 얼마나 유사한지 확인하는 방법을 보여줍니다.



In [10]:
from google import genai
from google.genai import types
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

client = genai.Client()

texts = [
    "What is the meaning of life?",
    "What is the purpose of existence?",
    "How do I bake a cake?",
]

# Embed every text in a single request, selecting the SEMANTIC_SIMILARITY task type.
similarity_config = types.EmbedContentConfig(task_type="SEMANTIC_SIMILARITY")
response = client.models.embed_content(
    model="gemini-embedding-001",
    contents=texts,
    config=similarity_config,
)

# Convert each returned embedding's values into a NumPy array.
result = [np.array(embedding.values) for embedding in response.embeddings]

# Pairwise cosine similarity; a higher score means greater semantic similarity.
embeddings_matrix = np.array(result)
similarity_matrix = cosine_similarity(embeddings_matrix)

# Visit each unordered pair once: only columns to the right of the current row.
for i in range(len(texts)):
    text1 = texts[i]
    for j, text2 in enumerate(texts[i + 1:], start=i + 1):
        similarity = similarity_matrix[i, j]
        print(f"Similarity between '{text1}' and '{text2}': {similarity:.4f}")

Similarity between 'What is the meaning of life?' and 'What is the purpose of existence?': 0.9417
Similarity between 'What is the meaning of life?' and 'How do I bake a cake?': 0.7676
Similarity between 'What is the purpose of existence?' and 'How do I bake a cake?': 0.7471


### 임베딩 벡터 크기 제어

In [11]:
from google import genai
from google.genai import types

client = genai.Client()

# Request a 768-value embedding by setting output_dimensionality in the config.
reduced_config = types.EmbedContentConfig(output_dimensionality=768)
result = client.models.embed_content(
    model="gemini-embedding-001",
    contents="What is the meaning of life?",
    config=reduced_config,
)

# Single-target unpacking raises unless the response holds exactly one embedding.
(embedding_obj,) = result.embeddings
embedding_length = len(embedding_obj.values)

print(f"Length of embedding: {embedding_length}")

Length of embedding: 768


3072차원 임베딩은 이미 정규화된 상태로 반환됩니다. 정규화된 임베딩을 사용하면 벡터의 크기가 아닌 방향을 비교하게 되므로 의미 유사성을 더 정확하게 측정할 수 있습니다. 768, 1536 등 다른 차원의 임베딩은 다음과 같이 직접 정규화해야 합니다.

In [12]:
import numpy as np
from numpy.linalg import norm

# Scale the values of `embedding_obj` (obtained earlier in the notebook) to unit length
# by dividing the vector by its norm. The imported `norm` is used throughout so the
# import is no longer dead code next to `np.linalg.norm` calls.
embedding_values_np = np.array(embedding_obj.values)
normed_embedding = embedding_values_np / norm(embedding_values_np)

print(f"Normed embedding length: {len(normed_embedding)}")
print(f"Norm of normed embedding: {norm(normed_embedding):.6f}") # Should be very close to 1

Normed embedding length: 768
Norm of normed embedding: 1.000000
