<a href="https://colab.research.google.com/github/yonathanarbel/AI-LAW/blob/main/Class_3_in_Class_Embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
import google.generativeai as genai
import numpy as np

# Configure the API key.
# Reuse an API_KEY already defined in this session (e.g. by an earlier cell);
# otherwise fall back to the GOOGLE_API_KEY environment variable so the key
# never has to be hard-coded in the notebook.
import os

API_KEY = globals().get("API_KEY") or os.environ.get("GOOGLE_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "No API key found: define API_KEY or set the GOOGLE_API_KEY "
        "environment variable before running this cell."
    )

genai.configure(api_key=API_KEY)

# Name of the embedding model passed to every embed_content call
model = "models/embedding-001"

# The words to embed
strings = [
    "King",
    "Queen",
    "Man",
    "Woman",
]

# Request one embedding per string. Each response is stored whole; the vector
# itself is read from its 'embedding' key below and in later cells.
embeddings = [
    genai.embed_content(
        model=model,
        content=string,
        task_type="retrieval_document",
    )
    for string in strings
]

# Print the embeddings and their dimensionality
for i, embedding in enumerate(embeddings):
    print(f"Embedding for string {i + 1}:")
    print(embedding)
    print(f"This embedding has {len(embedding['embedding'])} dimensions.")
    print("\n")

# Print the statement about dimensionality
print(f"Each string has {len(embeddings[0]['embedding'])} dimensions. That's quite long!")

Embedding for string 1:
{'embedding': [0.027178003, -0.01151858, -0.052665964, -0.019043, 0.061530538, 0.03299952, 0.021854196, -0.01234523, 0.0008411711, 0.053781983, 0.0082819825, 0.0071732407, -0.010808535, 0.0264187, -0.006220491, -0.012883676, -0.009553963, 0.022467675, 0.025292933, -0.019500853, 0.009159541, 0.017710712, 0.009621225, -0.0036999409, 0.03064459, -0.007086186, 0.011710306, -0.076416165, -0.042412873, 0.031516936, -0.054975502, -0.011172505, -0.03988725, 0.010230315, 0.004816975, -0.039196856, -0.020544749, 0.004499611, -0.02744559, 0.03938242, 0.020787934, -0.012254388, -0.038643666, -0.012709507, 0.025951192, -0.0074189603, -0.049018882, 0.02350979, -0.0013866344, -0.008177414, 0.03468799, 0.011919891, 0.05164807, -0.047049325, 0.005073815, -0.06884785, 0.036499787, 0.0025605306, -0.03132571, 0.029209478, -0.0074172826, 0.025580775, 0.033685196, 0.008732368, -0.04007014, -0.0699546, -0.032945, 0.00846565, 0.03496435, -0.0027154805, 0.022143358, -0.032210473, 0.0557

In [None]:
import numpy as np
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# 'embeddings' is the list of API responses built in the previous cell.
# Pull the raw vector out of each response and stack them into one array
# (one row per string).
embedding_vectors = [response['embedding'] for response in embeddings]
embedding_array = np.array(embedding_vectors)

# Number of samples decides which projection is used
n_samples = len(embedding_array)

if n_samples >= 4:
    # t-SNE, with perplexity capped at n_samples - 1 for small inputs
    perplexity = min(30, n_samples - 1)
    tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
    embeddings_2d = tsne.fit_transform(embedding_array)
    method = "t-SNE"
else:
    # Fewer than four samples: fall back to PCA
    pca = PCA(n_components=2)
    embeddings_2d = pca.fit_transform(embedding_array)
    method = "PCA"

# Scatter plot of the projected points
plt.figure(figsize=(10, 8))
x_coords = embeddings_2d[:, 0]
y_coords = embeddings_2d[:, 1]
plt.scatter(x_coords, y_coords)

# Attach the source string to each point
for idx, label in enumerate(strings):
    plt.annotate(label, (x_coords[idx], y_coords[idx]))

plt.title(f"2D Representation of Embeddings using {method}")
plt.xlabel("Dimension 1")
plt.ylabel("Dimension 2")
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Pairwise Euclidean distances between the 2-D projected points.
# NOTE: these are distances in the reduced 2-D space (embeddings_2d), not in
# the original high-dimensional embedding space.
distances = np.linalg.norm(embeddings_2d[:, None] - embeddings_2d, axis=2)

# Every unordered pair (i, j) with i < j, shared by the plot and the printout
pairs = [
    (i, j)
    for i in range(len(embeddings_2d))
    for j in range(i + 1, len(embeddings_2d))
]

# Plot vectors and distances
plt.figure(figsize=(12, 10))

# Draw one arrow from the origin per word. Each arrow takes its own colour
# from the default colour cycle ("C0", "C1", ...) so the legend entries can
# actually be told apart.
for i, (x, y) in enumerate(embeddings_2d):
    plt.quiver(
        0, 0, x, y,
        angles='xy', scale_units='xy', scale=1,
        color=f"C{i}", label=strings[i],
    )

# Connect every pair of points with a dashed line labelled by its distance
for i, j in pairs:
    x1, y1 = embeddings_2d[i]
    x2, y2 = embeddings_2d[j]
    plt.plot([x1, x2], [y1, y2], 'r--', alpha=0.3)
    midpoint = ((x1 + x2) / 2, (y1 + y2) / 2)
    plt.text(midpoint[0], midpoint[1], f"{distances[i, j]:.2f}", fontsize=8)

plt.title("Vectors and Distances between Word Embeddings")
plt.xlabel("Dimension 1")
plt.ylabel("Dimension 2")
plt.legend()
plt.axis('equal')
plt.grid(True)
plt.show()

# Print pairwise distances
print("Pairwise Distances:")
for i, j in pairs:
    print(f"Distance between '{strings[i]}' and '{strings[j]}': {distances[i, j]:.2f}")

# Calculate and print vector operations (on the full-length embedding vectors)
print("\nVector Operations:")
king = embedding_vectors[strings.index("King")]
queen = embedding_vectors[strings.index("Queen")]
man = embedding_vectors[strings.index("Man")]
woman = embedding_vectors[strings.index("Woman")]

result = np.array(queen) - np.array(king) + np.array(man)

# Report where the analogy result lands: first against 'Woman', then against
# every word for comparison.
target_distance = np.linalg.norm(result - np.array(woman))
print(f"Queen - King + Man vs. 'Woman': Euclidean distance {target_distance:.4f}")
print("Distance from the result to every word:")
for word, vector in zip(strings, embedding_vectors):
    print(f"  '{word}': {np.linalg.norm(result - np.array(vector)):.4f}")


In [None]:

# Get embeddings for "King", "Man", and "Woman".
# Each word is looked up by name rather than by a fixed position, so the
# result stays correct if the order or contents of `strings` change.
king_embedding = embeddings[strings.index("King")]['embedding']
man_embedding = embeddings[strings.index("Man")]['embedding']
woman_embedding = embeddings[strings.index("Woman")]['embedding']

# Perform the arithmetic operation: King - Man + Woman
result_embedding = np.array(king_embedding) - np.array(man_embedding) + np.array(woman_embedding)


def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two vectors.

    Args:
        vec1: First vector (any 1-D sequence or array of numbers).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        The dot product of the vectors divided by the product of their
        Euclidean norms. If either vector has zero norm the similarity is
        undefined; 0.0 is returned instead of dividing by zero (which would
        yield NaN and a runtime warning).
    """
    a = np.asarray(vec1)
    b = np.asarray(vec2)
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

# Compare result_embedding with every available embedding: one
# (string, cosine similarity rounded to 3 decimals) pair per word.
similarities = [
    (string, round(cosine_similarity(result_embedding, embedding['embedding']), 3))
    for string, embedding in zip(strings, embeddings)
]

# Sort the results by similarity, most similar first
sorted_similarities = sorted(similarities, key=lambda pair: pair[1], reverse=True)

# Print the sorted results
for string, similarity in sorted_similarities:
    print(f"Cosine similarity between result and string {string}: {similarity}")
