In [7]:
# Helpers to embed a sentence with an OpenAI embedding model and to compare two embeddings

import os
from openai import OpenAI
from dotenv import load_dotenv
import numpy as np

# Load environment variables via python-dotenv so the API key can be supplied
# from a local .env file instead of being written into the notebook.
load_dotenv()

# Module-level OpenAI client used by find_embedding below; the key is read
# from the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def find_embedding(sentence):
    """Return the embedding of a single piece of text.

    The text is sent as a one-element batch to the module-level ``client``
    using the "text-embedding-3-large" model, and the ``embedding`` field of
    the first (only) result is returned.

    Args:
        sentence: The text to embed.

    Returns:
        The ``embedding`` attribute of the first item in the API response.
    """
    result = client.embeddings.create(
        model="text-embedding-3-large",
        input=[sentence],
    )
    first_item = result.data[0]
    return first_item.embedding

def find_cosine_similarity(embedding1, embedding2):
    """Return the cosine of the angle between two vectors.

    Computed as the dot product divided by the product of the two
    Euclidean norms.

    Args:
        embedding1: First vector (anything ``np.dot`` / ``np.linalg.norm`` accept).
        embedding2: Second vector, same length as ``embedding1``.

    Returns:
        The cosine similarity as a NumPy floating-point scalar.
    """
    dot_product = np.dot(embedding1, embedding2)
    norm_product = np.linalg.norm(embedding1) * np.linalg.norm(embedding2)
    return dot_product / norm_product

def find_euclidean_distance(embedding1, embedding2):
    """Return the Euclidean (L2) distance between two vectors.

    Args:
        embedding1: First vector, as a NumPy array or any sequence of numbers
            (for example the list returned by ``find_embedding``).
        embedding2: Second vector, same length as ``embedding1``.

    Returns:
        The L2 norm of ``embedding1 - embedding2`` as a NumPy float.
    """
    # Plain Python lists do not support element-wise `-`, so coerce both
    # operands to arrays first; existing ndarray inputs pass through unchanged.
    vector1 = np.asarray(embedding1)
    vector2 = np.asarray(embedding2)
    return np.linalg.norm(vector1 - vector2)


In [8]:
# Create pairs of sentences and find the relative vector of sentence1 to sentence2
def create_sentence_pairs():
    """Return the fixed list of word pairs analysed in this notebook.

    The list holds five male/female word pairs followed by two pairs of
    unrelated nouns, each as a ``(first, second)`` tuple.
    """
    male_female_pairs = [
        ("king", "queen"),
        ("man", "woman"),
        ("brother", "sister"),
        ("father", "mother"),
        ("boy", "girl"),
    ]
    unrelated_pairs = [
        ("apple", "orange"),
        ("computer", "book"),
    ]
    return male_female_pairs + unrelated_pairs

def find_relative_vector(sentence1, sentence2):
    """Return the embedding of ``sentence1`` minus the embedding of ``sentence2``.

    Both texts are embedded with ``find_embedding`` (first ``sentence1``, then
    ``sentence2``) and converted to NumPy arrays before subtracting.

    Args:
        sentence1: Text whose embedding is the minuend.
        sentence2: Text whose embedding is the subtrahend.

    Returns:
        A NumPy array holding the element-wise difference.
    """
    first_vector = np.array(find_embedding(sentence1))
    second_vector = np.array(find_embedding(sentence2))
    return first_vector - second_vector

# Build the word pairs once at notebook level; the analysis cell below reads
# `sentence_pairs` both to compute the matrix and to label the printed results.
sentence_pairs = create_sentence_pairs()


In [9]:
# Compare the relative vectors and analyze their relations

# Calculate cosine similarity between relative vectors
def compare_relative_vectors(pairs):
    """Return the pairwise cosine-similarity matrix of the pairs' difference vectors.

    For every ``(sentence1, sentence2)`` in ``pairs`` the difference vector is
    obtained from ``find_relative_vector``; entry ``[i, j]`` of the result is
    ``find_cosine_similarity`` of difference vector ``i`` and difference
    vector ``j``.

    Args:
        pairs: Iterable of two-element ``(sentence1, sentence2)`` items.

    Returns:
        An ``n x n`` NumPy array, where ``n`` is the number of pairs.
    """
    offsets = [find_relative_vector(first, second) for first, second in pairs]

    count = len(offsets)
    matrix = np.zeros((count, count))

    # Fill every cell, including the diagonal and both triangles.
    for row, row_vector in enumerate(offsets):
        for col, col_vector in enumerate(offsets):
            matrix[row, col] = find_cosine_similarity(row_vector, col_vector)

    return matrix

# Build the pairwise similarity matrix for all word-pair difference vectors
similarity_matrix = compare_relative_vectors(sentence_pairs)

print("Cosine similarity matrix of relative vectors:")
print(similarity_matrix)

# Report each unordered combination of relations exactly once by walking
# only the upper triangle of the matrix (j strictly greater than i).
print("\nInterpretation:")
pair_count = len(sentence_pairs)
for i in range(pair_count):
    pair1 = sentence_pairs[i]
    for j in range(i + 1, pair_count):
        pair2 = sentence_pairs[j]
        similarity = similarity_matrix[i, j]
        print(f"The relation '{pair1[0]}->{pair1[1]}' compared to '{pair2[0]}->{pair2[1]}' has a cosine similarity of {similarity:.4f}")

# Additional analysis
# print("\nAdditional observations:")
# print("1. High similarity values indicate that the gender relationships are consistent across different word pairs.")
# print("2. Lower similarity values might suggest nuances in gender relationships for different contexts or word types.")
# print("3. The diagonal of the matrix (if printed) would show 1.0, as each vector is identical to itself.")
# print("4. The matrix is symmetric by construction, because cosine similarity does not depend on the order of its two arguments.")

Cosine similarity matrix of relative vectors:
[[ 1.          0.25769935  0.2129429   0.2374572   0.12147884  0.01944632
  -0.02003091]
 [ 0.25769935  1.          0.32304643  0.29285543  0.36008459 -0.03002277
  -0.03896912]
 [ 0.2129429   0.32304643  1.          0.2413127   0.30908425 -0.00563698
   0.01414469]
 [ 0.2374572   0.29285543  0.2413127   1.          0.20599248  0.01674988
  -0.02066101]
 [ 0.12147884  0.36008459  0.30908425  0.20599248  1.          0.0134008
  -0.06684854]
 [ 0.01944632 -0.03002277 -0.00563698  0.01674988  0.0134008   1.
   0.06793732]
 [-0.02003091 -0.03896912  0.01414469 -0.02066101 -0.06684854  0.06793732
   1.        ]]

Interpretation:
The relation 'king->queen' compared to 'man->woman' has a cosine similarity of 0.2577
The relation 'king->queen' compared to 'brother->sister' has a cosine similarity of 0.2129
The relation 'king->queen' compared to 'father->mother' has a cosine similarity of 0.2375
The relation 'king->queen' compared to 'boy->girl' has 