In [None]:
from dotenv import load_dotenv
load_dotenv()

from custom_gmail_reader import CustomGmailReader

# Reader settings: empty query string, max_results=50, results_per_page=10,
# and no pre-built service object (service=None).
reader_options = {
    "query": "",
    "max_results": 50,
    "results_per_page": 10,
    "service": None,
}
loader = CustomGmailReader(**reader_options)

# Fetch the emails; later cells read `documents`.
documents = loader.load_data()

# Preview: total count, then header metadata and the first 1000 characters
# of text for the first 20 documents.
# NOTE(review): this prints real mailbox content - clear the output before sharing.
print(f"Number of documents: {len(documents)}")
header_fields = (
    ("To", "to"),
    ("From", "from"),
    ("Subject", "subject"),
    ("Date", "date"),
)
for position, email in enumerate(documents[:20], start=1):
    print(f"Document {position}:")
    for label, key in header_fields:
        # Missing metadata keys are shown as 'N/A'.
        print(f"{label}: {email.metadata.get(key, 'N/A')}")
    print(f"Content snippet: {email.text[:1000]}...")
    print("=" * 50)

In [3]:
from llama_index.core import VectorStoreIndex
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import VectorIndexRetriever

from llama_index.core import Settings
from llama_index.core.callbacks import CallbackManager
from langfuse.llama_index import LlamaIndexCallbackHandler

# Re-read the .env file so this cell also works when re-run on its own
# (load_dotenv itself is imported in the first cell).
load_dotenv()

# Add tracing: register the Langfuse handler as the global LlamaIndex
# callback manager so the calls below are traced.
langfuse_callback_handler = LlamaIndexCallbackHandler()
Settings.callback_manager = CallbackManager([langfuse_callback_handler])

try:
    # Create a vector index from the `documents` loaded in the first cell.
    index = VectorStoreIndex.from_documents(documents)

    # Create a retriever over that index (only `index` is passed, so every
    # other retriever option keeps its default).
    retriever = VectorIndexRetriever(index=index)

    # Create a query engine that answers from what the retriever returns.
    query_engine = RetrieverQueryEngine(retriever=retriever)

    # Example query
    response = query_engine.query("What paid interviews have I gotten invites for?")
    print(response)
finally:
    # Explicitly flush the Langfuse handler so queued trace events are sent
    # even when indexing or the query raises.
    langfuse_callback_handler.flush()

You have received invites for paid interviews related to two different studies.


In [5]:
from llama_index.embeddings.openai import OpenAIEmbedding
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Embedding model used for every phrase in this cell.
embedding_model = OpenAIEmbedding(model="text-embedding-ada-002")

# Five trivia questions, then one statement answering each question in the
# same order, so phrases[k] and phrases[k + 5] form a question/answer pair.
questions = [
    "Who was the first president of the United States?",
    "What is the capital city of France?",
    "In what year did humans first land on the moon?",
    "Which element on the periodic table has the chemical symbol O?",
    "What is the largest planet in the solar system?",
]
answers = [
    "The first president of the United States was George Washington.",
    "The capital city of France is Paris.",
    "Humans first landed on the moon in the year 1969.",
    "The chemical symbol O represents the element Oxygen.",
    "The largest planet in the solar system is Jupiter.",
]
phrases = questions + answers

# Embed all phrases in one batch call and convert the result to a NumPy array.
embeddings = embedding_model.get_text_embedding_batch(phrases)
embeddings_array = np.array(embeddings)

# Show the first phrase together with the first five elements of its embedding.
print(f"Phrase: {phrases[0]}")
print(f"First 5 elements of its embedding: {embeddings_array[0][:5]}\n")

# Pairwise cosine similarity; entry [a, b] compares phrases[a] with phrases[b].
similarity_matrix = cosine_similarity(embeddings_array)

print("Cosine Similarity Matrix:")
print(np.round(similarity_matrix, 2))
print("\nDetailed Similarity Results:\n")

# Walk the upper triangle so every unordered pair is reported exactly once.
for row, left in enumerate(phrases):
    for col in range(row + 1, len(phrases)):
        right = phrases[col]
        score = similarity_matrix[row, col]
        print(
            "Cosine similarity between:\n"
            f"  '{left}'\n"
            "  and\n"
            f"  '{right}'\n"
            f"  => {score:.4f}\n"
        )

Phrase: Who was the first president of the United States?
First 5 elements of its embedding: [-0.00529869 -0.02196502 -0.01970232 -0.02279548 -0.00797962]

Cosine Similarity Matrix:
[[1.   0.78 0.83 0.76 0.76 0.92 0.75 0.78 0.72 0.73]
 [0.78 1.   0.74 0.75 0.77 0.75 0.94 0.72 0.73 0.73]
 [0.83 0.74 1.   0.75 0.78 0.78 0.72 0.93 0.72 0.74]
 [0.76 0.75 0.75 1.   0.77 0.73 0.74 0.73 0.93 0.75]
 [0.76 0.77 0.78 0.77 1.   0.72 0.75 0.75 0.73 0.93]
 [0.92 0.75 0.78 0.73 0.72 1.   0.79 0.78 0.74 0.76]
 [0.75 0.94 0.72 0.74 0.75 0.79 1.   0.74 0.76 0.78]
 [0.78 0.72 0.93 0.73 0.75 0.78 0.74 1.   0.74 0.76]
 [0.72 0.73 0.72 0.93 0.73 0.74 0.76 0.74 1.   0.76]
 [0.73 0.73 0.74 0.75 0.93 0.76 0.78 0.76 0.76 1.  ]]

Detailed Similarity Results:

Cosine similarity between:
  'Who was the first president of the United States?'
  and
  'What is the capital city of France?'
  => 0.7788

Cosine similarity between:
  'Who was the first president of the United States?'
  and
  'In what year did humans fi

In [9]:
from llama_index.embeddings.openai import OpenAIEmbedding
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import openai

# No API key is assigned in this cell; the placeholder below is left disabled.
# openai.api_key = 'YOUR_OPENAI_API_KEY'

# Embedding model used for every phrase in this cell.
embedding_model = OpenAIEmbedding(model="text-embedding-ada-002")

# Five space-related questions, then one statement answering each question
# in the same order (phrases[k] pairs with phrases[k + 5]).
space_questions = [
    "What year did the first human land on the moon?",
    "Which planet is known as the Red Planet?",
    "What is the largest moon of Saturn?",
    "Who was the first person to travel into space?",
    "What is the name of NASA's rover that landed on Mars in 2021?",
]
space_answers = [
    "The first human landed on the moon in 1969.",
    "The planet known as the Red Planet is Mars.",
    "The largest moon of Saturn is Titan.",
    "Yuri Gagarin was the first person to travel into space.",
    "NASA's rover that landed on Mars in 2021 is named Perseverance.",
]
phrases = space_questions + space_answers

# Embed all phrases in one batch call and convert the result to a NumPy array.
embeddings = embedding_model.get_text_embedding_batch(phrases)
embeddings_array = np.array(embeddings)

# Show the first phrase together with the first five elements of its embedding.
print(f"Phrase: {phrases[0]}")
print(f"First 5 elements of its embedding: {embeddings_array[0][:5]}\n")

# Pairwise cosine similarity; entry [a, b] compares phrases[a] with phrases[b].
similarity_matrix = cosine_similarity(embeddings_array)

print("Cosine Similarity Matrix:")
print(np.round(similarity_matrix, 2))
print("\nDetailed Similarity Results:\n")

# Report each unordered pair once: pair every phrase only with the phrases
# that come after it in the list.
phrase_count = len(phrases)
for first_idx in range(phrase_count):
    first_phrase = phrases[first_idx]
    for second_idx in range(first_idx + 1, phrase_count):
        second_phrase = phrases[second_idx]
        pair_score = similarity_matrix[first_idx, second_idx]
        print(
            "Cosine similarity between:\n"
            f"  '{first_phrase}'\n"
            "  and\n"
            f"  '{second_phrase}'\n"
            f"  => {pair_score:.4f}\n"
        )

Phrase: What year did the first human land on the moon?
First 5 elements of its embedding: [ 0.0055372  -0.03727422  0.00532086 -0.02473242 -0.02219898]

Cosine Similarity Matrix:
[[1.   0.77 0.79 0.86 0.82 0.94 0.76 0.76 0.81 0.79]
 [0.77 1.   0.8  0.79 0.82 0.76 0.95 0.79 0.76 0.8 ]
 [0.79 0.8  1.   0.77 0.77 0.76 0.78 0.94 0.74 0.74]
 [0.86 0.79 0.77 1.   0.8  0.85 0.77 0.75 0.92 0.77]
 [0.82 0.82 0.77 0.8  1.   0.8  0.82 0.74 0.75 0.93]
 [0.94 0.76 0.76 0.85 0.8  1.   0.78 0.78 0.83 0.81]
 [0.76 0.95 0.78 0.77 0.82 0.78 1.   0.81 0.77 0.84]
 [0.76 0.79 0.94 0.75 0.74 0.78 0.81 1.   0.76 0.76]
 [0.81 0.76 0.74 0.92 0.75 0.83 0.77 0.76 1.   0.76]
 [0.79 0.8  0.74 0.77 0.93 0.81 0.84 0.76 0.76 1.  ]]

Detailed Similarity Results:

Cosine similarity between:
  'What year did the first human land on the moon?'
  and
  'Which planet is known as the Red Planet?'
  => 0.7724

Cosine similarity between:
  'What year did the first human land on the moon?'
  and
  'What is the largest moon of

In [10]:
from llama_index.embeddings.openai import OpenAIEmbedding
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import openai

# No API key is assigned in this cell; the placeholder below is left disabled.
# openai.api_key = 'YOUR_OPENAI_API_KEY'

# Embedding model used for the question and all candidate answers.
embedding_model = OpenAIEmbedding(model="text-embedding-ada-002")

# One question plus five candidate answers: the first is the correct one,
# the other four are closely related but wrong.
question = "What spacecraft was used in the mission to carry the first humans to the moon?"
candidate_answers = [
    "Apollo 11 was the spacecraft used to carry the first humans to the moon.",  # correct
    "Apollo 12 was the spacecraft used to carry the first humans to the moon.",  # wrong
    "Apollo 14 was the spacecraft used to carry astronauts on the third successful moon landing mission.",  # wrong
    "Apollo 10 was the spacecraft used to carry the first humans to the moon.",  # wrong
    "Apollo 16 was the spacecraft that carried astronauts to explore the lunar highlands.",  # wrong
]
# The question sits at index 0, followed by the answers in the order above.
phrases = [question, *candidate_answers]

# Embed all phrases in one batch call and convert the result to a NumPy array.
embeddings = embedding_model.get_text_embedding_batch(phrases)
embeddings_array = np.array(embeddings)

# Show the question together with the first five elements of its embedding.
print(f"Phrase: {phrases[0]}")
print(f"First 5 elements of its embedding: {embeddings_array[0][:5]}\n")

# Pairwise cosine similarity; row 0 holds the question's similarity to every phrase.
similarity_matrix = cosine_similarity(embeddings_array)

print("\nDetailed Similarity Results:\n")

# Compare the question only with each candidate answer (skip column 0, the
# question itself).
for answer, score in zip(phrases[1:], similarity_matrix[0, 1:]):
    print(f"Cosine similarity between the question and:\n  '{answer}'\n  => {score:.4f}\n")

Phrase: What spacecraft was used in the mission to carry the first humans to the moon?
First 5 elements of its embedding: [ 0.02234936 -0.01276388  0.02098001 -0.01151388 -0.0080214 ]


Detailed Similarity Results:

Cosine similarity between the question and:
  'Apollo 11 was the spacecraft used to carry the first humans to the moon.'
  => 0.9344

Cosine similarity between the question and:
  'Apollo 12 was the spacecraft used to carry the first humans to the moon.'
  => 0.9309

Cosine similarity between the question and:
  'Apollo 14 was the spacecraft used to carry astronauts on the third successful moon landing mission.'
  => 0.8903

Cosine similarity between the question and:
  'Apollo 10 was the spacecraft used to carry the first humans to the moon.'
  => 0.9284

Cosine similarity between the question and:
  'Apollo 16 was the spacecraft that carried astronauts to explore the lunar highlands.'
  => 0.8864

