# Embedding-Based Retrieval with Deep Lake and OpenAI

# 1. Installing the environment

*First run the following cells and restart Google Colab session if prompted. Then run the notebook again cell by cell to explore the code.*

In [None]:
try:
  import deeplake
except:
  !pip install deeplake==3.9.18
  import deeplake

In [None]:
# Google Drive option to store API keys.
# Store your key in a file and read it from there (you can type it directly in the notebook, but it would be visible to anyone next to you).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install openai==1.40.3

Collecting openai==1.40.3
  Downloading openai-1.40.3-py3-none-any.whl.metadata (22 kB)
Collecting httpx<1,>=0.23.0 (from openai==1.40.3)
  Downloading httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)
Collecting jiter<1,>=0.4.0 (from openai==1.40.3)
  Downloading jiter-0.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai==1.40.3)
  Downloading httpcore-1.0.5-py3-none-any.whl.metadata (20 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai==1.40.3)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading openai-1.40.3-py3-none-any.whl (360 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m360.7/360.7 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpcore-1.0.5

In [None]:
# Workaround for Google Colab with Activeloop (April 2024), pending a new Activeloop version.
# The code below overwrites /etc/resolv.conf with the single line "nameserver 8.8.8.8".
# This specifies that the DNS server the system should use is at the IP address 8.8.8.8,
# which is one of Google's Public DNS servers.
with open('/etc/resolv.conf', 'w') as file:
   file.write("nameserver 8.8.8.8")

In [None]:
# Retrieving and setting the OpenAI API key.
# Only the first line of the key file is used; surrounding whitespace is stripped.
# The `with` block guarantees the file handle is closed even if reading fails.
with open("drive/MyDrive/files/api_key.txt", "r") as f:
    API_KEY=f.readline().strip()

# Expose the OpenAI API key through the environment and to the openai module
import os
import openai
os.environ['OPENAI_API_KEY'] =API_KEY
openai.api_key = os.getenv("OPENAI_API_KEY")

In [None]:
# Retrieving and setting the Activeloop API token.
# Only the first line of the token file is used; surrounding whitespace is stripped.
# The `with` block guarantees the file handle is closed even if reading fails.
with open("drive/MyDrive/files/activeloop.txt", "r") as f:
    API_token=f.readline().strip()
ACTIVELOOP_TOKEN=API_token
# Publish the token in the ACTIVELOOP_TOKEN environment variable
os.environ['ACTIVELOOP_TOKEN'] =ACTIVELOOP_TOKEN

In [None]:
!pip install sentence-transformers==3.0.1

Collecting sentence-transformers==3.0.1
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers==3.0.1)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers==3.0.1)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers==3.0.1)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers==3.0.1)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence-transformers==3.0.1)
  Using cached nvidia_cublas_cu12-1

# Retrieval Augmented Generation

### Initiating the query process

**Replace `hub://denis76/space_exploration_v1` with your organization and dataset name**

In [None]:
vector_store_path = "hub://denis76/space_exploration_v1"

In [None]:
from deeplake.core.vectorstore.deeplake_vectorstore import VectorStore
import deeplake.util
# Load the Deep Lake dataset stored at vector_store_path.
ds = deeplake.load(vector_store_path)

|

This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/denis76/space_exploration_v1



\

hub://denis76/space_exploration_v1 loaded successfully.



 

In [None]:
# Create a VectorStore on the same path; this is the object the search code queries.
vector_store = VectorStore(path=vector_store_path)

Deep Lake Dataset in hub://denis76/space_exploration_v1 already exists, loading from the storage


## Input and Query Retrieval

## Input

### Retrieval query

In [None]:
def embedding_function(texts, model="text-embedding-3-small"):
    """Embed one or more texts with the OpenAI embeddings endpoint.

    Args:
        texts: A single string or a list of strings. A single string is
            treated as a one-element batch.
        model: Name of the embedding model passed to the API.

    Returns:
        A list with one embedding per input text, in the order of the
        entries in the API response.
    """
    batch = [texts] if isinstance(texts, str) else texts
    # Newlines are replaced by spaces before the texts are sent.
    cleaned = [item.replace("\n", " ") for item in batch]
    response = openai.embeddings.create(input=cleaned, model=model)
    return [entry.embedding for entry in response.data]

In [None]:
def get_user_prompt():
    """Ask the user for a search query on standard input and return it."""
    query = input("Enter your search query: ")
    return query

def search_query(prompt):
    """Search the vector store for entries matching `prompt`.

    Relies on the notebook-level `vector_store` and `embedding_function`
    being defined before the call.

    Args:
        prompt: The query text handed to the store as `embedding_data`.

    Returns:
        Whatever `vector_store.search` returns for this query.
    """
    return vector_store.search(
        embedding_function=embedding_function,
        embedding_data=prompt,
    )

# Get the user's search query interactively...
#user_prompt = get_user_prompt()
# ...or set the prompt directly (e.g. when it comes from a queue)
user_prompt="Tell me about space exploration on the Moon and Mars."

# Perform the search against the vector store
search_results = search_query(user_prompt)

# Print the raw search results
print(search_results)

{'id': ['741f2712-56fa-11ef-acb7-0242ac1c000c', '741f4d82-56fa-11ef-acb7-0242ac1c000c', '741f35b8-56fa-11ef-acb7-0242ac1c000c', '741f3fea-56fa-11ef-acb7-0242ac1c000c'], 'metadata': [{'source': 'llm.txt'}, {'source': 'llm.txt'}, {'source': 'llm.txt'}, {'source': 'llm.txt'}], 'text': ['Exploration of space, planets, and moons "Space Exploration" redirects here. For the company, see SpaceX . For broader coverage of this topic, see Exploration . Buzz Aldrin taking a core sample of the Moon during the Apollo 11 mission Self-portrait of Curiosity rover on Mars \'s surface Part of a series on Spaceflight History History of spaceflight Space Race Timeline of spaceflight Space probes Lunar missions Mars missions Applications Communications Earth observation Exploration Espionage Military Navigation Settlement Telescopes Tourism Spacecraft Robotic spacecraft Satellite Space probe Cargo spacecraft Crewed spacecraft Apollo Lunar Module Space capsules Space Shuttle Space stations Spaceplanes Vostok

In [None]:
print(user_prompt)

Tell me about space exploration on the Moon and Mars.


In [None]:
# Function to wrap text to a specified width
def wrap_text(text, width=80):
    """Wrap `text` into lines of at most `width` characters.

    Lines are broken at the last space that keeps the line within `width`.
    A run with no usable space is cut hard at `width` characters.

    Args:
        text: The string to wrap.
        width: Maximum line length; must be at least 1.

    Returns:
        The wrapped text, with lines joined by newline characters.

    Raises:
        ValueError: If `width` is smaller than 1 (the loop below could not
            make progress and would never terminate).
    """
    if width < 1:
        raise ValueError("width must be a positive integer")

    lines = []
    while len(text) > width:
        # Search up to and including index `width`: a space sitting exactly
        # at the limit means the first `width` characters form a full line.
        split_index = text.rfind(' ', 0, width + 1)
        if split_index == -1:
            # No space available: hard break at the width limit.
            split_index = width
        lines.append(text[:split_index])
        # Drop the separating whitespace before continuing with the rest.
        text = text[split_index:].strip()
    lines.append(text)
    return '\n'.join(lines)

In [None]:
import textwrap

# Assuming the search results are ordered with the top result first,
# take the first entry of the 'score', 'text' and 'metadata' lists.
top_score = search_results['score'][0]
top_text = search_results['text'][0].strip()
top_metadata = search_results['metadata'][0]['source']

# Print the top search result, wrapping the text for readability
print("Top Search Result:")
print(f"Score: {top_score}")
print(f"Source: {top_metadata}")
print("Text:")
print(wrap_text(top_text))

Top Search Result:
Score: 0.6016581654548645
Source: llm.txt
Text:
Exploration of space, planets, and moons "Space Exploration" redirects here.
For the company, see SpaceX . For broader coverage of this topic, see
Exploration . Buzz Aldrin taking a core sample of the Moon during the Apollo 11
mission Self-portrait of Curiosity rover on Mars 's surface Part of a series on
Spaceflight History History of spaceflight Space Race Timeline of spaceflight
Space probes Lunar missions Mars missions Applications Communications Earth
observation Exploration Espionage Military Navigation Settlement Telescopes
Tourism Spacecraft Robotic spacecraft Satellite Space probe Cargo spacecraft
Crewed spacecraft Apollo Lunar Module Space capsules Space Shuttle Space
stations Spaceplanes Vostok Space launch Spaceport Launch pad Expendable and
reusable launch vehicles Escape velocity Non-rocket spacelaunch Spaceflight
types Sub-orbital Orbital Interplanetary Interstellar Intergalactic List of
space organizatio

## Augmented Input

In [None]:
augmented_input=user_prompt+" "+top_text

In [None]:
print(augmented_input)

Tell me about space exploration on the Moon and Mars. Exploration of space, planets, and moons "Space Exploration" redirects here. For the company, see SpaceX . For broader coverage of this topic, see Exploration . Buzz Aldrin taking a core sample of the Moon during the Apollo 11 mission Self-portrait of Curiosity rover on Mars 's surface Part of a series on Spaceflight History History of spaceflight Space Race Timeline of spaceflight Space probes Lunar missions Mars missions Applications Communications Earth observation Exploration Espionage Military Navigation Settlement Telescopes Tourism Spacecraft Robotic spacecraft Satellite Space probe Cargo spacecraft Crewed spacecraft Apollo Lunar Module Space capsules Space Shuttle Space stations Spaceplanes Vostok Space launch Spaceport Launch pad Expendable and reusable launch vehicles Escape velocity Non-rocket spacelaunch Spaceflight types Sub-orbital Orbital Interplanetary Interstellar Intergalactic List of space organizations Space agen

# Generation and output

In [None]:
import openai
from openai import OpenAI
import time

client = OpenAI()
gpt_model="gpt-4o"
start_time = time.time()  # Start timing before the request

def call_gpt4_with_full_text(itext):
    """Send the augmented input to the chat model and return its reply text.

    Uses the notebook-level `client` and `gpt_model`.

    Args:
        itext: The content to send. Either a single string, which is used
            as-is, or an iterable of strings (lines), which are joined with
            newlines.

    Returns:
        The message content of the first choice in the completion. If the
        request raises, the exception's message string is returned instead,
        so no exception propagates to the caller.
    """
    # A plain string must not go through str.join: join iterates its argument,
    # so a string would get a newline inserted between every character.
    if isinstance(itext, str):
        text_input = itext
    else:
        text_input = '\n'.join(itext)
    prompt = f"Please summarize or elaborate on the following content:\n{text_input}"

    try:
        response = client.chat.completions.create(
            model=gpt_model,
            messages=[
                {"role": "system", "content": "You are a space exploration expert."},
                {"role": "assistant", "content": "You can read the input and answer in detail."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.1  # Fine-tune parameters as needed
        )
        return response.choices[0].message.content
    except Exception as e:
        # Best-effort by design: the error text is returned in place of a reply.
        return str(e)

gpt4_response = call_gpt4_with_full_text(augmented_input)

response_time = time.time() - start_time  # Measure response time
print(f"Response Time: {response_time:.2f} seconds")  # Print response time

print(gpt_model, "Response:", gpt4_response)

Response Time: 8.44 seconds
gpt-4o Response: Space exploration on the Moon and Mars has been a significant focus of human spaceflight and robotic missions. Here's a detailed summary of the key points:

### Moon Exploration
1. **Historical Missions**:
   - **Apollo Missions**: NASA's Apollo program, particularly Apollo 11, marked the first manned Moon landing in 1969. Astronauts like Buzz Aldrin collected core samples and conducted experiments.
   - **Lunar Missions**: Various missions have been conducted to explore the Moon, including robotic landers and orbiters from different countries.

2. **Scientific Goals**:
   - **Geological Studies**: Understanding the Moon's composition, structure, and history.
   - **Resource Utilization**: Investigating the potential for mining resources like Helium-3 and water ice.

3. **Future Plans**:
   - **Artemis Program**: NASA's initiative to return humans to the Moon and establish a sustainable presence by the late 2020s.
   - **International Collab

### Formatted response

In [None]:
import textwrap
import re
from IPython.display import display, Markdown, HTML
import markdown

def print_formatted_response(response):
    """Show `response` as rendered HTML if it looks like markdown, else as wrapped text.

    Args:
        response: The text to display.

    Markdown-looking text is converted with `markdown.markdown` and shown via
    `display(HTML(...))`. Anything else is wrapped to 80 columns and printed
    between two separator lines.
    """
    # Loose heuristics: a single underscore or a "- " anywhere in the text
    # already counts as markdown.
    markdown_markers = (
        r"^#+\s",           # Headers
        r"^\*+",            # Bullet points
        r"\*\*",            # Bold
        r"_",               # Italics
        r"\[.+\]\(.+\)",    # Links
        r"-\s",             # Dashes used for lists
        r"\`\`\`",          # Code blocks
    )

    looks_like_markdown = False
    for marker in markdown_markers:
        if re.search(marker, response, re.MULTILINE):
            looks_like_markdown = True
            break

    if not looks_like_markdown:
        # Plain text: wrap first, then print inside the separator frame.
        wrapped_text = textwrap.fill(response, width=80)
        print("Text Response:")
        print("--------------------")
        print(wrapped_text)
        print("--------------------\n")
        return

    # Markdown detected: convert to HTML so Colab renders it.
    display(HTML(markdown.markdown(response)))

print_formatted_response(gpt4_response)

# Evaluating the output with Cosine Similarity

with initial user prompt

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def calculate_cosine_similarity(text1, text2):
    """Return the cosine similarity of two texts.

    Both texts are vectorized together with a fresh `TfidfVectorizer`, and
    the first row of the result is compared against the second.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The single similarity value (element [0][0] of the similarity matrix).
    """
    tfidf_matrix = TfidfVectorizer().fit_transform([text1, text2])
    first_row, second_row = tfidf_matrix[0:1], tfidf_matrix[1:2]
    return cosine_similarity(first_row, second_row)[0][0]

similarity_score = calculate_cosine_similarity(user_prompt, gpt4_response)

print(f"Cosine Similarity Score: {similarity_score:.3f}")

Cosine Similarity Score: 0.396


with augmented user prompt

In [None]:
similarity_score = calculate_cosine_similarity(augmented_input, gpt4_response)

print(f"Cosine Similarity Score: {similarity_score:.3f}")

Cosine Similarity Score: 0.857


In [None]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

  from tqdm.autonotebook import tqdm, trange


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
def calculate_cosine_similarity_with_embeddings(text1, text2):
    """Return the cosine similarity of two texts using the notebook-level `model`.

    Each text is encoded with `model.encode`, then the two encodings are
    compared with `cosine_similarity`.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The single similarity value (element [0][0] of the similarity matrix).
    """
    vectors = [model.encode(text) for text in (text1, text2)]
    scores = cosine_similarity([vectors[0]], [vectors[1]])
    return scores[0][0]


similarity_score = calculate_cosine_similarity_with_embeddings(augmented_input, gpt4_response)
print(f"Cosine Similarity Score: {similarity_score:.3f}")

Cosine Similarity Score: 0.739
