#Embedding-Based Retrieval with Deep Lake and OpenAI<br>
OpenAI Reasoning Models such as o1-preview can be used for RAG-Driven Generative AI in the ecosystem.

# 1. Installing the environment

*First run the following cells and restart Google Colab session if prompted. Then run the notebook again cell by cell to explore the code.*

In [1]:
try:
  import deeplake
except:
  !pip install deeplake==3.9.18
  import deeplake



In [2]:
#Google Drive option to store API Keys
#Store you key in a file and read it(you can type it directly in the notebook but it will be visible for somebody next to you)
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!pip install openai==1.40.3

Collecting openai==1.40.3
  Downloading openai-1.40.3-py3-none-any.whl.metadata (22 kB)
Downloading openai-1.40.3-py3-none-any.whl (360 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m360.7/360.7 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.52.2
    Uninstalling openai-1.52.2:
      Successfully uninstalled openai-1.52.2
Successfully installed openai-1.40.3


In [4]:
# For Google Colab and Activeloop while waiting for Activeloop (April 2024) pending new version
#This line writes the string "nameserver 8.8.8.8" to the file. This is specifying that the DNS server the system
#should use is at the IP address 8.8.8.8, which is one of Google's Public DNS servers.
with open('/etc/resolv.conf', 'w') as file:
   file.write("nameserver 8.8.8.8")

In [5]:
#Retrieving and setting the OpenAI API key
f = open("drive/MyDrive/files/api_key.txt", "r")
API_KEY=f.readline().strip()
f.close()

#The OpenAI KeyActiveloop and OpenAI API keys
import os
import openai
os.environ['OPENAI_API_KEY'] =API_KEY
openai.api_key = os.getenv("OPENAI_API_KEY")

In [6]:
#Retrieving and setting the Activeloop API token
f = open("drive/MyDrive/files/activeloop.txt", "r")
API_token=f.readline().strip()
f.close()
ACTIVELOOP_TOKEN=API_token
os.environ['ACTIVELOOP_TOKEN'] =ACTIVELOOP_TOKEN

In [7]:
!pip install sentence-transformers==3.0.1

Collecting sentence-transformers==3.0.1
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
  Attempting uninstall: sentence-transformers
    Found existing installation: sentence-transformers 3.2.1
    Uninstalling sentence-transformers-3.2.1:
      Successfully uninstalled sentence-transformers-3.2.1
Successfully installed sentence-transformers-3.0.1


# Retrieval Augmented Generation

### Initiating the query process

**Replace `hub://denis76/space_exploration_v1` by your organization and dataset name**

In [8]:
vector_store_path = "hub://denis76/space_exploration_v1"

In [9]:
from deeplake.core.vectorstore.deeplake_vectorstore import VectorStore
import deeplake.util
ds = deeplake.load(vector_store_path)

|

This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/denis76/space_exploration_v1



/

hub://denis76/space_exploration_v1 loaded successfully.





In [10]:
vector_store = VectorStore(path=vector_store_path)

Deep Lake Dataset in hub://denis76/space_exploration_v1 already exists, loading from the storage


## Input and Query Retrieval

## Input

### Retrieval query

In [11]:
def embedding_function(texts, model="text-embedding-3-small"):
   if isinstance(texts, str):
       texts = [texts]
   texts = [t.replace("\n", " ") for t in texts]
   return [data.embedding for data in openai.embeddings.create(input = texts, model=model).data]

In [12]:
def get_user_prompt():
    # Request user input for the search prompt
    return input("Enter your search query: ")

def search_query(prompt):
    # Assuming `vector_store` and `embedding_function` are already defined
    search_results = vector_store.search(embedding_data=prompt, embedding_function=embedding_function)
    return search_results

# Get the user's search query
#user_prompt = get_user_prompt()
# or enter prompt if it is in a queue
user_prompt="Tell me about space exploration on the Moon and Mars."

# Perform the search
search_results = search_query(user_prompt)

# Print the search results
print(search_results)

{'id': ['741f2712-56fa-11ef-acb7-0242ac1c000c', '741f4d82-56fa-11ef-acb7-0242ac1c000c', '741f35b8-56fa-11ef-acb7-0242ac1c000c', '741f3fea-56fa-11ef-acb7-0242ac1c000c'], 'metadata': [{'source': 'llm.txt'}, {'source': 'llm.txt'}, {'source': 'llm.txt'}, {'source': 'llm.txt'}], 'text': ['Exploration of space, planets, and moons "Space Exploration" redirects here. For the company, see SpaceX . For broader coverage of this topic, see Exploration . Buzz Aldrin taking a core sample of the Moon during the Apollo 11 mission Self-portrait of Curiosity rover on Mars \'s surface Part of a series on Spaceflight History History of spaceflight Space Race Timeline of spaceflight Space probes Lunar missions Mars missions Applications Communications Earth observation Exploration Espionage Military Navigation Settlement Telescopes Tourism Spacecraft Robotic spacecraft Satellite Space probe Cargo spacecraft Crewed spacecraft Apollo Lunar Module Space capsules Space Shuttle Space stations Spaceplanes Vostok

In [13]:
print(user_prompt)

Tell me about space exploration on the Moon and Mars.


In [14]:
# Function to wrap text to a specified width
def wrap_text(text, width=80):
    lines = []
    while len(text) > width:
        split_index = text.rfind(' ', 0, width)
        if split_index == -1:
            split_index = width
        lines.append(text[:split_index])
        text = text[split_index:].strip()
    lines.append(text)
    return '\n'.join(lines)

In [15]:
import textwrap

# Assuming the search results are ordered with the top result first
top_score = search_results['score'][0]
top_text = search_results['text'][0].strip()
top_metadata = search_results['metadata'][0]['source']

# Print the top search result
print("Top Search Result:")
print(f"Score: {top_score}")
print(f"Source: {top_metadata}")
print("Text:")
print(wrap_text(top_text))

Top Search Result:
Score: 0.6017261147499084
Source: llm.txt
Text:
Exploration of space, planets, and moons "Space Exploration" redirects here.
For the company, see SpaceX . For broader coverage of this topic, see
Exploration . Buzz Aldrin taking a core sample of the Moon during the Apollo 11
mission Self-portrait of Curiosity rover on Mars 's surface Part of a series on
Spaceflight History History of spaceflight Space Race Timeline of spaceflight
Space probes Lunar missions Mars missions Applications Communications Earth
observation Exploration Espionage Military Navigation Settlement Telescopes
Tourism Spacecraft Robotic spacecraft Satellite Space probe Cargo spacecraft
Crewed spacecraft Apollo Lunar Module Space capsules Space Shuttle Space
stations Spaceplanes Vostok Space launch Spaceport Launch pad Expendable and
reusable launch vehicles Escape velocity Non-rocket spacelaunch Spaceflight
types Sub-orbital Orbital Interplanetary Interstellar Intergalactic List of
space organizatio

## Augmented Input

In [16]:
augmented_input=user_prompt+" "+top_text

In [17]:
print(augmented_input)

Tell me about space exploration on the Moon and Mars. Exploration of space, planets, and moons "Space Exploration" redirects here. For the company, see SpaceX . For broader coverage of this topic, see Exploration . Buzz Aldrin taking a core sample of the Moon during the Apollo 11 mission Self-portrait of Curiosity rover on Mars 's surface Part of a series on Spaceflight History History of spaceflight Space Race Timeline of spaceflight Space probes Lunar missions Mars missions Applications Communications Earth observation Exploration Espionage Military Navigation Settlement Telescopes Tourism Spacecraft Robotic spacecraft Satellite Space probe Cargo spacecraft Crewed spacecraft Apollo Lunar Module Space capsules Space Shuttle Space stations Spaceplanes Vostok Space launch Spaceport Launch pad Expendable and reusable launch vehicles Escape velocity Non-rocket spacelaunch Spaceflight types Sub-orbital Orbital Interplanetary Interstellar Intergalactic List of space organizations Space agen

# Generation and  output with OpenAI Reasoning models

The ecosystem of this chapter enables the pipeline to perform embedded retrieval, augment the prompt and generate an output with `o1-preview`.

In [36]:
import openai
from openai import OpenAI
import time

client = OpenAI()
gpt_model="o1-preview"
start_time = time.time()  # Start timing before the request

def call_gpt4_with_full_text(itext):
    # Join all lines to form a single string
    text_input = '\n'.join(itext)
    prompt = f"Read the following text as a space exploration expert, then summarize or elaborate on the following content with as much explanation as possibl and different sections:\n{text_input}"


    try:
        response = client.chat.completions.create(
            model=gpt_model,
            messages=[
                {"role": "user", "content": prompt}
            ]
        )
        return response.choices[0].message.content
    except Exception as e:
        return str(e)

gpt4_response = call_gpt4_with_full_text(augmented_input)

response_time = time.time() - start_time  # Measure response time
print(f"Response Time: {response_time:.2f} seconds")  # Print response time

print(gpt_model, "Response:", gpt4_response)

Response Time: 29.52 seconds
o1-preview Response: # Space Exploration on the Moon and Mars

Space exploration has always been a testament to human curiosity and ingenuity. The Moon and Mars, our closest celestial neighbors, have been prime targets for exploration due to their proximity and the wealth of scientific knowledge they promise. This comprehensive overview delves into the history, current missions, future plans, and the challenges associated with exploring these extraterrestrial bodies.

---

## Table of Contents

1. **Introduction**
2. **History of Lunar Exploration**
   - Early Missions
   - The Apollo Program
   - Post-Apollo Era
3. **History of Martian Exploration**
   - Flyby and Orbiter Missions
   - Lander and Rover Missions
4. **Current and Ongoing Missions**
   - Lunar Missions
   - Mars Missions
5. **Future Plans and Programs**
   - Artemis Program
   - Mars Sample Return Missions
   - Human Missions to Mars
6. **Scientific Discoveries and Significance**
   - Lunar S

### Formatted response

In [37]:
import textwrap
import re
from IPython.display import display, Markdown, HTML
import markdown

def print_formatted_response(response):
    # Check for markdown by looking for patterns like headers, bold, lists, etc.
    markdown_patterns = [
        r"^#+\s",           # Headers
        r"^\*+",            # Bullet points
        r"\*\*",            # Bold
        r"_",               # Italics
        r"\[.+\]\(.+\)",    # Links
        r"-\s",             # Dashes used for lists
        r"\`\`\`"           # Code blocks
    ]

    # If any pattern matches, assume the response is in markdown
    if any(re.search(pattern, response, re.MULTILINE) for pattern in markdown_patterns):
        # Markdown detected, convert to HTML for nicer display
        html_output = markdown.markdown(response)
        display(HTML(html_output))  # Use display(HTML()) to render HTML in Colab
    else:
        # No markdown detected, wrap and print as plain text
        wrapper = textwrap.TextWrapper(width=80)
        wrapped_text = wrapper.fill(text=response)

        print("Text Response:")
        print("--------------------")
        print(wrapped_text)
        print("--------------------\n")

print_formatted_response(gpt4_response)

# Evaluating the output with  Cosine Similarity

with initial user prompt

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def calculate_cosine_similarity(text1, text2):
    vectorizer = TfidfVectorizer()
    tfidf = vectorizer.fit_transform([text1, text2])
    similarity = cosine_similarity(tfidf[0:1], tfidf[1:2])
    return similarity[0][0]

similarity_score = calculate_cosine_similarity(user_prompt, gpt4_response)

print(f"Cosine Similarity Score: {similarity_score:.3f}")

Cosine Similarity Score: 0.419


with augmented user prompt

In [34]:
similarity_score = calculate_cosine_similarity(augmented_input, gpt4_response)

print(f"Cosine Similarity Score: {similarity_score:.3f}")

Cosine Similarity Score: 0.453


In [22]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

  from tqdm.autonotebook import tqdm, trange


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [35]:
def calculate_cosine_similarity_with_embeddings(text1, text2):
    embeddings1 = model.encode(text1)
    embeddings2 = model.encode(text2)
    similarity = cosine_similarity([embeddings1], [embeddings2])
    return similarity[0][0]


similarity_score = calculate_cosine_similarity_with_embeddings(augmented_input, gpt4_response)
print(f"Cosine Similarity Score: {similarity_score:.3f}")

Cosine Similarity Score: 0.653
