## Semantic Search with Dense Vector Embeddings

In [None]:
import os
import pickle
import numpy
import pandas
from aips import *
from aips.spark import create_view_from_collection, get_spark_session
import aips.indexer
from aips.data_loaders.outdoors import load_dataframe
import sentence_transformers
import torch

# Select OpenSearch as the active engine, then fetch the engine handle that
# the indexing and search cells below go through.
aips.set_engine("opensearch")
engine = get_engine()
# Spark session used later for spark.sql(...) and spark.createDataFrame(...).
spark = get_spark_session()
# Ask the indexer to build the "outdoors" collection on the selected engine.
outdoors_collection = aips.indexer.build_collection(engine, "outdoors")

## Load and clean the Outdoors dataset

## Transformer time!

In [None]:
from sentence_transformers import SentenceTransformer
# Sentence-embedding model shared by every transformer.encode() call below
# (document titles in get_embeddings, queries in semantic_search_with_engine).
transformer = SentenceTransformer("roberta-base-nli-stsb-mean-tokens")
# Cache key for the title embeddings; get_embeddings() maps it to
# data/embeddings/<cache_name>.pickle.
cache_name = "outdoors_semantic_search_embeddings"

In [None]:
def get_embeddings(texts, cache_name, ignore_cache=False):
    """Return embeddings for ``texts``, backed by an on-disk pickle cache.

    The cache file is ``data/embeddings/<cache_name>.pickle``.

    * If that file exists and ``ignore_cache`` is false, its contents are
      unpickled and returned unchanged; ``texts`` is not consulted, so the
      caller is responsible for using a cache name that matches the texts.
    * Otherwise ``texts`` are encoded with the module-level ``transformer``,
      the result is pickled to the cache file (creating the directory when
      needed), and then returned.

    NOTE: ``pickle.load`` can execute arbitrary code stored in the file, so
    only point this at cache files produced by this notebook.
    """
    cache_path = f"data/embeddings/{cache_name}.pickle"

    # Cache hit: nothing to compute.
    if not ignore_cache and os.path.isfile(cache_path):
        with open(cache_path, "rb") as cache_file:
            return pickle.load(cache_file)

    # Cache miss (or forced refresh): encode, persist, return.
    vectors = transformer.encode(texts)
    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    with open(cache_path, "wb") as cache_file:
        pickle.dump(vectors, cache_file)
    return vectors

In [None]:
def rank_similarities(phrases, similarities, name=None):
    """Rank every unordered pair of phrases by its similarity score.

    ``similarities`` is indexed as ``similarities[a][b]``; only the pairs with
    ``a < b`` (the upper triangle, diagonal excluded) are reported, so each
    pair appears once and no phrase is compared with itself. ``phrases[i]``
    is the text that belongs to row/column ``i``.

    Returns a DataFrame sorted by descending score with the columns
    ``idx`` (rank position, starting at 0), ``score``, ``phrase a`` and
    ``phrase b``. The ``name`` parameter is accepted but not used.
    """
    count = len(similarities)
    index_pairs = [(first, second)
                   for first in range(count - 1)
                   for second in range(first + 1, count)]

    table = pandas.DataFrame({
        "score": [float(similarities[first][second])
                  for first, second in index_pairs],
        "phrase a": [phrases[first] for first, _ in index_pairs],
        "phrase b": [phrases[second] for _, second in index_pairs],
    })

    ranked = table.sort_values(by=["score"], ascending=False,
                               ignore_index=True)
    # After ignore_index=True the index is 0..n-1 in rank order.
    ranked["idx"] = ranked.index
    return ranked[["idx", "score", "phrase a", "phrase b"]]

## Listing 13.16

In [None]:
# Load the posts, keep only non-empty titles, and embed them (cached on disk).
outdoors_dataframe = load_dataframe("data/outdoors/posts.csv")
titles = [title
          for title in outdoors_dataframe.rdd.map(lambda row: row.title).collect()
          if title]
embeddings = get_embeddings(titles, cache_name)

print(f"Number of embeddings: {len(embeddings)}")
print(f"Dimensions per embedding: {len(embeddings[0])}")

### Explore the top similarities for the titles

In [None]:
def normalize_embedding(embedding):
    """Scale ``embedding`` to unit (L2) length and return it as a list of floats.

    The vector is divided element-wise by its Euclidean norm, so the dot
    product of two normalized vectors equals their cosine similarity. A
    plain ``list`` of Python ``float`` values is returned.
    """
    length = numpy.linalg.norm(embedding)
    unit_vector = numpy.divide(embedding, length)
    return [float(component) for component in unit_vector]

In [None]:
# Find the title pairs with the highest dot-product scores. The vectors are
# unit-normalized first; only the first 100 titles are compared.
normalized_embeddings = [normalize_embedding(vector) for vector in embeddings]
similarities = sentence_transformers.util.dot_score(
    normalized_embeddings[0:100],
    normalized_embeddings[0:100])

# Show the ten best-scoring pairs.
comparisons = rank_similarities(titles, similarities)
display(HTML(comparisons[:10].to_html(index=False)))

## Listing 13.19
Perform a vector search using our configured search engine.

In [None]:
# Switch to a separate cache file: index_outdoor_title_embeddings() embeds the
# titles selected by its SQL query (title IS NOT NULL), which is a different
# list from the filtered titles cached under the previous cache_name.
cache_name = "all_outdoors_title_embeddings"

def display_results(query, search_results):
    """Show a heading for ``query`` and print one line per search result.

    Each element of ``search_results`` must provide ``"title"``, ``"body"``
    and ``"score"`` keys. Lines are printed as ``<score> | <title> <body>``
    with the score truncated (not rounded) to three decimal places.
    """
    display(HTML(f"<h4>Results for: <em>{query}</em></h4>"))
    rows = [(doc["title"], doc["body"], doc["score"]) for doc in search_results]
    for title, body, score in rows:
        truncated_score = int(score * 1000) / 1000
        print(str(truncated_score), "|", title, body)
    
def index_outdoor_title_embeddings():
    """Embed the outdoors titles and write them to a new engine collection.

    Steps:
      1. Expose the "outdoors" collection as a Spark view and select the
         ``id``, ``title`` and ``body`` of every row with a non-null title.
      2. Compute (or load from the ``cache_name`` pickle cache) one embedding
         per title and normalize each to unit length.
      3. Write ``id``, ``title``, ``body`` and ``title_embedding`` to the
         "outdoors_with_embeddings" collection.

    Returns:
        The collection object returned by ``engine.create_collection``.
    """
    create_view_from_collection(engine.get_collection("outdoors"),
                                "outdoors")
    outdoors_dataframe = spark.sql("""SELECT id, title, body FROM outdoors
                                      WHERE title IS NOT NULL""")

    # Collect the rows once. Running a separate Spark job per column (plus a
    # count) re-evaluates the query each time and relies on every job
    # returning rows in the same order; a single collect() keeps id, title
    # and body taken from the same row.
    rows = outdoors_dataframe.collect()
    print(f"Calculating embeddings for {len(rows)} docs.")
    ids = [row.id for row in rows]
    titles = [row.title for row in rows]
    bodies = [row.body for row in rows]

    raw_embeddings = get_embeddings(titles, cache_name)
    if len(raw_embeddings) != len(titles):
        # A cache written for a different set of titles would otherwise be
        # silently truncated/misaligned by zip() below.
        print(f"Cached embeddings ({len(raw_embeddings)}) do not match "
              f"{len(titles)} titles; recomputing.")
        raw_embeddings = get_embeddings(titles, cache_name, ignore_cache=True)
    embeddings = [normalize_embedding(vector) for vector in raw_embeddings]

    embeddings_dataframe = spark.createDataFrame(
        list(zip(ids, titles, bodies, embeddings)),
        schema=["id", "title", "body", "title_embedding"])

    collection = engine.create_collection("outdoors_with_embeddings")
    print(f"Writing {embeddings_dataframe.count()} docs to \"{collection.name}\" collection")
    collection.write(embeddings_dataframe)
    return collection
        
def semantic_search_with_engine(collection, query, limit=10):
    """Run a vector search for ``query`` against ``collection``.

    The query text is encoded with the module-level ``transformer``,
    normalized to unit length, and matched against the ``title_embedding``
    field. At most ``limit`` documents are requested.

    Returns:
        The ``"docs"`` entry of the search response.
    """
    unit_query_vector = normalize_embedding(transformer.encode(query))
    response = collection.search(
        query=unit_query_vector,
        query_fields=["title_embedding"],
        return_fields=["title", "body", "score", "title_embedding"],
        quantization_size="FLOAT32",
        limit=limit,
    )
    return response["docs"]

In [None]:
# Build the "outdoors_with_embeddings" collection and keep a handle to it for
# the search cells below.
embeddings_collection = index_outdoor_title_embeddings()

In [None]:
# Example: embed the question, search the title embeddings, print the hits.
query = "what are minimal shoes?"
search_results = semantic_search_with_engine(embeddings_collection, query)
display_results(query, search_results)

# Retrieval-Augmented Generation (RAG)

In [None]:
# Install pinned versions of the two packages imported by the RAG cells below.
!pip install openai==2.8 markdown2==2.5.4

In [None]:
import markdown2
import openai
# Read the key from the OPENAI_API_KEY environment variable so a real key
# never has to be pasted into (and saved with) the notebook; the placeholder
# is kept as the fallback when the variable is not set.
gpt = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "YOUR_KEY_HERE"))

In [None]:
def get_prompt(query,search_results,k=5):
    """Build the summarization prompt for ``query`` from the top ``k`` results.

    Each of the first ``k`` entries of ``search_results`` (dicts with
    ``"title"`` and ``"body"`` keys) is rendered as ``[n] title: body`` with
    ``n`` starting at 1, so the model can cite results as ``[1]``, ``[2]``...

    Returns:
        The complete prompt text, ending in ``ANSWER:``.
    """
    # Join the entries into plain text. Interpolating the list itself would
    # put its Python repr (brackets, quotes, escaped "\n") into the prompt.
    context = "\n\n".join(
        f"[{idx}] {r['title']}: {r['body']}"
        for idx, r in enumerate(search_results[:k], start=1))

    return f"""# Instructions

For the given user query and search results, create a helpful summary of the results relevant to the query.
    
## User Query: {query}

## Search Results:
{context}

## Summary Generation :
- Generate a comprehensive summary of the user's query topic using the provided search results.
- Use the reference tags (e.g., [1], [2]) to cite specific information from the search results in the summary.
- Ensure all information is cross-referenced for consistency. Avoid including contradictory statements.
- Prioritize factual accuracy, grounding the summary in the content of the provided search results.
- Structure the summary with an introductory overview, detailed exploration of key points, and a concluding statement.

Please create a summary following these guidelines to ensure consistency and accuracy.

ANSWER:""".strip()

In [None]:
def RAG(query,embeddings_collection,k=5):
    """Search for ``query``, summarize the top ``k`` hits with GPT, and display both.

    The same ``k`` results are used for the prompt and for the rendered list,
    so the ``[n]`` citations in the summary line up with the numbered items.
    The HTML is rendered in the notebook with ``display``; nothing is
    returned.
    """
    # Request at least k hits (10 is the search default) and keep the top k.
    search_results = semantic_search_with_engine(embeddings_collection, query,
                                                 limit=max(k, 10))
    top_results = search_results[:k]

    # Build the prompt from exactly the results that will be displayed.
    prompt = get_prompt(query, top_results, k)

    # Get the summary from OpenAI with the prompt.
    gpt_res = gpt.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        model="gpt-4o-mini",
        temperature=0
    )

    # The model answers in Markdown; convert it for HTML display.
    summary = gpt_res.choices[0].message.content
    summary_html = markdown2.markdown(summary)

    # Show the summary and results with some HTML.
    html_parts = [
        f'<div style="color:#339;border:1px solid #333;"><h3>Summary by GPT-4o-mini</h3>{summary_html}</div>',
        f"<h4>Showing {len(top_results)} Results for <em>{query}</em></h4><ol>",
    ]

    for result in top_results:
        # The search requests the "score" field; "_score" is kept as a
        # fallback for result dicts that use that key instead.
        score = result.get("score", result.get("_score"))
        title = result.get("title", "No title")
        body = result.get("body", None)
        snippet = body if body else title[:140]+"..."

        # Format each result as an HTML list item.
        html_parts.append(f'<li><b>{title}</b>({score})<br>{snippet}</li>')

    html_parts.append("</ol>")

    # Display the HTML in the Jupyter Notebook.
    display(HTML("".join(html_parts)))

In [None]:
# Example run: summarize and list the top hits for a hiking-footwear query.
RAG("ideal footwear for hikes",embeddings_collection)