## [Chapter 15: Foundation Models and Emerging Search Paradigms]
## Generative AI and the Search Frontier

In [2]:
import html
import json
import pickle
import sys
import warnings

import pandas
import spacy
from aips import *
from aips.spark import create_view_from_collection
from IPython.display import HTML, display
from pyspark.sql import SparkSession
from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import ArrayType, FloatType
import numpy

warnings.filterwarnings("ignore") #Some operations warn inside a loop, we'll only need to see the first warning
sys.path.append("..")

engine = get_engine()
outdoors_collection = engine.get_collection("outdoors")
spark = SparkSession.builder.appName("AIPS").getOrCreate()

**Note**: This notebook depends upon the Outdoors dataset. If you have any issues, please rerun the [Setting up the Outdoors Dataset](../ch13/1.setting-up-the-outdoors-dataset.ipynb) notebook.

## Mocked Generative Output

For this notebook, we mock responses from a large generative model.

We encourage you to explore connecting to a generative model API of your choice, and comparing results!

In [13]:
import html
mockedGenerativeResponses = pandas.read_csv("mockedGenerativeResponses.csv")
def get_generative_response(prompt):
    prompt = prompt.replace("""
""", "\r\n")
    prompt=prompt.replace("\n", "\n") #marshall new lines into consistent format
    response = mockedGenerativeResponses.loc[mockedGenerativeResponses["prompt"] == prompt, "response"].values
    if len(response) > 0:
        output = f'''
        <div style="overflow: auto; margin-bottom: 10px;">
          <h3 style="float: left; width: 100px; margin-right: 10px;">Query:</h3>
          <p style="overflow: hidden; margin-left: 110px;"><pre>{html.escape(prompt)}<pre></p>
        </div>

        <div style="overflow: auto; margin-bottom: 10px;">
          <h3 style="float: left; width: 100px; margin-right: 10px;">Response:</h3>
          <p style="overflow: hidden; margin-left: 110px;"><pre>{html.escape(response[0])}</pre></p>
        </div>
        '''
        display(HTML(output))
        return response[0]
    else:
        print(f"\n\nSorry! your prompt does not have a mocked value.")
        return ""

## Retriever from Listing 14.6

See [../ch14/2.ch14-question-answering-CPU-data-preparation.ipynb#Listing-14.6](../ch14/2.ch14-question-answering-CPU-data-preparation.ipynb#Listing-14.6)

In [4]:
nlp = spacy.load("en_core_web_sm")
nlp.remove_pipe("ner")
nlp.add_pipe("merge_noun_chunks")
determiners = "all an another any both del each either every half la many much nary neither no some such that the them these this those".split(" ")
def get_query_from_question(question):
    query = []
    doc = nlp(question)
    for tok in doc:
        if tok.pos_ in ["NOUN","VERB"]:
            query.append(tok.text)
    if not len(query):
        query = [question]
    query = " ".join(query)
    for d in determiners:
        query = query.replace(" "+d+" ","")
    return query

In [5]:
def retriever(question):
    contexts={"id":[],"question":[],"context":[],"url":[]}
    query = get_query_from_question(question)
    request = {
        "query": query,
        "query_fields": ["body"],
        "return_fields": ["id", "url", "body"],
        "filters": [("post_type", "answer")],
        "limit": 5
    }
    docs = outdoors_collection.search(**request)["docs"]
    for doc in docs:
        contexts["id"].append(doc["id"])
        contexts["url"].append(doc["url"])
        contexts["question"].append(question)
        contexts["context"].append(doc["body"])
    return pandas.DataFrame(contexts)

## Listing 15.1

In [14]:
r = get_generative_response("What is a unicorn?")

## Listing 15.2

In [20]:
r = get_generative_response("""You are a chatbot named AIPS chat. 
You have a Ph.D. in biology. 
What is a unicorn?""")

## Listing 15.3

## Listing 15.4

(The prompt and responses for Listings 15.3 and 15.4 are both represented in one cell)

In [21]:
r = get_generative_response("""Web search results:

[1] "A large language model, or LLM, is a deep learning algorithm that 
can recognize, summarize, translate, predict and generate text and other 
content based on knowledge gained from massive datasets."
URL: https://blogs.nvidia.com/blog/2023/01/26/what-are-large-language-models-used-for/

[2] A large language model (LLM) is a language model consisting of a 
neural network with many parameters (typically billions of weights or 
more), trained on large quantities of unlabeled text using 
self-supervised learning. LLMs emerged around 2018 and perform well at a 
wide variety of tasks. This has shifted the focus of natural language 
processing research away from the previous paradigm of training 
specialized supervised models for specific tasks.
URL: https://en.wikipedia.org/wiki/Large_language_model

[3] "In this guide, we'll discuss everything you need to know about 
Large Language Models (LLMs), including key terms, algorithms, 
fine-tuning, and more. As ChatGPT has taken the internet by storm crossing 
1 million users in its first 5 days, you may be wondering what machine 
learning algorithm is running under the hood. While ChatGPT uses a 
specific type of reinforcement learning called "Reinforcement Learning 
from Human Feedback (RLHF)", at a high level it is an example of a 
Large Language Model (LLM)."
URL: https://www.mlq.ai/what-is-a-large-language-model-llm/

Instructions: Using the provided web search results, write a comprehensive 
reply to the given query. Make sure to cite results using 
[[number](URL)] notation after the reference. If the provided search 
results refer to multiple subjects with the same name, write separate 
answers for each subject.

Query: What is a large language model?""")

## Listing 15.5

In [22]:
r = get_generative_response("""Web search results:

[1] "A large language model, or LLM, is a deep learning algorithm that 
can recognize, summarize, translate, predict and generate text and other 
content based on knowledge gained from massive datasets."
URL: https://blogs.nvidia.com/blog/2023/01/26/what-are-large-language-models-used-for/

[2] A large language model (LLM) is a language model consisting of a 
neural network with many parameters (typically billions of weights or 
more), trained on large quantities of unlabeled text using 
self-supervised learning. LLMs emerged around 2018 and perform well at a 
wide variety of tasks. This has shifted the focus of natural language 
processing research away from the previous paradigm of training 
specialized supervised models for specific tasks.
URL: https://en.wikipedia.org/wiki/Large_language_model

[3] "In this guide, we'll discuss everything you need to know about 
Large Language Models (LLMs), including key terms, algorithms, 
fine-tuning, and more. As ChatGPT has taken the internet by storm crossing 
1 million users in its first 5 days, you may be wondering what machine 
learning algorithm is running under the hood. While ChatGPT uses a 
specific type of reinforcement learning called "Reinforcement Learning 
from Human Feedback (RLHF)", at a high level it is an example of a 
Large Language Model (LLM)."
URL: https://www.mlq.ai/what-is-a-large-language-model-llm/

Instructions: Using the provided web search results, write a comprehensive 
reply to the given query. Make sure to cite results using 
[[number](URL)] notation after the reference. If the provided search 
results refer to multiple subjects with the same name, write separate 
answers for each subject.

Query: What is a large language model?  Be concise.""")

## Listing 15.6

In [23]:
summarize_search_prompt = """What are some queries that should find the following documents?  List at least 5 unique queries, where these documents are better than others in an outdoors question and answer dataset.  Be concise and only output the list of queries, and a result number in the format [n] for the best result in the resultset.  Don't print a relevance summary at the end.

### Results:
{resultset}"""
print(summarize_search_prompt)

What are some queries that should find the following documents?  List at least 5 unique queries, where these documents are better than others in an outdoors question and answer dataset.  Be concise and only output the list of queries, and a result number in the format [n] for the best result in the resultset.  Don't print a relevance summary at the end.

### Results:
{resultset}


## Listing 15.7

In [24]:
example_contexts = retriever("What are minimalist shoes?")
resultset = [f"{idx}. {ctx}" for idx, ctx in enumerate(list(example_contexts[0:5]["context"]))]
print("\n".join(resultset))

0. Minimalist shoes or "barefoot" shoes are shoes that provide your feet with some form of protection, but get you as close to a barefoot experience as possible. The styles range from simple ultralight sneakers, to thin almost slipper like shoes, all the way down to stick on shoes: image source image source
1. There was actually a project done on the definition of what a minimalist shoe is and the result was "Footwear providing minimal interference with the natural movement of the foot due to its high flexibility, low heel to toe drop, weight and stack height, and the absence of motion control and stability devices". If you are looking for a simpler definition, this is what Wikipedia says, Minimalist shoes are shoes intended to closely approximate barefoot running conditions. 1 They have reduced cushioning, thin soles, and are of lighter weight than other running shoes, allowing for more sensory contact for the foot on the ground while simultaneously providing the feet with some protec

## Listing 15.8

In [25]:
resultset_text = "\n".join(resultset)
resultset_prompt = summarize_search_prompt.replace("{resultset}", resultset_text)
generated_relevance_judgments = get_generative_response(resultset_prompt)

## Listing 15.9

In [26]:
import re

def extract_pairwise_judgments(text, contexts):
    query_pattern = re.compile(r"\d+\.\s+(.*)")
    result_pattern = re.compile(r"\d+\.\s+\[(\d+)\]")    
    lines = text.split("\n")
    queries = []
    results = []
    for line in lines:
        query_match = query_pattern.match(line)
        result_match = result_pattern.match(line)
        if result_match:
            result_index = int(result_match.group(1))
            results.append(result_index)            
        elif query_match:
            query = query_match.group(1)
            queries.append(query)            
    output = [{"query": query, "relevant_document": contexts[result]["id"]}
              for query, result in zip(queries, results)]
    return output

## Listing 15.10

In [27]:
resultset_contexts = example_contexts.to_dict("records")
extract_pairwise_judgments(generated_relevance_judgments, resultset_contexts)

[{'query': 'What is the definition of a minimalist shoe?\r',
  'relevant_document': '18370'},
 {'query': 'What are the characteristics of minimalist shoes?\r',
  'relevant_document': '18376'},
 {'query': 'Which shoes are best for walking on rocky beaches?\r',
  'relevant_document': '16427'},
 {'query': 'Are minimalist shoes suitable for all terrains?\r',
  'relevant_document': '18375'},
 {'query': 'What are some recommended barefoot shoe brands?\r',
  'relevant_document': '13540'}]

## Listing 15.11

Article source: Alan Morrell, Democrat and Chronicle, May 29, 2023 https://www.democratandchronicle.com/story/money/business/2023/05/29/carvel-is-gone-from-rochester-what-happened-to-the-ice-cream-chain/70252613007/

In [28]:
news_article_snippet = "Walter Silverman of Brighton owned one of the most successful local Carvel franchises, at East Ridge Road and Hudson Avenue in Irondequoit. He started working for Carvel in 1952. This is how it appeared in the late 1970s/early 1980s."
news_article_labelled = """<per>Walter Silverman</per> of <loc>Brighton</loc> owned one of the most successful local <org>Carvel</org> franchises, at <loc>East Ridge Road</loc> and <loc>Hudson Avenue</loc> in <loc>Irondequoit</loc>. He started working for <org>Carvel</org> in 1952. This is how it appeared in the late 1970s/early 1980s."""
print(news_article_labelled)

<per>Walter Silverman</per> of <loc>Brighton</loc> owned one of the most successful local <org>Carvel</org> franchises, at <loc>East Ridge Road</loc> and <loc>Hudson Avenue</loc> in <loc>Irondequoit</loc>. He started working for <org>Carvel</org> in 1952. This is how it appeared in the late 1970s/early 1980s.


## Listing 15.12

In [32]:
def extract_entities(text):
    entities = []
    pattern = r"<(per|loc|org)>(.*?)<\/(per|loc|org)>"
    matches = re.finditer(pattern, text)
    for match in matches:
        entity = {
            "label": match.group(1).upper(),
            "offset": [match.start(), match.end() - 1],
            "text": match.group(2)
        }
        entities.append(entity)
    return entities

In [33]:
#Our manually labelled article
extract_entities(news_article_labelled)

[{'label': 'PER', 'offset': [0, 26], 'text': 'Walter Silverman'},
 {'label': 'LOC', 'offset': [31, 49], 'text': 'Brighton'},
 {'label': 'ORG', 'offset': [90, 106], 'text': 'Carvel'},
 {'label': 'LOC', 'offset': [123, 148], 'text': 'East Ridge Road'},
 {'label': 'LOC', 'offset': [154, 177], 'text': 'Hudson Avenue'},
 {'label': 'LOC', 'offset': [182, 203], 'text': 'Irondequoit'},
 {'label': 'ORG', 'offset': [229, 245], 'text': 'Carvel'}]

## Listing 15.13

In [34]:
entities_prompt = """For a given passage, please identify and mark the following entities: people with the tag '<per>', locations with the tag '<loc>', and organizations with the tag '<org>'. Please repeat the passage below with the appropriate markup.
### {text}"""
entities_prompt_news_article = entities_prompt.replace("{text}",news_article_snippet)
news_article_generated_labelled = get_generative_response(entities_prompt_news_article)
extract_entities(news_article_generated_labelled)

[{'label': 'PER', 'offset': [0, 26], 'text': 'Walter Silverman'},
 {'label': 'LOC', 'offset': [31, 49], 'text': 'Brighton'},
 {'label': 'ORG', 'offset': [90, 106], 'text': 'Carvel'},
 {'label': 'LOC', 'offset': [123, 148], 'text': 'East Ridge Road'},
 {'label': 'LOC', 'offset': [154, 177], 'text': 'Hudson Avenue'},
 {'label': 'LOC', 'offset': [182, 203], 'text': 'Irondequoit'},
 {'label': 'ORG', 'offset': [229, 245], 'text': 'Carvel'}]

## Listing 15.14

In [None]:
! pip install gdown
def open_image_file(movie_id):    
    url = 'https://drive.google.com/uc?id=0B9P1L--7Wd2vNm9zMTJWOGxobkU'
    output = '20150428_collected_images.tgz'
    gdown.download(url, output, quiet=False)

In [None]:
def load_image(movie_id):
    image = open_image()
    
drive.mount('/drive')
movies = pandas.read_csv('/drive/MyDrive/tmdb-images/content/data2.csv')

In [None]:
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

def normalize_embedding(embedding):
    return numpy.divide(embedding,
                        numpy.linalg.norm(embedding,axis=0))
                      
def compute_text_embedding(text):
    inputs = processor(text=[text], return_tensors="pt", padding=True)
    return normalize_embedding(model.get_text_features(**inputs))

def compute_image_embedding(image):
    inputs = processor(images=[image], return_tensors="pt", padding=True)
    return normalize_embedding(model.get_image_features(**inputs))

def compute_movie_embedding(id):
    return normalize_embedding(get_cached_movie_embedding(id))

def index_movies_with_image_embeddings():
    encoder = UserDefinedFunction(compute_movie_embedding,
                                  ArrayType(FloatType()))

    create_view_from_collection(engine.get_collection("tmdb"), "tmdb")
    tmdb_dataframe = spark.sql("""SELECT * FROM tmdb""")
    embeddings_dataframe = tmdb_dataframe.withColumn(
        "image_embedding", encoder(tmdb_dataframe.id))

    collection = engine.create_collection("tmdb_with_embeddings")
    collection.write(embeddings_dataframe)
    return collection

index_movies_with_image_embeddings()

## Listing 15.15

In [None]:
def movie_search(query_embedding, limit=16):
    collection = engine.get_collection("tmdb_with_embeddings")
    request = {
        "query_vector": query_embedding,
        "query_field": "image_embeddings",
        "limit": limit,
        "quantization_size": "FLOAT32"}
    return collection.vector_search(**request)

def text_to_image_search(query):
    query_embedding = compute_text_embedding(query).detach().numpy()
    return movie_search(query_embedding)

def image_to_image_search(image):
    image_embedding = compute_image_embedding(image).detach().numpy()
    return movie_search(image_embedding)

def text_and_image_to_image_search(text_query, image):
    normalized_text_query_embedding = compute_text_embedding(text_query).detach().numpy()
    normalized_image_embedding = compute_image_embedding(image).detach().numpy()
    pooled_embedding = numpy.average(
        [normalized_text_query_embedding,
         normalized_image_embedding], axis=0)
    return movie_search(pooled_embedding)