# FreshPrompt Demo

In [2]:
import pytz
import datetime

# Today's date in US Pacific time, rendered like "November 14, 2023".
# To use another zone, swap the timezone name (e.g. 'Asia/Tokyo').
current_date = datetime.datetime.now(
    tz=pytz.timezone("America/Los_Angeles")
).strftime("%B %d, %Y")
print(current_date)

November 14, 2023


In [5]:
from serpapi import GoogleSearch
import os

def call_search_engine(query):
  """Run `query` through SerpApi's GoogleSearch and return the response dict.

  The request asks for English results ("hl") for the US ("gl") on google.com
  and sends no explicit location. The API key is read from the
  SERPAPI_API_KEY environment variable; a missing variable raises KeyError.

  Args:
    query: The search query string.

  Returns:
    Whatever `GoogleSearch(...).get_dict()` returns for the request.
  """
  request_params = dict(
      q=query,
      hl="en",
      gl="us",
      google_domain="google.com",
      api_key=os.environ["SERPAPI_API_KEY"],
  )
  return GoogleSearch(request_params).get_dict()

In [10]:
# Demonstration questions used to build the few-shot part of the prompt.
# Each one is sent to the search engine further down in this cell.
demo_questions = [
    "What year is considered Albert Einstein's annus mirabilis?",
    "Which photographer took the most expensive photograph in the world?",
    "How many days are left until the 2023 Grammy Awards?",
    "How many years ago did the Boxing Day Tsunami happen?",
    (
        "When did Amazon become the first publicly traded company to exceed a"
        " market value of $3 trillion?"
    ),
]

# One short reasoning-and-answer string per entry of demo_questions, in the
# same order (the two lists are zipped together when the demo prompts are
# built). NOTE(review): relative statements such as "18 years ago" are fixed
# text and do not follow current_date.
concise_demo_reasonings_and_answers = [
    (
        "1905 is considered Albert Einstein's annus mirabilis, his miraculous"
        " year."
    ),
    (
        'The most expensive photograph in the world is "Le Violon d\'Ingres".'
        " The photograph was created by Man Ray."
    ),
    (
        "The 2023 Grammy Awards ceremony was held on February 5, 2023. Thus,"
        " the ceremony has already taken place."
    ),
    (
        "The disaster occurred on December 26, 2004. Thus, it happened 18 years"
        " ago."
    ),
    "Amazon's market capitalization has never exceeded $3 trillion.",
]

# Longer reasoning-and-answer strings; the five entries appear to parallel
# demo_questions in the same order as the concise list.
# NOTE(review): relative statements such as "18 years ago" are fixed text and
# do not follow current_date.
verbose_demo_reasonings_and_answers = [
    (
        "In the year of 1905, Albert Einstein published four groundbreaking"
        " papers that revolutionized scientific understanding of the universe."
        " Thus, scientists call 1905 Albert Einstein's annus mirabilis — his"
        " year of miracles."
    ),
    (
        "Man Ray's famed \"Le Violon d'Ingres\" became the most expensive"
        " photograph ever to sell at auction, sold for $12.4 million on May"
        " 14th, 2022 at Christie's New York. The black and white image, taken"
        " in 1924 by the American surrealist artist, transforms a woman's naked"
        " body into a violin by overlaying the picture of her back with"
        " f-holes. Thus, Man Ray is the photographer who took the most"
        " expensive photograph in the world."
    ),
    (
        "The 2023 Grammy Awards, officially known as the 65th Annual Grammy"
        " Awards ceremony, was held in Los Angeles on February 5, 2023. Thus,"
        " the event has already taken place."
    ),
    (
        "The Boxing Day Tsunami refers to the 2004 Indian Ocean earthquake and"
        " tsunami, which is one of the deadliest natural disasters in recorded"
        " history, killing an estimated 230,000 people across 14 countries. The"
        " disaster occurred on December 26, 2004, which is 18 years ago."
    ),
    (
        "Amazon's market capitalization hit a peak of roughly $1.9 trillion in"
        " July 2021. In 2022, Amazon became the first public company ever to"
        " lose $1 trillion in market value. Thus, Amazon's market value has"
        " never exceeded $3 trillion. In fact, Apple became the first publicly"
        " traded U.S. company to exceed a market value of $3 trillion in"
        " January 2022."
    ),
]

# Lead-in placed in front of every demonstration answer; it embeds today's date.
prefix = (
    f"\nanswer: As of today {current_date}, the most up-to-date and relevant"
    " information regarding this query is as follows. "
)

# Rebind both answer lists to their prefixed versions.
concise_demo_reasonings_and_answers = [
    f"{prefix}{answer}" for answer in concise_demo_reasonings_and_answers
]
verbose_demo_reasonings_and_answers = [
    f"{prefix}{answer}" for answer in verbose_demo_reasonings_and_answers
]

# One search-engine call per demonstration question, in question order.
demo_search_data = list(map(call_search_engine, demo_questions))

In [7]:
import dateutil
import re

def is_date(string, fuzzy=False):
  """Report whether `string` can be parsed as a date by dateutil.

  Args:
    string: Text to test.
    fuzzy: Forwarded to `dateutil.parser.parse`.

  Returns:
    True when parsing succeeds, False when dateutil rejects the text.
  """
  # Import the submodule explicitly: a bare `import dateutil` does not
  # guarantee that `dateutil.parser` is reachable as an attribute.
  from dateutil import parser as date_parser

  try:
    date_parser.parse(string, fuzzy=fuzzy)
    return True
  except (ValueError, OverflowError):
    # OverflowError is documented for values too large to fit in a date
    return False


def format_date(d):
  """Standardize the date format for each search result.

  Relative dates ("5 hours ago", "3 days ago") are resolved against the
  module-level `current_date`; any other text is handed to dateutil's fuzzy
  parser.

  Args:
    d: Raw date value from a search result, or None.

  Returns:
    The date formatted as "%b %d, %Y" (e.g. "Nov 14, 2023"), or None when `d`
    is None or no date can be extracted from it.
  """
  if d is None:
    return None

  # Import the submodule explicitly: a bare `import dateutil` does not
  # guarantee that `dateutil.parser` is reachable as an attribute.
  from dateutil import parser as date_parser

  d = str(d)  # tolerate non-string values instead of failing on `in`
  out_format = "%b %d, %Y"
  today = date_parser.parse(current_date, fuzzy=True)

  # Anything less than a day old is dated today
  for unit in ["second", "minute", "hour"]:
    if f"{unit} ago" in d or f"{unit}s ago" in d:
      return today.strftime(out_format)

  if "day ago" in d or "days ago" in d:
    match = re.search(r"(\d+) days? ago", d)
    # Without a numeric count (e.g. "a day ago") fall through to the generic
    # parser instead of failing on a missing match.
    if match is not None:
      elapsed = datetime.timedelta(days=int(match.group(1)))
      return (today - elapsed).strftime(out_format)

  try:
    return date_parser.parse(d, fuzzy=True).strftime(out_format)
  except (ValueError, OverflowError):
    # Last resort: the first whitespace-separated token that is a date itself
    for token in d.split():
      if is_date(token):
        return date_parser.parse(token, fuzzy=True).strftime(out_format)
  return None


def extract_source_webpage(link):
  """Return the site part of `link`: no scheme, no leading "www.", no path.

  Args:
    link: A URL or URL-like string.

  Returns:
    The text before the first "/" once the scheme/"www." prefixes are removed.
  """
  site = link.strip()
  # Order matters: the "www." variants must be removed before the bare schemes
  for marker in ("https://www.", "http://www.", "https://", "http://"):
    site = site.replace(marker, "")
  return site.split("/")[0]


def simplify_displayed_link(displayed_link):
  """Reduce a displayed link to its site name.

  Only the text before the first ' › ' separator is kept, and that text is
  passed through `extract_source_webpage`.

  Args:
    displayed_link: The displayed link string, or None.

  Returns:
    The simplified site name, or None when `displayed_link` is None.
  """
  if displayed_link is not None:
    leading_part, _, _ = displayed_link.partition(' › ')
    return extract_source_webpage(leading_part)
  return None

def format_search_results(search_data, title_field=None, highlight_field=None):
  """Standardize search results as shown in Figure 3 (left) in the paper.

  Args:
    search_data: Dict describing one search result. It is not modified.
    title_field: Key to read the title from instead of 'title'. Ignored for
      results whose 'type' is 'local_time' or 'population_result'.
    highlight_field: Key to read the highlight from instead of
      'snippet_highlighted_words'. Ignored for the same two result types.

  Returns:
    Dict with the keys 'source', 'date', 'title', 'snippet' and 'highlight'.

  Raises:
    AssertionError: If 'list' or the 'table' under 'contents' is present but
      is not a list.
  """
  # Shallow copy: two fields are rewritten below, and the caller's dict must
  # not change as a side effect of formatting.
  search_data = dict(search_data)

  field = 'snippet_highlighted_words'
  if isinstance(search_data.get(field), list):
    search_data[field] = ' | '.join(search_data[field])

  field = 'displayed_link'
  if field in search_data:
    search_data[field] = simplify_displayed_link(search_data[field])

  result_type = search_data.get('type')

  # edge case 1
  if result_type == 'local_time':
    source = search_data.get('displayed_link')
    date = format_date(search_data.get('date'))
    title = search_data.get('title')

    snippet = search_data.get('snippet')
    if snippet is None and 'result' in search_data:
      if isinstance(search_data.get('extensions'), list):
        snippet = '\n\t'.join(
            [search_data['result']] + search_data['extensions']
        )
      else:
        snippet = search_data['result']

    highlight = search_data.get('snippet_highlighted_words')
    if highlight is None and 'result' in search_data:
      highlight = search_data['result']

  # edge case 2
  elif result_type == 'population_result':
    source = search_data.get('displayed_link')
    if source is None and 'sources' in search_data:
      sources = search_data['sources']
      # Check for an empty list before looking at its first entry
      if isinstance(sources, list) and sources and 'link' in sources[0]:
        source = extract_source_webpage(sources[0]['link'])

    date = format_date(search_data.get('date'))
    if date is None and 'year' in search_data:
      date = format_date(search_data['year'])

    title = search_data.get('title')

    snippet = search_data.get('snippet')
    if snippet is None and 'population' in search_data:
      if 'place' in search_data:
        snippet = '\n\t'.join(
            [
                f"{search_data['place']} / Population",
                search_data['population'],
            ]
        )
      else:
        snippet = search_data['population']

    highlight = search_data.get('snippet_highlighted_words')
    if highlight is None and 'population' in search_data:
      highlight = search_data['population']

  else:
    source = search_data.get('displayed_link')
    date = format_date(search_data.get('date'))
    title = (
        search_data.get('title')
        if title_field is None
        else search_data.get(title_field)
    )
    highlight = (
        search_data.get('snippet_highlighted_words')
        if highlight_field is None
        else search_data.get(highlight_field)
    )
    # `or ''` also covers an explicit None value, which would otherwise break
    # the joins and the `+=` below.
    snippet = search_data.get('snippet') or ''

    if 'rich_snippet' in search_data:
      for key in ['top', 'bottom']:
        if (
            key in search_data['rich_snippet']
            and 'extensions' in search_data['rich_snippet'][key]
        ):
          snippet = '\n\t'.join(
              [snippet] + search_data['rich_snippet'][key]['extensions']
          )

    if 'list' in search_data:
      assert isinstance(search_data['list'], list)
      snippet = '\n\t'.join([snippet] + search_data['list'])

    if 'contents' in search_data and 'table' in search_data['contents']:
      tbl = search_data['contents']['table']
      assert isinstance(tbl, list)
      # One comma-separated line per table row, after a blank separator line
      snippet += '\n'
      for row in tbl:
        snippet += f'\n{",".join(row)}'

    # An empty snippet is reported as missing
    if snippet.strip() == '':
      snippet = None

  return {
      'source': source,
      'date': date,
      'title': title,
      'snippet': snippet,
      'highlight': highlight,
  }

In [12]:
def format_knowledge_graph(search_data):
  """Standardize knowledge graphs as shown in Figure 3 (left) in the paper.

  Args:
    search_data: The knowledge graph dict (may be empty).

  Returns:
    Dict with the keys 'source', 'date', 'title', 'snippet' and 'highlight';
    'date' and 'highlight' are always None.
  """
  has_source_link = "source" in search_data and "link" in search_data["source"]
  source = (
      extract_source_webpage(search_data["source"]["link"])
      if has_source_link
      else None
  )

  # Title, with the entity type on an indented second line when present
  if "title" in search_data:
    title = search_data["title"]
    if "type" in search_data:
      title += f"\n\t{search_data['type']}"
  else:
    title = None

  # Every plain-text attribute becomes a "key: value" line; identifiers,
  # link/stick helper fields and URL values are left out.
  skipped_keys = ("title", "type", "kgmid")
  attribute_lines = [
      f"\n\t{key}: {value}"
      for key, value in search_data.items()
      if key not in skipped_keys
      and "_link" not in key
      and "_stick" not in key
      and isinstance(value, str)
      and not value.startswith("http")
  ]
  snippet = "".join(attribute_lines).strip() or None

  return dict(
      source=source,
      date=None,
      title=title,
      snippet=snippet,
      highlight=None,
  )


def format_questions_and_answers(search_data):
  """Standardize questions and answers as shown in Figure 3 (left) in the paper.

  Args:
    search_data: One questions-and-answers entry (may be empty).

  Returns:
    Dict with the keys 'source', 'date', 'title', 'snippet' and 'highlight':
    the question is the title, the answer is the snippet, and 'date' and
    'highlight' are always None.
  """
  source = (
      extract_source_webpage(search_data["link"])
      if "link" in search_data
      else None
  )
  return dict(
      source=source,
      date=None,
      title=search_data.get("question"),
      snippet=search_data.get("answer"),
      highlight=None,
  )



In [8]:
import pandas as pd

# Evidence limits; freshprompt_format reads these as module-level settings.
num_organic_results = 15  # organic results considered per query
num_related_questions = 3  # related questions considered per query
num_questions_and_answers = 3  # questions-and-answers entries considered per query
num_retrieved_evidences = 15  # evidences kept overall after sorting by date

def freshprompt_format(
    question,
    search_data,
    reasoning_and_answer,
):
  """Build FreshPrompt for each question

  How many evidences are considered is controlled by the module-level settings
  `num_organic_results`, `num_related_questions`, `num_questions_and_answers`
  and `num_retrieved_evidences`; they are not parameters of this function.

  Args:
    question: The question to process.
    search_data: Search data. The keys read here are 'organic_results',
      'related_questions', 'questions_and_answers', 'knowledge_graph' and
      'answer_box'; each of them may be absent.
    reasoning_and_answer: The reasoning and answer, appended directly after
      the final `question:` line.

  Returns:
    A prompt that incorporates retrieved evidences for each question.
  """
  columns = ['source', 'date', 'title', 'snippet', 'highlight']

  # Collect every evidence first and build the frame once. Missing entries are
  # not padded with empty rows, because all-empty rows are dropped below
  # anyway. Each group is stored in reverse rank order.
  top_organic = (search_data.get('organic_results') or [])[
      :num_organic_results
  ]
  top_related = (search_data.get('related_questions') or [])[
      :num_related_questions
  ]
  top_qa = (search_data.get('questions_and_answers') or [])[
      :num_questions_and_answers
  ]

  records = []
  # Organic results
  records.extend(reversed([format_search_results(r) for r in top_organic]))
  # Related questions
  records.extend(
      reversed(
          [format_search_results(r, title_field='question') for r in top_related]
      )
  )
  # Questions and Answers
  records.extend(reversed([format_questions_and_answers(r) for r in top_qa]))
  # Knowledge graph
  records.append(format_knowledge_graph(search_data.get('knowledge_graph', {})))
  # Answer box
  records.append(
      format_search_results(
          search_data.get('answer_box', {}), highlight_field='answer'
      )
  )

  # dtype=object keeps missing values as None so they print as "None"
  df = pd.DataFrame(records, columns=columns, dtype=object)

  # Sort by date; undated evidences go first. A stable sort keeps the
  # collection order among evidences that share a date.
  df['date'] = df['date'].apply(format_date)
  df['datetime'] = pd.to_datetime(df['date'], errors='coerce')
  df = df.sort_values(by='datetime', na_position='first', kind='mergesort')
  df = df.replace({pd.NaT: None})
  df = df.dropna(how='all')

  # Select top_k supporting evidences overall
  evidences = [
      f"\n\nsource: {row['source']}"
      f"\ndate: {row['date']}"
      f"\ntitle: {row['title']}"
      f"\nsnippet: {row['snippet']}"
      f"\nhighlight: {row['highlight']}"
      for _, row in df.tail(num_retrieved_evidences).iterrows()
  ]

  return (
      ''.join(
          [
              f'\n\n\nquery: {question}',
          ]
          + evidences
      )
      + f'\n\nquestion: {question}{reasoning_and_answer}'
  )

In [13]:
# Render one FreshPrompt block per (question, search data, concise answer)
# triple, then glue the blocks into a single demonstration string.
demo_prompts = []
for q, s, ra in zip(
    demo_questions, demo_search_data, concise_demo_reasonings_and_answers
):
    demo_prompts.append(freshprompt_format(q, s, ra))

freshprompt_demo = ''.join(demo_prompts).strip()

In [14]:
# Show the assembled demonstration prompt; print renders its line breaks.
print(freshprompt_demo)

query: What year is considered Albert Einstein's annus mirabilis?

source: philsci-archive.pitt.edu
date: None
title: The Turning Point for Einstein's Annus Mirabilis
snippet: The year 1905 has been called Einstein's annus mirabilis in virtue of three ground-breaking works completed over the span of a few months — the light.
highlight: 1905

source: cantorsparadise.com
date: None
title: Einstein's Miraculous Year: A Summary of the 1905 Annus ...
snippet: These are the four papers that Albert Einstein published in 1905, which are considered to be the foundation of modern physics.
highlight: 1905

source: quora.com
date: None
title: Why is 1905 Einstein's miracle year?
snippet: Thus, 1905 is called Einstein's annus mirabilis, or miracle year, and these papers are called his annus mirabilis papers .
highlight: 1905

source: en.wikipedia.org
date: None
title: Annus mirabilis papers
snippet: The annus mirabilis papers are the four papers that Albert Einstein published in Annalen der Physik 

In [15]:
# Write the demonstration prompt to disk.
# The output folder is created first so a fresh checkout does not fail here.
os.makedirs("./data", exist_ok=True)
# The prompt contains non-ASCII characters (search snippets, " › " separators),
# so the encoding is fixed instead of depending on the platform default.
with open("./data/freshprompt_demo.txt", "w", encoding="utf-8") as f:
    f.write(freshprompt_demo)