# Fresh Prompt
- [freshllms/freshqa](https://github.com/freshllms/freshqa)

## セットアップ

In [4]:
from openai import OpenAI
import os
from dotenv import load_dotenv

# Load settings through python-dotenv before the environment is read below
# (presumably from a local .env file -- confirm against the project layout).
load_dotenv(verbose=True)

# os.environ[...] raises KeyError right here when the key is missing.
openai_client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

In [13]:
import pytz
import datetime

# "Today" rendered like "November 14, 2023". The zone is US Pacific because the
# primed assistant turn in chat_completions states "Pacific Standard Time";
# use pytz.timezone('Asia/Tokyo') here (and adjust that sentence) for Japan.
current_date = format(
    datetime.datetime.now(pytz.timezone("America/Los_Angeles")),
    "%B %d, %Y",
)
print(current_date)

November 14, 2023


In [14]:
# Sampling settings read by chat_completions at call time.
temperature = 0.0
max_tokens = 256

def chat_completions(prompt):
    """Send `prompt` to the chat model and return the first choice's text.

    The conversation is primed with a system message that states
    `current_date` as the knowledge cutoff, followed by a short
    user/assistant exchange restating that date, and finally `prompt` as the
    last user turn.

    Sampling is controlled by the module-level `temperature` and
    `max_tokens` values, which are looked up on every call, so changing them
    in the notebook takes effect without redefining this function.

    Args:
        prompt: Text sent as the final user message.

    Returns:
        `response.choices[0].message.content` of the API response.
    """
    response = openai_client.chat.completions.create(
        model="gpt-4-1106-preview",
        # Use the notebook-level settings instead of repeating the literals.
        temperature=temperature,
        max_tokens=max_tokens,
        messages=[
            {
                "role": "system",
                "content": (
                    "You are a helpful assistant. Answer as concisely as"
                    f" possible. Knowledge cutoff: {current_date}."
                ),
            },
            {"role": "user", "content": "What's today's date?"},
            {
                "role": "assistant",
                "content": f"Today is {current_date} in Pacific Standard Time.",
            },
            {"role": "user", "content": prompt},
        ],
    )
    return response.choices[0].message.content


In [18]:
from serpapi import GoogleSearch

def call_search_engine(query):
  """Run `query` through SerpApi's GoogleSearch and return `get_dict()`'s result.

  The request asks for English results (`hl`) for the US (`gl`) on google.com
  and authenticates with the SERPAPI_API_KEY environment variable; a missing
  variable raises KeyError. No "location" parameter is sent.
  """
  search = GoogleSearch(
      dict(
          q=query,
          hl="en",
          gl="us",
          google_domain="google.com",
          api_key=os.environ["SERPAPI_API_KEY"],
      )
  )
  return search.get_dict()

In [21]:
import re

import dateutil
import dateutil.parser

def is_date(string, fuzzy=False):
  """Return True when dateutil can parse `string` as a date, else False.

  Args:
    string: Candidate date text.
    fuzzy: Forwarded to `dateutil.parser.parse`.

  Returns:
    True if parsing succeeds; False if the parser raises ValueError or
    OverflowError.
  """
  try:
    dateutil.parser.parse(string, fuzzy=fuzzy)
  except (ValueError, OverflowError):
    # OverflowError is raised for numeric tokens too large to form a date;
    # such strings are simply "not a date" for our purposes.
    return False
  return True


def format_date(d):
  """Standardize the date of a search result to the "%b %d, %Y" format.

  Relative phrases are resolved against the module-level `current_date`:
  "... second(s)/minute(s)/hour(s) ago" maps to the current date and
  "N day(s) ago" to the current date minus N days. Anything else is handed to
  dateutil's fuzzy parser; if that fails, the first whitespace-separated token
  that parses on its own is used.

  Args:
    d: Date text from a search result, or None. Non-string values are
      converted with `str()` before matching.

  Returns:
    The date formatted like "Oct 03, 2023", or None when `d` is None or no
    date can be extracted.
  """
  if d is None:
    return None
  if not isinstance(d, str):
    # e.g. a numeric year; the substring checks below need text.
    d = str(d)

  out_format = "%b %d, %Y"
  today = dateutil.parser.parse(current_date, fuzzy=True)

  # Anything younger than a day is reported as the current date.
  if any(
      f"{unit} ago" in d or f"{unit}s ago" in d
      for unit in ("second", "minute", "hour")
  ):
    return today.strftime(out_format)

  if "day ago" in d or "days ago" in d:
    match = re.search(r"(\d+) days? ago", d)
    # A phrase without a numeric count (e.g. "a day ago") counts as one day.
    n_days = int(match.group(1)) if match else 1
    return (today - datetime.timedelta(days=n_days)).strftime(out_format)

  # NOTE(review): "N weeks/months/years ago" is not handled specially and is
  # passed to the fuzzy parser below -- confirm whether such inputs occur.
  try:
    return dateutil.parser.parse(d, fuzzy=True).strftime(out_format)
  except (ValueError, OverflowError):
    pass

  # Last resort: return the first single token that is a date by itself.
  for token in d.split():
    try:
      return dateutil.parser.parse(token).strftime(out_format)
    except (ValueError, OverflowError):
      continue
  return None


def extract_source_webpage(link):
  """Return the host part of `link`.

  Surrounding whitespace is trimmed, every occurrence of an http(s) scheme
  (with or without a following "www.") is removed, and whatever precedes the
  first "/" is returned. A "www." that is not preceded by a scheme is kept.
  """
  host = link.strip()
  # The "www." variants must be removed before the bare schemes.
  for marker in ("https://www.", "http://www.", "https://", "http://"):
    host = host.replace(marker, "")
  return host.partition("/")[0]


def simplify_displayed_link(displayed_link):
  """Reduce a displayed link to its source site; None is passed through.

  Only the text before the first ' › ' separator is kept and then handed to
  extract_source_webpage.
  """
  if displayed_link is not None:
    site, _, _ = displayed_link.partition(' › ')
    return extract_source_webpage(site)
  return None

In [22]:
def format_search_results(search_data, title_field=None, highlight_field=None):
  """Standardize one search result as shown in Figure 3 (left) in the paper.

  Three layouts are handled: results whose 'type' is 'local_time', results
  whose 'type' is 'population_result', and everything else (the generic
  layout, which also folds 'rich_snippet' extensions, 'list' items and
  'contents'/'table' rows into the snippet).

  The input mapping is not modified; normalization happens on a shallow copy.

  Args:
    search_data: Mapping describing a single search result.
    title_field: Key to read the title from in the generic layout; defaults
      to 'title'.
    highlight_field: Key to read the highlight from in the generic layout;
      defaults to 'snippet_highlighted_words'.

  Returns:
    Dict with the keys 'source', 'date', 'title', 'snippet' and 'highlight';
    values that cannot be determined are None.
  """
  # Shallow copy so the two rewrites below never leak into the caller's dict.
  search_data = dict(search_data)

  field = 'snippet_highlighted_words'
  if isinstance(search_data.get(field), list):
    search_data[field] = ' | '.join(search_data[field])

  field = 'displayed_link'
  if field in search_data:
    search_data[field] = simplify_displayed_link(search_data[field])

  result_type = search_data.get('type')

  # edge case 1
  if result_type == 'local_time':
    source = search_data.get('displayed_link')
    date = format_date(search_data.get('date'))
    title = search_data.get('title')

    snippet = search_data.get('snippet')
    if snippet is None and 'result' in search_data:
      if isinstance(search_data.get('extensions'), list):
        snippet = '\n\t'.join(
            [search_data['result']] + search_data['extensions']
        )
      else:
        snippet = search_data['result']

    highlight = search_data.get('snippet_highlighted_words')
    if highlight is None and 'result' in search_data:
      highlight = search_data['result']

  # edge case 2
  elif result_type == 'population_result':
    source = search_data.get('displayed_link')
    sources = search_data.get('sources')
    if (
        source is None
        and isinstance(sources, list)
        and sources  # an empty list has no first entry to inspect
        and 'link' in sources[0]
    ):
      source = extract_source_webpage(sources[0]['link'])

    date = format_date(search_data.get('date'))
    if date is None and 'year' in search_data:
      date = format_date(search_data['year'])

    title = search_data.get('title')

    snippet = search_data.get('snippet')
    if snippet is None and 'population' in search_data:
      if 'place' in search_data:
        snippet = '\n\t'.join(
            [f"{search_data['place']} / Population", search_data['population']]
        )
      else:
        snippet = search_data['population']

    highlight = search_data.get('snippet_highlighted_words')
    if highlight is None and 'population' in search_data:
      highlight = search_data['population']

  else:
    source = search_data.get('displayed_link')
    date = format_date(search_data.get('date'))
    title = (
        search_data.get('title')
        if title_field is None
        else search_data.get(title_field)
    )
    highlight = (
        search_data.get('snippet_highlighted_words')
        if highlight_field is None
        else search_data.get(highlight_field)
    )
    snippet = search_data.get('snippet', '')

    if 'rich_snippet' in search_data:
      for key in ['top', 'bottom']:
        if (
            key in search_data['rich_snippet']
            and 'extensions' in search_data['rich_snippet'][key]
        ):
          snippet = '\n\t'.join(
              [snippet] + search_data['rich_snippet'][key]['extensions']
          )

    if 'list' in search_data:
      assert isinstance(search_data['list'], list)
      snippet = '\n\t'.join([snippet] + search_data['list'])

    if 'contents' in search_data and 'table' in search_data['contents']:
      tbl = search_data['contents']['table']
      assert isinstance(tbl, list)
      snippet += '\n'
      for row in tbl:
        snippet += f'\n{",".join(row)}'

    # A snippet that is only whitespace carries no evidence.
    if snippet is not None and snippet.strip() == '':
      snippet = None

  return {
      'source': source,
      'date': date,
      'title': title,
      'snippet': snippet,
      'highlight': highlight,
  }

In [31]:
def format_knowledge_graph(search_data):
  """Standardize a knowledge graph as shown in Figure 3 (left) in the paper.

  Returns a dict with 'source', 'date', 'title', 'snippet' and 'highlight'.
  'date' and 'highlight' are always None. The title is the graph's 'title',
  with its 'type' appended on a new tab-indented line when present. The
  snippet lists every remaining plain-string attribute as "name: value".
  """
  source = None
  if "source" in search_data and "link" in search_data["source"]:
    source = extract_source_webpage(search_data["source"]["link"])

  title = None
  if "title" in search_data:
    title = search_data["title"]
    if "type" in search_data:
      title += f"\n\t{search_data['type']}"

  # Skip the fields already used above, link/stick helper fields, values that
  # are not strings, and values that are URLs.
  attribute_lines = [
      f"\n\t{name}: {value}"
      for name, value in search_data.items()
      if name not in ("title", "type", "kgmid")
      and "_link" not in name
      and "_stick" not in name
      and isinstance(value, str)
      and not value.startswith("http")
  ]
  snippet = "".join(attribute_lines).strip() or None

  return {
      "source": source,
      "date": None,
      "title": title,
      "snippet": snippet,
      "highlight": None,
  }


def format_questions_and_answers(search_data):
  """Standardize a question/answer entry as in Figure 3 (left) in the paper.

  'link' becomes the source (via extract_source_webpage), 'question' the
  title and 'answer' the snippet; each is None when its key is absent.
  'date' and 'highlight' are always None.
  """
  return {
      "source": (
          extract_source_webpage(search_data["link"])
          if "link" in search_data
          else None
      ),
      "date": None,
      "title": search_data["question"] if "question" in search_data else None,
      "snippet": search_data["answer"] if "answer" in search_data else None,
      "highlight": None,
  }



## Demo Prompts

In [53]:
# Read the few-shot demonstrations that are prepended to every prompt.
# The encoding is explicit because the text contains non-ASCII characters
# (e.g. an em dash) and the platform default is not UTF-8 everywhere.
with open("./data/freshprompt_demo.txt", "r", encoding="utf-8") as f:
    freshprompt_demo = f.read()

print(freshprompt_demo)

query: What year is considered Albert Einstein's annus mirabilis?

source: philsci-archive.pitt.edu
date: None
title: The Turning Point for Einstein's Annus Mirabilis
snippet: The year 1905 has been called Einstein's annus mirabilis in virtue of three ground-breaking works completed over the span of a few months — the light.
highlight: 1905

source: cantorsparadise.com
date: None
title: Einstein's Miraculous Year: A Summary of the 1905 Annus ...
snippet: These are the four papers that Albert Einstein published in 1905, which are considered to be the foundation of modern physics.
highlight: 1905

source: quora.com
date: None
title: Why is 1905 Einstein's miracle year?
snippet: Thus, 1905 is called Einstein's annus mirabilis, or miracle year, and these papers are called his annus mirabilis papers .
highlight: 1905

source: en.wikipedia.org
date: None
title: Annus mirabilis papers
snippet: The annus mirabilis papers are the four papers that Albert Einstein published in Annalen der Physik 

## freshprompt_format method

In [12]:
question = "Who won the latest Nobel Prize in Physics?"

In [10]:
# Settings for the case where gpt-4 is used as the model.
# Number of organic results formatted from the search response (missing
# positions are padded with empty records).
num_organic_results = 15
# Number of related-question entries formatted from the search response.
num_related_questions = 3
# Not referenced by any cell visible in this notebook.
num_questions_and_answers = 3
# Number of rows taken from the tail of the sorted evidence table for the prompt.
num_retrieved_evidences = 15

In [9]:
import pandas as pd

# Empty evidence table. Later cells append their formatted results to it with
# pd.concat, so re-run this cell before re-running any of them; otherwise the
# same rows are appended a second time.
df = pd.DataFrame(columns=['source', 'date', 'title', 'snippet', 'highlight'])

In [19]:
search_data = call_search_engine(question)

In [20]:
search_data

{'search_metadata': {'id': '65536ab495bf92860dee7a18',
  'status': 'Success',
  'json_endpoint': 'https://serpapi.com/searches/15cabbac73034b35/65536ab495bf92860dee7a18.json',
  'created_at': '2023-11-14 12:40:20 UTC',
  'processed_at': '2023-11-14 12:40:20 UTC',
  'google_url': 'https://www.google.com/search?q=Who+won+the+latest+Nobel+Prize+in+Physics%3F&oq=Who+won+the+latest+Nobel+Prize+in+Physics%3F&hl=en&gl=us&sourceid=chrome&ie=UTF-8',
  'raw_html_file': 'https://serpapi.com/searches/15cabbac73034b35/65536ab495bf92860dee7a18.html',
  'total_time_taken': 5.67},
 'search_parameters': {'engine': 'google',
  'q': 'Who won the latest Nobel Prize in Physics?',
  'google_domain': 'google.com',
  'hl': 'en',
  'gl': 'us',
  'device': 'desktop'},
 'search_information': {'query_displayed': 'Who won the latest Nobel Prize in Physics?',
  'total_results': 140000000,
  'time_taken_displayed': 0.4,
  'menu_items': [{'position': 1,
    'title': '2023',
    'link': 'https://www.google.com/search?

In [23]:
# Organic results
# Always produce exactly num_organic_results records: positions the search
# response does not provide are filled with the all-None record that
# format_search_results returns for an empty dict.
available_organic = search_data.get('organic_results') or []
organic_results = [
    format_search_results(
        available_organic[k] if k < len(available_organic) else {}
    )
    for k in range(num_organic_results)
]

# Append in reverse rank order (best-ranked result last) with one concat
# instead of growing the frame row by row.
df = pd.concat([df, pd.DataFrame(organic_results[::-1])], ignore_index=True)

In [29]:
print(len(organic_results))
organic_results

15


[{'source': 'nobelprize.org',
  'date': None,
  'title': 'Nobel Prize in Physics Laureates',
  'snippet': 'The three Nobel Prize laureates in physics 2023 are being recognised for ... Read what caused the Royal Swedish Academy of Sciences to exclaim, “Here, at last!',
  'highlight': 'Nobel Prize laureates in physics | last'},
 {'source': 'nobelprize.org',
  'date': None,
  'title': 'All Nobel Prizes in Physics',
  'snippet': 'The Nobel Prize in Physics has been awarded 117 times to 225 Nobel Prize laureates between 1901 and 2023. John Bardeen is the only laureate who has been ...',
  'highlight': 'Nobel Prize in Physics | been awarded | laureates | laureate | been'},
 {'source': 'nobelprize.org',
  'date': None,
  'title': 'The official website of the Nobel Prize - NobelPrize.org',
  'snippet': "The Nobel Prize in Physics 2023 was awarded to Pierre Agostini, Ferenc ... Last year's laureates. 14 laureates were awarded Nobel Prizes in 2022. See a short ...",
  'highlight': 'Nobel Prize i

In [25]:
 # Related questions
# Always produce exactly num_related_questions records; missing positions are
# filled with the all-None record format_search_results returns for {}.
# The question text is used as the title of each record.
available_related = search_data.get('related_questions') or []
related_questions = [
    format_search_results(available_related[k], title_field='question')
    if k < len(available_related)
    else format_search_results({})
    for k in range(num_related_questions)
]

# Append in reverse order with one concat instead of one concat per row.
df = pd.concat([df, pd.DataFrame(related_questions[::-1])], ignore_index=True)


In [30]:
print(len(related_questions))
related_questions

3


[{'source': 'thehindu.com',
  'date': 'Oct 03, 2023',
  'title': 'Who won Nobel Prize 2023 in Physics?',
  'snippet': "The 2023 Nobel Prize in Physics has been awarded to Pierre Agostini, Ferenc Krausz, and Anne L'Huillier “for experimental methods that generate attosecond pulses of light for the study of electro dynamics in matter”, The Royal Swedish Academy of Science announced on October 3, 2023.",
  'highlight': None},
 {'source': 'nytimes.com',
  'date': 'Oct 03, 2023',
  'title': 'Who won the last Nobel Prize Physics?',
  'snippet': "Nobel Prize in Physics Awarded to 3 Scientists for Illuminating How Electrons Move. Techniques resulting from the work of Pierre Agostini, Ferenc Krausz and Anne L'Huillier let scientists capture the motions of subatomic particles moving at impossible speeds.",
  'highlight': None},
 {'source': 'pubs.acs.org',
  'date': 'Oct 09, 2023',
  'title': 'Who won the Nobel Prize in Physics 2023 attosecond?',
  'snippet': "The 2023 Nobel Prize in Physics has 

In [32]:
# Knowledge graph
# With no knowledge graph in the response an empty dict is formatted instead,
# which yields a record whose fields are all None.
knowledge_graph = format_knowledge_graph(
    search_data.get('knowledge_graph', {})
)
knowledge_graph_row = [knowledge_graph]
df = pd.concat([df, pd.DataFrame(knowledge_graph_row)], ignore_index=True)

In [33]:
knowledge_graph

{'source': None,
 'date': None,
 'title': None,
 'snippet': None,
 'highlight': None}

In [34]:
# Answer box
# The box's 'answer' field serves as the highlight. With no answer box in the
# response an empty dict is formatted, giving a record whose fields are all
# None (an empty dict has no 'answer' either).
answer_box = format_search_results(
    search_data.get('answer_box', {}), highlight_field='answer'
)
answer_box_row = [answer_box]
df = pd.concat([df, pd.DataFrame(answer_box_row)], ignore_index=True)

In [35]:
answer_box

{'source': 'nobelprize.org',
 'date': 'Oct 03, 2023',
 'title': 'The Nobel Prize in Physics 2023 - NobelPrize.org',
 'snippet': "The Nobel Prize in Physics 2023\n\tNiklas Elmehed © Nobel Prize Outreach. Pierre Agostini. Prize share: 1/3.\n\tNiklas Elmehed © Nobel Prize Outreach. Ferenc Krausz. Prize share: 1/3.\n\tNiklas Elmehed © Nobel Prize Outreach. Anne L'Huillier. Prize share: 1/3.",
 'highlight': None}

In [36]:
# Sort by date
# Oldest evidence first and undated rows before everything else, so the most
# recent evidence ends up at the tail of the table. The sort is stable: rows
# with the same (or no) date keep the order in which they were appended.
# Rows in which every column is missing (the padding records) are dropped.
df = (
    df.assign(date=df['date'].apply(format_date))
    .assign(datetime=lambda frame: pd.to_datetime(frame['date'], errors='coerce'))
    .sort_values(by='datetime', na_position='first', kind='stable')
    .replace({pd.NaT: None})
    .dropna(how='all')
)

In [37]:
df.head()

Unnamed: 0,source,date,title,snippet,highlight,datetime
10,nobelprize.org,,All Nobel Prizes,"Below, you can view the full list of Nobel Pri...",Nobel Prizes | Nobel Prize laureates | prizes ...,
12,nobelprize.org,,The official website of the Nobel Prize - Nobe...,The Nobel Prize in Physics 2023 was awarded to...,Nobel Prize in Physics | was awarded | Last | ...,
13,nobelprize.org,,All Nobel Prizes in Physics,The Nobel Prize in Physics has been awarded 11...,Nobel Prize in Physics | been awarded | laurea...,
14,nobelprize.org,,Nobel Prize in Physics Laureates,The three Nobel Prize laureates in physics 202...,Nobel Prize laureates in physics | last,
8,quantamagazine.org,"Oct 04, 2022",Pioneering Quantum Physicists Win Nobel Prize ...,"Alain Aspect, John Clauser and Anton Zeilinger...",won | Nobel Prize in Physics,2022-10-04 00:00:00


In [38]:
# Instruction appended after the question; the model's reply continues
# right after "answer: ".
reasoning_and_answer = (
    "\nPlease check if the question contains a valid premise before"
    " answering.\nanswer: "
)

# Select top_k supporting evidences overall: the last
# num_retrieved_evidences rows of the evidence table, one text block per row.
evidences = [
    f"\n\nsource: {row['source']}"
    f"\ndate: {row['date']}"
    f"\ntitle: {row['title']}"
    f"\nsnippet: {row['snippet']}"
    f"\nhighlight: {row['highlight']}"
    for _, row in df.tail(num_retrieved_evidences).iterrows()
]

# Layout: query line, evidence blocks, then the question with the instruction.
format_fresh_prompt = (
    f'\n\n\nquery: {question}'
    + ''.join(evidences)
    + f'\n\nquestion: {question}{reasoning_and_answer}'
)

In [40]:
print(format_fresh_prompt)




query: Who won the latest Nobel Prize in Physics?

source: nobelprize.org
date: None
title: All Nobel Prizes
snippet: Below, you can view the full list of Nobel Prizes and Nobel Prize laureates. Find all prizes in | physics | chemistry | physiology or medicine | literature | ...
highlight: Nobel Prizes | Nobel Prize laureates | prizes | physics

source: nobelprize.org
date: None
title: The official website of the Nobel Prize - NobelPrize.org
snippet: The Nobel Prize in Physics 2023 was awarded to Pierre Agostini, Ferenc ... Last year's laureates. 14 laureates were awarded Nobel Prizes in 2022. See a short ...
highlight: Nobel Prize in Physics | was awarded | Last | laureates | laureates | awarded

source: nobelprize.org
date: None
title: All Nobel Prizes in Physics
snippet: The Nobel Prize in Physics has been awarded 117 times to 225 Nobel Prize laureates between 1901 and 2023. John Bardeen is the only laureate who has been ...
highlight: Nobel Prize in Physics | been awarded | laur

In [47]:
fresh_prompt = freshprompt_demo + format_fresh_prompt

In [49]:
print(fresh_prompt)

query: What year is considered Albert Einstein's annus mirabilis?

source: philsci-archive.pitt.edu
date: None
title: The Turning Point for Einstein's Annus Mirabilis
snippet: The year 1905 has been called Einstein's annus mirabilis in virtue of three ground-breaking works completed over the span of a few months — the light.
highlight: 1905

source: cantorsparadise.com
date: None
title: Einstein's Miraculous Year: A Summary of the 1905 Annus ...
snippet: These are the four papers that Albert Einstein published in 1905, which are considered to be the foundation of modern physics.
highlight: 1905

source: quora.com
date: None
title: Why is 1905 Einstein's miracle year?
snippet: Thus, 1905 is called Einstein's annus mirabilis, or miracle year, and these papers are called his annus mirabilis papers .
highlight: 1905

source: en.wikipedia.org
date: None
title: Annus mirabilis papers
snippet: The annus mirabilis papers are the four papers that Albert Einstein published in Annalen der Physik 

In [50]:
answer = chat_completions(fresh_prompt)

In [51]:
print(answer)

The latest Nobel Prize in Physics, as of November 14, 2023, was awarded to Pierre Agostini, Ferenc Krausz, and Anne L'Huillier for their work on attosecond science.
