# A Prompt Pattern Catalog to Enhance Prompt Engineering with Gemini

Paper: [A Prompt Pattern Catalog to Enhance Prompt Engineering with ChatGPT](https://arxiv.org/abs/2302.11382)

In [None]:
import json
from ast import literal_eval
import numpy as np
import pandas as pd
import google.generativeai as genai
import streamlit as st
from tqdm.auto import tqdm

# Register tqdm's progress-bar integration with pandas.
tqdm.pandas()

# Configure the Gemini client with the API key read from Streamlit's secrets.
genai.configure(api_key=st.secrets["GOOGLE_API_KEY"])

## Parsing the $\LaTeX$ File

In [None]:
# Load the raw LaTeX source of the paper. The encoding is pinned to UTF-8 so
# the result does not depend on the platform's default locale encoding.
with open("./data/2302.11382v1", "r", encoding="utf-8") as f:
  doc = f.read()
# Preview the first 100 characters to confirm the file was read.
doc[:100]

In [None]:
# Cut the document at the bibliography: `doc` keeps everything before it and
# `ref` holds the bibliography itself. Unpacking into exactly two names means
# the marker has to occur exactly once, otherwise a ValueError is raised.
bib_marker = "\\begin{thebibliography}"
doc, ref = doc.split(bib_marker)
# `split` drops the separator, so put the marker back at the front of `ref`.
ref = bib_marker + ref

In [None]:
# Split the body at every "\section" command; the text before the first one is
# discarded. Each piece starts with "{Title}...": everything up to the first
# "}" is the title (minus its leading character, with backslashes removed) and
# the remainder, stripped, is the section text.
sections = [
  (heading[1:].replace('\\', ''), body.strip())
  for heading, _sep, body in (
    chunk.partition('}') for chunk in doc.split("\\section")[1:]
  )
]
pd.DataFrame(sections, columns=['section', 'text'])

In [None]:
# Break every section into its subsections. The piece in front of the first
# "\subsection" gets an empty subsection name and is kept unstripped; every
# later piece is split into title and body at its first "}".
subsections = [
  (section_name, "", chunk)
  if position == 0
  else (section_name, heading[1:].replace('\\', ''), body.strip())
  for section_name, section_text in sections
  for position, chunk in enumerate(section_text.split("\\subsection"))
  for heading, _sep, body in [chunk.partition('}')]
]
pd.DataFrame(subsections, columns=['section', 'subsection', 'text'])

In [None]:
# Same splitting one level deeper: the piece in front of the first
# "\subsubsection" gets an empty subsubsection name and is kept unstripped;
# every later piece is split into title and body at its first "}".
subsubsections = [
  (section_name, subsection_name, "", chunk)
  if position == 0
  else (section_name, subsection_name, heading[1:].replace('\\', ''), body.strip())
  for section_name, subsection_name, subsection_text in subsections
  for position, chunk in enumerate(subsection_text.split("\\subsubsection"))
  for heading, _sep, body in [chunk.partition('}')]
]
# One row per chunk, labelled with its three heading levels.
df_chunks = pd.DataFrame(
  subsubsections,
  columns=['section', 'subsection', 'subsubsection', 'text'],
)
df_chunks

In [None]:
# Keep only chunks whose text is longer than 30 characters, then renumber the
# rows from 0.
df_chunks = df_chunks.loc[df_chunks['text'].str.len() > 30].reset_index(drop=True)
df_chunks

In [None]:
# Collapse every run of whitespace in the three heading columns to one space.
heading_columns = ['section', 'subsection', 'subsubsection']
df_chunks[heading_columns] = df_chunks[heading_columns].apply(
  lambda column: column.str.replace(r'\s+', ' ', regex=True)
)
# Prefix each chunk's text with LaTeX heading commands naming its section,
# subsection and subsubsection.
df_chunks['text'] = [
  f"\\section{{{sec}}}\n\\subsection{{{sub}}}\n\\subsubsection{{{subsub}}}\n{body}"
  for sec, sub, subsub, body in zip(
    df_chunks['section'],
    df_chunks['subsection'],
    df_chunks['subsubsection'],
    df_chunks['text'],
  )
]

df_chunks

In [None]:
# Render the distinct (section, subsection, subsubsection) combinations as a
# markdown table.
toc_mkdn = (
  df_chunks.loc[:, ['section', 'subsection', 'subsubsection']]
  .drop_duplicates()
  .to_markdown()
)
print(toc_mkdn)

In [None]:
# Sampling settings for the table-of-contents model; JSON output is requested
# through the response MIME type.
generation_config = dict(
  temperature=0.3,
  top_p=0.95,
  top_k=64,
  max_output_tokens=8192,
  response_mime_type="application/json",
)

# Every listed safety category is set to 'block_none'.
safety_settings = {
  category: 'block_none'
  for category in ('harassment', 'hate', 'sex', 'danger')
}

# Model instructed to turn the markdown heading table into a JSON-wrapped
# markdown list.
model = genai.GenerativeModel(
  model_name="gemini-1.5-flash",
  system_instruction="You return a simple table of contents with JSON format from a given markdown table. The JSON contains a key/value pair of strings 'table of contents' and markdown un-ordered list using `-`.",
  safety_settings=safety_settings,
  generation_config=generation_config,
)

# Send the markdown table as the only message of a fresh chat.
chat_session = model.start_chat()
response = chat_session.send_message(toc_mkdn)

In [None]:
# Only the first part of the first candidate is read; it is parsed as JSON and
# the markdown list is taken from its 'table of contents' key.
reply_text = response.candidates[0].content.parts[0].text
toc_json = json.loads(reply_text)
toc = toc_json['table of contents']
print(toc)

In [None]:
# Save the table of contents to disk; the retriever section reads it back
# from the same path.
with open('./data/toc.txt', 'w') as f:
  f.write(toc)

## Making Embeddings

In [None]:
# Embed every chunk as a retrieval document, one API call per chunk, in
# df_chunks row order.
embds = []
# Selecting the four columns explicitly keeps this cell re-runnable after the
# "embedding" column has been added to df_chunks (a bare 4-way unpack of
# iterrows() would fail on the fifth column).
chunk_columns = ['section', 'subsection', 'subsubsection', 'text']
for s, ss, sss, text in tqdm(
  df_chunks[chunk_columns].itertuples(index=False, name=None),
  total=len(df_chunks),
):
  # Use the deepest non-empty heading as the title. It is recomputed for each
  # row, so a title can never carry over from the previous chunk.
  title = sss or ss or s
  embds.append(
    genai.embed_content(
      model="models/text-embedding-004",
      content=text,
      task_type="retrieval_document",
      # If every heading is empty, send no title rather than an empty string.
      title=title or None,
    )["embedding"]
  )

In [None]:
# Sanity check: number of values in the first embedding.
len(embds[0])

In [None]:
# Attach the embeddings to the chunks; embds was filled in df_chunks row order.
df_chunks["embedding"] = embds

In [None]:
# Display the chunk table, now including the embedding column.
df_chunks

In [None]:
# Persist chunks and embeddings as CSV. The retriever parses the embedding
# column back with literal_eval, i.e. the vectors are stored as text.
df_chunks.to_csv("./data/2302.11382v1_embeddings.csv")

## Retriever

In [None]:
# Reload the persisted chunks, using the first CSV column as the index and
# replacing missing values with ''.
df_csv = pd.read_csv("./data/2302.11382v1_embeddings.csv", index_col=0).fillna('')
# The embeddings come back as text; parse each one into a NumPy array.
df_csv["embedding"] = df_csv["embedding"].map(
  lambda raw: np.array(literal_eval(raw))
)

# Table of contents written earlier; it is interpolated into the system prompt.
with open('./data/toc.txt', 'r') as toc_file:
  toc = toc_file.read()

def search_from_section_names(query:list[str]) -> str:
  """Retrieves LaTeX chunks from the paper "A Prompt Pattern Catalog to Enhance Prompt Engineering with ChatGPT" using the [section, subsection, subsubsection] names.

Args:
    query: A python list of three strings in the format `[section, subsection, subsubsection]`. Only exact matches of the names and order, will be returned.
  """
  # NOTE(review): the docstring above is kept word-for-word on purpose. This
  # function is passed to the model as a tool, and the SDK presumably forwards
  # the docstring as the tool description, so rewording it would change what
  # the model is told -- confirm before editing it.
  #
  # Returns: a JSON string (DataFrame.to_json) with the section, subsection,
  # subsubsection and text columns of every matching chunk.

  # Normalise the request: falsy entries (None, '') become '', and a query with
  # fewer than three names is padded with ''. Entries past the third are ignored.
  names = [name if name else '' for name in list(query)]
  names += [''] * (3 - len(names))

  heading_columns = ['section', 'subsection', 'subsubsection']

  # First pass: all three headings must match exactly.
  exact = (
    (df_csv['section'] == names[0])
    & (df_csv['subsection'] == names[1])
    & (df_csv['subsubsection'] == names[2])
  )
  res_df = df_csv[exact]
  if res_df.empty:
    # Fallback: substring match on every level ('' matches everything).
    # regex=False makes the names literal text; with the default regex
    # matching, a name containing '.', '(' or '+' would mis-match or raise.
    partial = (
      df_csv['section'].str.contains(names[0], regex=False)
      & df_csv['subsection'].str.contains(names[1], regex=False)
      & df_csv['subsubsection'].str.contains(names[2], regex=False)
    )
    res_df = df_csv[partial]
  return res_df[heading_columns + ['text']].to_json()

def search_from_text(query:str, top_n:int=5, s:float=.0) -> str:
  """Retrieves LaTeX chunks from the paper "A Prompt Pattern Catalog to Enhance Prompt Engineering with ChatGPT" using cosine similarity of text.

Args:
  query: The user's query string.
  top_n: The number of chunks to retrieve. The default value is 5. Start at 3 and recommend increasing it if needed.
  s: The minimum cosine similarity a chunk must reach to be returned. The default value is 0.0.

Returns:
  A JSON string with the section, subsection, subsubsection, text and similarity of the best matching chunks, most similar first.
  """
  # Embed the query with the same model used for the chunks, as a retrieval
  # query.
  query_embedding = np.array(genai.embed_content(
    model="models/text-embedding-004",
    content=query,
    task_type="retrieval_query",
  )["embedding"])
  query_norm = np.linalg.norm(query_embedding)

  def cosine(chunk_embedding):
    # Divide the dot product by both vector lengths so the score really is the
    # cosine similarity the docstring promises. For unit-length vectors this
    # equals the plain dot product. A zero-length vector scores 0.
    scale = np.linalg.norm(chunk_embedding) * query_norm
    return float(np.dot(chunk_embedding, query_embedding) / scale) if scale else 0.0

  # Cast defensively in case the caller passes a non-int number.
  top_n = int(top_n)
  # assign() returns a new frame, so the module-level df_csv is not modified.
  scored = df_csv.assign(similarity=df_csv.embedding.apply(cosine))
  best = (
    scored[scored.similarity >= s]
    .sort_values("similarity", ascending=False)
    .head(top_n)
  )
  return best[['section', 'subsection', 'subsubsection', 'text', 'similarity']].to_json()

In [None]:
# Example lookup by section and subsection name, with an empty subsubsection.
search_from_section_names(['A Catalog of Prompt Patterns for Conversational LLMs', 'The Output Automater Pattern', ''])

In [None]:
# Sampling settings for the retrieval-augmented chat model (plain-text output).
generation_config = dict(
  temperature=1.0,
  top_p=0.95,
  top_k=64,
  max_output_tokens=8192,
  response_mime_type="text/plain",
)

# Every listed safety category is set to 'block_none'.
safety_settings = dict.fromkeys(
  ('harassment', 'hate', 'sex', 'danger'),
  'block_none',
)

# System prompt: retrieval rules, language handling, and the table of contents
# interpolated at the end.
system_instruction = f"""You are a retrieval-augmented generative engine. 
Your primary task is to retrieve the contents of the paper titled "A Prompt Pattern Catalog to Enhance Prompt Engineering with ChatGPT".

**Retrieval Process:**

1. **Attempt Retrieval:** Always try to retrieve the paper's content first, even if you are confident in your knowledge.
2. **Retrieval Failure:** If you cannot find the paper, simply state that you are unable to retrieve it. **Do not** rely on your prior knowledge.
3. **Structured Retrieval:** When using the `search_from_section_names` function, prioritize filling all three parameters `[section, subsection, subsubsection]` to retrieve a relevant chunk. However,  `subsection` or `subsubsection` can be empty strings (`''`) if necessary.
4. **Cosine Similarity:** If you cannot determine the appropriate section or subsection, use the `search_from_text` function, which leverages cosine similarity between the query and the document body text. 
5. **Additional Retrieval:** If you believe more chunks are needed, ask the user if they would like to retrieve additional information.

**Language Handling:**

* Respond in Korean (한국어) if the user's query is in Korean.
* Respond in English otherwise.

**Table of Contents:**

{toc}"""

# The two retriever functions are handed to the model as tools.
model = genai.GenerativeModel(
  model_name="gemini-1.5-flash",
  system_instruction=system_instruction,
  tools=[search_from_section_names, search_from_text],
  safety_settings=safety_settings,
  generation_config=generation_config,
)

# Automatic function calling is switched on for this chat session.
chat_session = model.start_chat(enable_automatic_function_calling=True)
response = chat_session.send_message(
  "Categorize prompt patterns based on the subsection Summary of the Prompt Pattern Catalog."
)

In [None]:
# Print the text of the first part of the first candidate in the reply.
print(response.candidates[0].content.parts[0].text)

In [None]:
# Inspect the conversation history of the chat session.
chat_session.history

In [None]:
# Rewind the chat session. NOTE(review): per the SDK documentation this should
# drop the most recent request/response pair from the history -- confirm.
chat_session.rewind()

In [None]:
# Inspect the history again, after the rewind.
chat_session.history

In [None]:
# Empty the stored history in place.
# NOTE(review): this reaches into the private `_history` attribute, which may
# change between SDK versions -- check whether a public way to reset exists.
chat_session._history.clear()

In [None]:
# Inspect the history once more, after clearing it.
chat_session.history