In [None]:
#@title You will need an OpenAI API key, or access to some other LLM
import os

# Prefer the OPENAI_API_KEY environment variable so the key never has to be
# saved inside the notebook. Replace the "..." placeholder only if the
# environment variable is not available.
OPENAI_KEY = os.environ.get("OPENAI_API_KEY", "...")

A simple notebook for testing out OpenAI's document summarization abilities using gpt-3.5 and a Gradio interface. It allows you to upload a PDF document, extract the text, then try various prompt designs, chunk sizes, and output sizes to see what affects summary quality.

https://colab.research.google.com/drive/1PwyFmopC1D588aHyYE0Htf2cp8IZwuAk?usp=sharing#scrollTo=iV4BgiWzzOh-

Be aware, this code has NO error handling, and can probably break fairly easily!

Default behavior uses gpt-3.5-turbo for summarizing document chunks, then gpt-3.5-turbo-16k in order to perform the final summarization. This can be modified fairly easily to have, for example, 3.5-16k doing summaries, and then feeding the results into gpt-4 for higher quality results. Assuming one has access to gpt-4...

### Disclaimer
Unless required by applicable law or agreed to in writing, the code provided in this notebook is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

In [None]:
!pip install -q gradio langchain unstructured pdf2image openai tiktoken

In [None]:
import gradio as gr
from langchain.llms import OpenAI
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain import PromptTemplate
from langchain.docstore import document



In [None]:
import tiktoken
tokenizer = tiktoken.get_encoding('cl100k_base')
def tiktoken_len(text):
  """Return how many tokens the module-level `tokenizer` produces for `text`.

  Special-token markers appearing in `text` are encoded as ordinary text
  (`disallowed_special=()`), so arbitrary document content is accepted.
  """
  return len(tokenizer.encode(text, disallowed_special=()))

In [None]:
#@title All-in-One 1.0

# Default prompt applied to each document chunk by summarize_it().
# {text} is replaced with the chunk and {question} with the user's question.
summary_template_default = """Context:{text}
As an experienced legal analyst, review the Context which is part of a long document to answer the question.
{question}

Create a detailed and comprehensive outline of the Context with as many facts, descriptions, explanations, reasonings, previous case citations and their relationship to this case, as you can.

"""
# Default prompt for the final pass (summarize_it() and resummarize_it()).
# {text} is replaced with the concatenated chunk summaries and {question}
# with the user's question.
final_template_default = """Context:{text}
As an experienced legal analyst, use the Context and the question:
{question}

Compose a case brief using only information from the Context.
Working step by step, organize the Context outline into well-structured paragraphs in the following sections:

Case: [name of the case, court, year]
Questions Presented: The issues the Court must resolve.
Facts of the Case: the parties, facts and events leading to this case.
Procedural History: [district court case summary, appeals court case summary, how this issue reached this Court]
Analysis: subsections["Rules": detailed explanation of how the Court's considers relevant statutes, interpretations, standards, and tests applicable to this case, "case law": names of cases reviewed by the Court and analysis of how the Court relates those cases to the Questions Presented, "Application": detailed explanation of how the Rules and Case Law help the Court reach its conclusions]
Conclusion: the Court's ruling on the Questions Presented.
Dissent:  How it disagrees with the holding of this case.

"""

#def summarize_it(question, chunk_s, out_s, doc):
def summarize_it(question, chunk_s, out_s, doc, summary_template, final_template, token_ratio):
  """Summarize a long document in two passes: per chunk, then a final pass.

  The document is split into chunks of at most `chunk_s` tokens (measured
  with `tiktoken_len`). Each chunk is summarized with `summary_template`, and
  the concatenated chunk summaries are condensed with `final_template`.

  Args:
    question: Text substituted for {question} in both templates.
    chunk_s: Maximum chunk size, in tokens.
    out_s: `max_tokens` limit for the final summary.
    doc: Full text of the document to summarize.
    summary_template: Prompt for each chunk; must use {text} and {question}.
    final_template: Prompt for the final pass; must use {text} and {question}.
    token_ratio: Input/output ratio; each chunk summary is capped at
      `chunk_s // token_ratio` tokens.

  Returns:
    A tuple `(final_summary, summarized_texts, summarized_tokens)`: the final
    summary, the newline-joined chunk summaries, and the token count of those
    chunk summaries. For an empty document the tuple is
    `(explanatory message, "", 0)` and no LLM call is made.
  """
  # Nothing to summarize: return early instead of sending an empty Context to
  # the (billable) final model and getting an invented answer back.
  if not doc or not doc.strip():
    return "No document text to summarize. Upload a PDF or paste text into the Input Document box first.", "", 0

  text_splitter = RecursiveCharacterTextSplitter(
      chunk_size=int(chunk_s),  # cast defensively in case the slider value arrives as a float
      chunk_overlap=0,
      length_function=tiktoken_len,  # measure chunks in tokens rather than characters
      separators=['\n\n', '\n', ' ', '']
  )
  texts = text_splitter.split_text(doc)

  # Build both prompts before any LLM call so that a malformed template is
  # reported before chunk summaries have been paid for.
  s_prompt = PromptTemplate(
      input_variables=["text", "question"],
      template=summary_template
  )
  f_prompt = PromptTemplate(
      input_variables=["text", "question"],
      template=final_template
  )

  # Chunk summaries use gpt-3.5-turbo and are capped at chunk_s // token_ratio
  # tokens each; the final pass uses the larger-context gpt-3.5-turbo-16k.
  llms = OpenAI(temperature=0, openai_api_key=OPENAI_KEY, model_name="gpt-3.5-turbo", max_tokens=int(chunk_s // token_ratio))
  llmf = OpenAI(temperature=0, openai_api_key=OPENAI_KEY, model_name="gpt-3.5-turbo-16k", max_tokens=int(out_s))

  chunk_summaries = [llms(s_prompt.format(text=text, question=question)) for text in texts]
  summarized_texts = "".join(summary + "\n" for summary in chunk_summaries)

  final_summary = llmf(f_prompt.format(text=summarized_texts, question=question))
  summarized_tokens = int(tiktoken_len(summarized_texts))
  return final_summary, summarized_texts, summarized_tokens

# function call to rerun the Final Summary
def resummarize_it(question, out_s, summarized_texts, final_template):
  """Re-run only the final summarization over existing chunk summaries.

  Args:
    question: Text substituted for {question} in `final_template`.
    out_s: `max_tokens` limit for the final summary.
    summarized_texts: Concatenated chunk summaries, substituted for {text}.
    final_template: Prompt for the final pass; must use {text} and {question}.

  Returns:
    The final summary text, or an explanatory message (without calling the
    LLM) when there are no chunk summaries to work from.
  """
  # Without chunk summaries the model would receive an empty Context; skip
  # the (billable) call and tell the user what to do instead.
  if not summarized_texts or not summarized_texts.strip():
    return "No intermediate summaries to work from. Click Submit on the Summarizer tab first."

  f_prompt = PromptTemplate(
      input_variables=["text", "question"],
      template=final_template
  )
  # cast defensively in case the slider value arrives as a float
  llmf = OpenAI(temperature=0, openai_api_key=OPENAI_KEY, model_name="gpt-3.5-turbo-16k", max_tokens=int(out_s))
  final_prompt = f_prompt.format(text=summarized_texts, question=question)
  return llmf(final_prompt)

#function that selects PDF, extracts text to list, extracts page_content, counts tokens, sets token ratio.

def upload_file(files, token_ratio):
  """Extract the text of an uploaded PDF and derive its summary token ratio.

  Args:
    files: Uploaded-file object; only its `.name` attribute (the path handed
      to `UnstructuredPDFLoader`) is read.
    token_ratio: Ratio currently held in the UI state. Accepted for interface
      compatibility only: the ratio is recomputed from the new document so a
      previous upload cannot influence it.

  Returns:
    A tuple `(doc_content, docs_tokens, token_ratio)`: the extracted text, its
    token count, and the input/output ratio to use when summarizing chunks.
  """
  min_ratio = 2.5              # default input/output ratio for small documents
  final_prompt_budget = 12500  # tune this to change the max token ratio

  docs_raw = UnstructuredPDFLoader(files.name).load()
  # Join whatever the loader returned: identical to the first document's text
  # when there is exactly one, and an empty string (not an IndexError) when
  # nothing could be extracted.
  doc_content = "\n\n".join(d.page_content for d in docs_raw)
  docs_tokens = int(tiktoken_len(doc_content))

  # Larger documents need a higher compression ratio so that the concatenated
  # chunk summaries still fit the Final prompt; never go below the default.
  token_ratio = max(min_ratio, round(docs_tokens / final_prompt_budget, 1))
  return doc_content, docs_tokens, token_ratio

def set_template(radio_b, summary_working, final_working):
  """Pick the working template that matches the selected radio button.

  Returns `summary_working` for "Summarize", `final_working` for "Final",
  and None for any other selection.
  """
  if radio_b == "Summarize":
    return summary_working
  if radio_b == "Final":
    return final_working
  return None

def custom_prompt(radio_b, custom_prompt, summary_working, final_working):
  """Store the edited prompt as the working template chosen by `radio_b`.

  Returns `(query_text, summary_working, final_working)`. With "Summarize"
  or "Final" selected, the matching working template is replaced by
  `custom_prompt` and the text is echoed back; with any other selection both
  templates are returned unchanged and the query text is None.
  """
  if radio_b == "Summarize":
    return custom_prompt, custom_prompt, final_working
  if radio_b == "Final":
    return custom_prompt, summary_working, custom_prompt
  return None, summary_working, final_working

def restore_templates(radio_b):
  """Reset both working templates to the module-level defaults.

  Returns `(query_text, summary_template, final_template)`, where
  `query_text` is the default matching the selected radio button, or None
  when neither "Summarize" nor "Final" is selected.
  """
  defaults = (summary_template_default, final_template_default)
  if radio_b == "Summarize":
    shown = defaults[0]
  elif radio_b == "Final":
    shown = defaults[1]
  else:
    shown = None
  return (shown, *defaults)

# Gradio UI: a wide left column holding three tabs (Summarizer, Query
# Customizer, Debugging) and a narrower right column holding the output box.
with gr.Blocks() as demo:
  with gr.Row():
    # Column scales are integers (Gradio expects whole numbers for `scale`);
    # 5 and 2 keep the original 2.5 : 1 width ratio.
    with gr.Column(scale=5):
      with gr.Tab("Summarizer"):
        doc = gr.Textbox(lines=12, max_lines=15, label="Input Document", show_copy_button=True)
        question = gr.Textbox(label="Question")
        with gr.Row():
          doc_tokens = gr.Textbox(label="Tokens", scale=1)
          chunk_s = gr.Slider(400, 2800, step=100, label="Input Chunk Size", scale=4)
          out_s = gr.Slider(500, 4000, step=250, label="Output Summary Size", scale=4)

        # Document upload: fills the document box and token count, and
        # updates the token ratio kept in session state.
        with gr.Row():
          token_ratio = gr.State(2.5)
          upl_btn = gr.UploadButton("Upload", file_types=[".pdf"], file_count="single", size="sm")
          upl_btn.upload(fn=upload_file, inputs=[upl_btn, token_ratio], outputs=[doc, doc_tokens, token_ratio])
          sub_btn = gr.Button("Submit", variant="primary", size="sm")
        gr.Markdown(
            """
            Long Document Summarizer using gpt-3.5-turbo. Click "Upload" to upload a PDF document. The plain text will be extracted and displayed in the "Input Document"
            window, and Tokens will be calculated. Adjust the Input Chunk Size and Output Summary Size sliders to modify how the chain query
            is performed. You can modify the Prompt templates and get additional debugging data from the other tabs.<p>
            "Tokens" is the token count of the uploaded file. When you upload a file, a token ratio is calculated as Tokens / 12,500.
            The instruction for Summarizing will uses an Input/Output token ratio of 2.5 (default) or Tokens / 12,500 to ensure that
            the total input tokens to the Final prompt is around or under 13,000. Summarize prompts are performed by gpt-3.5-turbo. Input Chunk Sizes above
            2500 can exceed the allowable context limit depending on your prompts. Final prompts are performed by gpt-3.5-turbo-16k. Depending on total input
            tokens, Output Summary Size around 3000 tokens should be totally fine, although most of the time, the output is under 1000 tokens.<p>

            Given this equation, the Summarizer can process near-arbitrary sized documents into approximately 12,000 input tokens, then convert that to 1-2k output
            summaries. Think about those loss-ratios when devising your prompts and expectations around results.
            """)

      with gr.Tab("Query Customizer"):
        # Session state holding the working copies of both prompt templates.
        summary_save = gr.State(summary_template_default)
        final_save = gr.State(final_template_default)

        # Editor for the template selected with the radio buttons below.
        query_box = gr.Textbox(lines=12,interactive=True, label="Custom Queries, made to order by YOU!", show_copy_button=True)
        with gr.Row():
          t_choose = gr.Radio(["Summarize", "Final"], scale=3, label="Try these Fresh Prompts!")
          t_choose.change(fn=set_template, inputs=[t_choose, summary_save, final_save], outputs=query_box)

        # Save the edited text into the selected working template, or restore
        # both working templates to their defaults.
        with gr.Row():
          save_btn = gr.Button("Save", variant="primary", size="sm")
          save_btn.click(fn=custom_prompt, inputs=[t_choose, query_box, summary_save, final_save], outputs=[query_box, summary_save, final_save])
          clr2_btn = gr.Button("Restore Defaults", variant="stop", size="sm")
          clr2_btn.click(fn=restore_templates, inputs=[t_choose], outputs =[query_box, summary_save, final_save])
        gr.Markdown(
            """
            "Summarize" is the prompt used for each Chunk of the document.<p>
            "Final" is used to create the Output Summary.<p>
            Your edited queries *MUST* contain the {text} and {question} variables or they won't work.<p>

            Click on the Radio buttons to view the default Prompts. Both Prompt templates pass in two variables, {text}, and {question}.
            The {text} variable is the passed in document chunk, and {question} is the user-provided Question text from the 'Summarizer' tab.<p>
            You can modify the Prompts in this window and click "Save" to update the Prompt corresponding to the Radio Button selection, and that Prompt will be used when
            you click "Submit" from the 'Summarizer'' tab. Click "Restore Defaults"
            to revert back to the original default prompts. Keep in mind context limits and input chunk sizes as you modify these prompts.
            """)
      with gr.Tab("Debugging"):
        with gr.Row():
          summaries_tokens = gr.Textbox(scale=1, label="Summary Tokens")
        chunk_summaries = gr.Textbox(lines=15, label="Concatenated text of Intermediate summaries", show_copy_button=True)

        # Re-run only the final pass over the chunk summaries shown above.
        re_summarize = gr.Button("Rerun Final Summary", variant="primary")
        gr.Markdown(
            """
            Space for showing some intermediary stuff about how the Query process went. First is "Summarized Chunks", which is the individual chunk summarizations for
            evaluating how the Summarize query is performing, and also to see how much input data went into the Final query.<p>
            "Rerun Final Summary" will allow you to take the existing Summarized Chunks and redo just the Final prompt. You need to adjust the question and prompt template
            from the other tabs for now.
            """)
    with gr.Column(scale=2):
      output = gr.Textbox(lines=15, label="Output Box", show_copy_button=True)
      # Wired here because these handlers write to `output`, which is only
      # created in this column.
      sub_btn.click(fn=summarize_it, inputs=[question, chunk_s, out_s, doc, summary_save, final_save, token_ratio], outputs=[output, chunk_summaries, summaries_tokens])
      re_summarize.click(fn=resummarize_it, inputs=[question, out_s, chunk_summaries, final_save], outputs=output)

if __name__ == "__main__":
    # Enable request queuing, then start the server. share=True asks Gradio
    # for a public share link and debug=True keeps the cell attached to the
    # running app (see Gradio's launch documentation).
    queued_demo = demo.queue()
    queued_demo.launch(share=True, debug=True)