This notebook creates a **30-minute Summary** from raw chapter text using OpenAI's API.



In [None]:
#@markdown # Enter the required information for the chapter summarization process.
#@markdown ---

#@markdown **Step 1:** Enter API Key and ISBN

# OpenAI API key; `prompt_model` in the summarization cell takes it as the
# default value of its `open_api_key` parameter.
# NOTE(review): a value typed into this form presumably ends up in the cell
# source -- clear it before sharing or committing the notebook.
open_api_key = '' #@param {type:"string"}
# Book identifier; used as the prefix of the output filename
# ("<isbn>_thirty_minute_read.json") written by the summarization cell.
isbn = '' #@param {type:"string"}



In [None]:
!pip install openai
!pip install openai transformers
import json
import time
import re
import io
from openai import OpenAI
from transformers import GPT2TokenizerFast
from google.colab import files
import concurrent.futures
from concurrent.futures import ProcessPoolExecutor

Collecting openai
  Downloading openai-1.23.6-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.6/311.6 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: h11, httpcore, httpx, openai
Successfully installed h11-0.14.0 httpcore-1.0.5 ht

In [None]:
#@markdown **Step 2:** Upload Chapter Extracts File

#@markdown Click on the **"Choose Files"** button to upload your chapter extracts file.

#@markdown The file should be in JSON format with the following structure for each chapter:

#@markdown ```
#@markdown {
#@markdown   "isbn": "ISBN-10",
#@markdown   "name": "Chapter Name",
#@markdown   "sequence_index": index,
#@markdown   "contents": "Chapter text",
#@markdown   "part": "Chapter part number"
#@markdown }
#@markdown ```
#@markdown Once the file is selected, it will be automatically uploaded.

# Keys every chapter object must provide. The summarization cell reads
# "contents" and "name"; the others are only checked for presence here.
REQUIRED_CHAPTER_KEYS = ("isbn", "name", "sequence_index", "contents", "part")

uploaded_files = files.upload()

if not uploaded_files:
    raise Exception("File upload failed, please try again.")

# Only the first uploaded file is used; any further files are ignored.
uploaded_filename = next(iter(uploaded_files))
uploaded_file = uploaded_files[uploaded_filename]

# Parse and validate the upload. `data` is the list of chapter dicts consumed
# by the summarization cell, which indexes it by position.
try:
    data = json.load(io.BytesIO(uploaded_file))
    # Check the overall shape first: without this, a top-level JSON object
    # would be iterated key by key and `key in chapter` would silently become
    # a substring test, while non-dict list items would raise an uncaught
    # TypeError instead of a readable validation error.
    if not isinstance(data, list) or not all(isinstance(chapter, dict) for chapter in data):
        raise ValueError("The file must contain a JSON list of chapter objects.")
    # Check that all necessary keys are present
    if not all(key in chapter for chapter in data for key in REQUIRED_CHAPTER_KEYS):
        raise ValueError("Some chapters do not contain all required keys.")
    print(f"File '{uploaded_filename}' successfully uploaded and read.")
except (json.JSONDecodeError, ValueError) as e:
    print(f"An error occurred while reading the file: {e}")
    raise


Saving 1626813582_autosplits.json to 1626813582_autosplits.json
File '1626813582_autosplits.json' successfully uploaded and read.


In [None]:
#@markdown **Step 3:** Generate Summary and Download Processed Data

#@markdown Click the **"Play"** button on the left of this cell to generate the chapter summaries and download the processed data.

def get_token_count(text):
    """Return the number of GPT-2 tokens in `text`.

    The tokenizer is loaded on the first call and then reused, so counting
    many chapters does not reload the vocabulary files for every chapter.

    NOTE(review): the count comes from the GPT-2 tokenizer, which is
    presumably not the tokenizer of the gpt-4 models used for summarization --
    treat it as an estimate.

    Args:
        text: The string to tokenize.

    Returns:
        int: Length of the tokenizer's "input_ids" for `text`.
    """
    tokenizer = getattr(get_token_count, "_tokenizer", None)
    if tokenizer is None:
        # Cache on the function object so no extra module-level name is needed.
        tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
        get_token_count._tokenizer = tokenizer
    return len(tokenizer(text)["input_ids"])

def prompt_model(text, model="gpt-3.5-turbo", open_api_key=open_api_key):
    """Send `text` as a single user message and return the model's reply text.

    A new OpenAI client is built for each call. Progress and the elapsed
    wall-clock time are printed; any failure is printed and then re-raised.

    Args:
        text: Full prompt, sent as the content of one "user" message.
        model: Name of the chat model to query.
        open_api_key: API key handed to the OpenAI client.

    Returns:
        The `message.content` of the first choice in the completion.

    Raises:
        Exception: Whatever the client raised, after it has been logged.
    """
    try:
        started_at = time.time()
        api = OpenAI(api_key=open_api_key)
        print("Prompting model...")
        completion = api.chat.completions.create(
            messages=[{"role": "user", "content": text}],
            model=model,
        )
        reply = completion.choices[0].message.content
        elapsed = time.time() - started_at
        print("Response received -- Time taken: {:.2f} seconds".format(elapsed))
        return reply
    except Exception as e:
        print(f"Failed to prompt model: {e}")
        raise

def prompt_response_to_pov(response):
    """Map a free-form model reply onto one of three point-of-view labels.

    Matching is case-insensitive: the reply is lower-cased before searching
    for "first" and then "second". Anything else, including an empty or
    missing reply, falls back to "third person".

    Args:
        response: Reply text from the model; may be None.

    Returns:
        str: "first person", "second person" or "third person".
    """
    # str.lower() returns a new string; the result must be kept, otherwise
    # replies such as "First person" are never matched.
    normalized = (response or "").lower()
    if "first" in normalized:
        return "first person"
    elif "second" in normalized:
        return "second person"
    else:
        return "third person"

def determine_pov(text):
    """Ask the 'gpt-4-1106-preview' model which point of view `text` uses.

    The raw reply from `prompt_model` is normalized by
    `prompt_response_to_pov`, and that normalized label is returned.
    """
    pov_prompt = f'''
    You are a editor. Determine the point of view of the text enclosed in backticks (```)
    Respond with one of the following: first person, second person, or third person.

    ```{text}```
    '''
    raw_reply = prompt_model(pov_prompt, model='gpt-4-1106-preview')
    return prompt_response_to_pov(raw_reply)

def summarize(text, chapter_name):
    """Request a condensed version of one chapter from 'gpt-4-1106-preview'.

    Args:
        text: Chapter body, embedded in the prompt between triple backticks.
        chapter_name: Chapter title, quoted in the prompt's first sentence.

    Returns:
        Whatever `prompt_model` returns for the assembled prompt.
    """
    condense_prompt = f'''

    You are a professional writer. You will be given a chapter that is titled {chapter_name}.
    Your goal is to write a condensed version such that a reader will have a full understanding of the original writing.

    * Your condensed version should be in the same point of view as the original text.
    * Your condensed version should fully encapsulate what was said in the original text.
    * Your condensed version should only include paragraphs.

    ```{text}```

    CONDENSED VERSION:
    '''

    return prompt_model(condense_prompt, model='gpt-4-1106-preview')

def remove_leading_and_trailing_fluff(text):
    """Have 'gpt-4-turbo-preview' tidy the edges of a chapter summary.

    The prompt asks the model to drop surrounding backticks, surrounding
    whitespace, and any leading title or chapter name. The cleanup is done by
    the model, not locally, so the result is whatever `prompt_model` returns.
    """
    cleanup_prompt = f'''

    You are a professional editor. You will be given a summary of a chapter from some book.

    * Remove the leading and trailing backticks (```) from the text if they exist.
    * Remove any leading or trailing whitespace from the text.
    * Remove any leading title or leading chapter name from the text.

    ```{text}```

    YOUR EDITED TEXT:
    '''

    return prompt_model(cleanup_prompt, model='gpt-4-turbo-preview')

def rewrite_in_pov(text, pov):
    """Ask 'gpt-4-1106-preview' to rewrite a summary in the given point of view.

    Args:
        text: Summary to rewrite, embedded between triple backticks.
        pov: Point-of-view label inserted verbatim into the instruction.

    Returns:
        Whatever `prompt_model` returns for the assembled prompt.
    """
    rewrite_prompt = f'''

    You are a professional editor. You will be given a summary of a chapter from some book.

    * Rewrite the summary in the point of view of {pov}.

    ```{text}```

    YOUR REWRITTEN TEXT:
    '''

    return prompt_model(rewrite_prompt, model='gpt-4-1106-preview')

def summarize_and_clean(chapter):
    """Summarize one chapter dict and return the cleaned-up summary.

    Reads `chapter["contents"]` and `chapter["name"]`, condenses the chapter
    with `summarize`, then passes the result through
    `remove_leading_and_trailing_fluff`.
    """
    # The point-of-view pass (determine_pov + rewrite_in_pov) is not wired in.
    # If it is re-enabled, read chapter["contents"]; the previously disabled
    # draft referenced chapter["content"], which is not one of the keys
    # checked when the file is uploaded.
    raw_summary = summarize(chapter["contents"], chapter["name"])
    return remove_leading_and_trailing_fluff(raw_summary)

def process_chapter(chapter):
    """Best-effort wrapper around `summarize_and_clean` for one chapter.

    Returns the cleaned summary on success. If anything raises, the error is
    printed and None is returned instead of propagating the exception.
    """
    try:
        cleaned = summarize_and_clean(chapter)
    except Exception as e:
        print(f"Error processing chapter: {e}")
        return None
    else:
        return cleaned

def generate_chapter_by_chapter_summary(data, max_workers=8):
    """Summarize every chapter concurrently and attach the results in place.

    Each chapter is handed to `process_chapter` on a worker thread. The work
    is dominated by waiting on HTTP responses, so threads give the same
    concurrency as worker processes without forking the notebook kernel or
    pickling chapters between processes.

    Args:
        data: List of chapter dicts. Each processed entry gains a
            "chapter_by_chapter" key holding its summary, or None when
            `process_chapter` could not produce one.
        max_workers: Maximum number of chapters processed at the same time.

    Returns:
        The same `data` list, mutated in place.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Map each future back to its chapter index; completion order is arbitrary.
        futures = {executor.submit(process_chapter, chapter): idx for idx, chapter in enumerate(data)}
        for future in concurrent.futures.as_completed(futures):
            idx = futures[future]
            try:
                summary = future.result()
            except Exception as e:
                print(f"Failed to process chapter {idx}: {e}")
                continue
            data[idx]["chapter_by_chapter"] = summary
            # process_chapter reports its own failures by returning None, so a
            # None result must not be announced as a success.
            if summary is None:
                print(f"Failed to process chapter {idx}: no summary was produced")
            else:
                print(f"Chapter {idx} processed successfully")
    return data

# Summarize every uploaded chapter. Each chapter dict in `data` gains a
# "chapter_by_chapter" entry and the same list object is returned, so
# `processed_data` and `data` refer to the same chapters.
processed_data = generate_chapter_by_chapter_summary(data)

def count_chapter_lengths(data):
    """Print, for each chapter, how many tokens its original contents hold.

    Counts come from `get_token_count` applied to `chapter['contents']`;
    nothing is returned.
    """
    for entry in data:
        token_total = get_token_count(entry['contents'])
        message = f"Chapter {entry['name']} has {token_total} tokens."
        print(message)

# Report the token count of each original chapter.
count_chapter_lengths(processed_data)

# Save the processed data to a file and offer it for the user to download.
# ensure_ascii=False writes non-ASCII characters verbatim, so the file is
# opened explicitly as UTF-8 instead of relying on the platform's default
# encoding, which may be unable to represent them.
output_filename = f"{isbn}_thirty_minute_read.json"
with open(output_filename, "w", encoding="utf-8") as output_file:
    json.dump(processed_data, output_file, ensure_ascii=False, indent=4)

files.download(output_filename)


  self.pid = os.fork()


Prompting model...
Prompting model...
Prompting model...Prompting model...

Prompting model...
Prompting model...
Prompting model...Prompting model...

Response received -- Time taken: 17.18 seconds
Prompting model...
Response received -- Time taken: 18.25 seconds
Prompting model...
Response received -- Time taken: 18.41 seconds
Prompting model...
Response received -- Time taken: 19.52 seconds
Prompting model...
Response received -- Time taken: 19.90 seconds
Prompting model...
Response received -- Time taken: 21.65 seconds
Prompting model...
Response received -- Time taken: 24.76 seconds
Prompting model...
Response received -- Time taken: 26.83 seconds
Prompting model...
Response received -- Time taken: 15.08 seconds
Prompting model...
Chapter 4 processed successfully
Response received -- Time taken: 18.30 seconds
Prompting model...
Chapter 5 processed successfully
Response received -- Time taken: 17.57 seconds
Prompting model...
Chapter 1 processed successfully
Response received -- Ti

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2208 > 1024). Running this sequence through the model will result in indexing errors


Chapter Foreword has 2208 tokens.


Token indices sequence length is longer than the specified maximum sequence length for this model (1849 > 1024). Running this sequence through the model will result in indexing errors


Chapter Exponential Organizations has 1849 tokens.


Token indices sequence length is longer than the specified maximum sequence length for this model (4582 > 1024). Running this sequence through the model will result in indexing errors


Chapter Chapter One: Illuminated by Information has 4582 tokens.


Token indices sequence length is longer than the specified maximum sequence length for this model (6352 > 1024). Running this sequence through the model will result in indexing errors


Chapter Chapter Two: A Tale of Two Companies has 6352 tokens.
Chapter Chapter Three: The Exponential Organization has 513 tokens.


Token indices sequence length is longer than the specified maximum sequence length for this model (2138 > 1024). Running this sequence through the model will result in indexing errors


Chapter Massive Transformative Purpose (MTP) has 2138 tokens.


Token indices sequence length is longer than the specified maximum sequence length for this model (1808 > 1024). Running this sequence through the model will result in indexing errors


Chapter Staff on Demand has 1808 tokens.


Token indices sequence length is longer than the specified maximum sequence length for this model (1687 > 1024). Running this sequence through the model will result in indexing errors


Chapter Community & Crowd has 1687 tokens.


Token indices sequence length is longer than the specified maximum sequence length for this model (2469 > 1024). Running this sequence through the model will result in indexing errors


Chapter Algorithms has 2469 tokens.


Token indices sequence length is longer than the specified maximum sequence length for this model (3653 > 1024). Running this sequence through the model will result in indexing errors


Chapter Engagement has 3653 tokens.


Token indices sequence length is longer than the specified maximum sequence length for this model (1995 > 1024). Running this sequence through the model will result in indexing errors


Chapter Dashboards has 1995 tokens.


Token indices sequence length is longer than the specified maximum sequence length for this model (3068 > 1024). Running this sequence through the model will result in indexing errors


Chapter Experimentation has 3068 tokens.


Token indices sequence length is longer than the specified maximum sequence length for this model (2408 > 1024). Running this sequence through the model will result in indexing errors


Chapter Autonomy has 2408 tokens.


Token indices sequence length is longer than the specified maximum sequence length for this model (3059 > 1024). Running this sequence through the model will result in indexing errors


Chapter Social Technologies has 3059 tokens.
Chapter Chapter Five: Implications of Exponential Organizations has 400 tokens.


Token indices sequence length is longer than the specified maximum sequence length for this model (1700 > 1024). Running this sequence through the model will result in indexing errors


Chapter 1. Information Accelerates Everything has 1700 tokens.


Token indices sequence length is longer than the specified maximum sequence length for this model (1905 > 1024). Running this sequence through the model will result in indexing errors


Chapter 6. Smaller Beats Bigger has 1905 tokens.


Token indices sequence length is longer than the specified maximum sequence length for this model (1712 > 1024). Running this sequence through the model will result in indexing errors


Chapter 9. Everything is Measurable and Anything is Knowable has 1712 tokens.
Chapter Chapter Six: Starting an ExO has 861 tokens.


Token indices sequence length is longer than the specified maximum sequence length for this model (1544 > 1024). Running this sequence through the model will result in indexing errors


Chapter Ignition has 1544 tokens.


Token indices sequence length is longer than the specified maximum sequence length for this model (1691 > 1024). Running this sequence through the model will result in indexing errors


Chapter Step 4: Breakthrough Idea has 1691 tokens.


Token indices sequence length is longer than the specified maximum sequence length for this model (2085 > 1024). Running this sequence through the model will result in indexing errors


Chapter Example 2: GitHub has 2085 tokens.


Token indices sequence length is longer than the specified maximum sequence length for this model (2356 > 1024). Running this sequence through the model will result in indexing errors


Chapter Example 5: GoPro has 2356 tokens.


Token indices sequence length is longer than the specified maximum sequence length for this model (1936 > 1024). Running this sequence through the model will result in indexing errors


Chapter Chapter Eight: ExOs for Large Organizations has 1936 tokens.


Token indices sequence length is longer than the specified maximum sequence length for this model (1621 > 1024). Running this sequence through the model will result in indexing errors


Chapter 2. Partner with, Invest in or Acquire ExOs has 1621 tokens.


Token indices sequence length is longer than the specified maximum sequence length for this model (1654 > 1024). Running this sequence through the model will result in indexing errors


Chapter Inspire ExOs at the Edge has 1654 tokens.


Token indices sequence length is longer than the specified maximum sequence length for this model (1850 > 1024). Running this sequence through the model will result in indexing errors


Chapter Partner with Accelerators, Incubators and Hackerspaces has 1850 tokens.


Token indices sequence length is longer than the specified maximum sequence length for this model (2558 > 1024). Running this sequence through the model will result in indexing errors


Chapter Chapter Ten: The Exponential Executive has 2558 tokens.


Token indices sequence length is longer than the specified maximum sequence length for this model (4066 > 1024). Running this sequence through the model will result in indexing errors


Chapter Epilogue: A New Cambrian Explosion has 4066 tokens.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>