In [109]:
# In[2]:
import PyPDF2
import os
import openai
from dotenv import load_dotenv


In [110]:
# In[3]:
def extract_section(pdf_path, start_page, end_page):
    """
    Extract text from a specific range of pages in a PDF.

    Args:
        pdf_path (str): Path to the PDF file.
        start_page (int): Starting page number (1-based, inclusive).
        end_page (int or None): Ending page number (1-based, inclusive).
            If None, extracts through the last page of the document.

    Returns:
        str: Text of the requested pages, each page followed by a newline.
            Pages that yield no text are skipped with a printed warning.
            Extraction stops with a printed error at the first page number
            that lies past the end of the document.

    Raises:
        ValueError: If start_page is less than 1.
    """
    # Reject non-positive start pages before touching the file: start_page - 1
    # would become a negative index, and a negative index into the page list
    # selects pages counted from the END of the document instead of failing.
    if start_page < 1:
        raise ValueError(f"start_page must be >= 1 (1-based), got {start_page}")

    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        page_texts = []

        # Handle case where end_page is None (to end of document)
        if end_page is None:
            end_page = len(pdf_reader.pages)

        # Page numbers are 1-based for callers but the page list is 0-based,
        # so the inclusive range [start_page, end_page] maps to
        # range(start_page - 1, end_page).
        for page_num in range(start_page - 1, end_page):
            try:
                page = pdf_reader.pages[page_num]
                page_text = page.extract_text()
                if page_text:
                    page_texts.append(page_text + "\n")
                else:
                    print(f"Warning: No text found on page {page_num + 1}")
            except IndexError:
                print(f"Error: Page number {page_num + 1} is out of range.")
                break

        # Join once at the end instead of growing a string inside the loop.
        return "".join(page_texts)


In [111]:
# In[4]:
def save_to_text(file_name, text):
    """
    Write a string to disk as UTF-8 text.

    The target is opened in write mode, so an existing file with the same
    name is overwritten.

    Args:
        file_name (str): Path of the text file to create.
        text (str): Content to store in the file.
    """
    with open(file_name, mode='w', encoding='utf-8') as out_handle:
        out_handle.write(text)


In [112]:
# In[5]:
def read_and_chunk_text(file_path, chunk_size=5000):
    """
    Read text from a file and split it into fixed-size chunks.

    Chunks are cut purely by character count, so a boundary can fall in the
    middle of a word or sentence. The final chunk may be shorter than
    chunk_size.

    Args:
        file_path (str): Path to the UTF-8 text file.
        chunk_size (int, optional): Size of each chunk in characters.
            Must be at least 1. Defaults to 5000.

    Returns:
        list: List of text chunks in document order (empty for an empty file).

    Raises:
        ValueError: If chunk_size is less than 1.
    """
    # A negative step would make range() yield nothing and silently drop the
    # whole text; a zero step fails with an unrelated range() message.
    # Validate up front so the caller gets a clear error in both cases.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    # Split text into consecutive, non-overlapping slices.
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]


In [113]:
# In[6]:
def clean_text(text):
    """
    Return the text with leading and trailing whitespace removed.

    This is a placeholder for fuller cleaning (special characters, repeated
    whitespace, ...); for now only the ends of the string are trimmed.

    Args:
        text (str): Text to clean.

    Returns:
        str: The trimmed text.
    """
    trimmed = text.strip()
    return trimmed


In [114]:
# In[7]:
def process_chunks(chunks):
    """
    Run clean_text over every chunk, reporting progress on stdout.

    Args:
        chunks (list): Text chunks to clean.

    Returns:
        list: The cleaned chunks, in the same order as the input.
    """
    results = []
    for position, raw_chunk in enumerate(chunks, start=1):
        # Progress line is 1-based: "Processing chunk 3/26..."
        print(f"Processing chunk {position}/{len(chunks)}...")
        results.append(clean_text(raw_chunk))
    return results


In [115]:
# In[8]:
def print_chunks(processed_chunks):
    """
    Print a preview (at most the first 1000 characters) of every chunk.

    Each preview is preceded by a numbered header and followed by a dashed
    separator line.

    Args:
        processed_chunks (list): Cleaned text chunks to preview.
    """
    separator = '\n' + '--' * 50 + '\n'
    for number, text in enumerate(processed_chunks, start=1):
        print(f"--- Printing chunk {number} ---\n")
        preview = text[:1000]  # keep the output short
        print(preview)
        print(separator)
#print_chunks(processed_chunks)


In [116]:
# In[9]:
def create_messages(chunk, system_prompt):
    """
    Create the message list for an OpenAI chat completion request.

    Args:
        chunk (str): Text chunk to process.
        system_prompt (str): System prompt defining the assistant's behavior.

    Returns:
        list: Three message dicts in order: the system prompt, the chunk as
            a user message, and a closing user instruction to retain the
            relevant information.
    """
    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": chunk},
        # The closing instruction comes from the caller, so it is sent as a
        # "user" message. Sending it with the "assistant" role would present
        # it to the model as something the model itself had already said.
        {"role": "user", "content": "Please retain relevant information from the chunk."},
    ]


In [122]:
# In[10]:
def summarize_chunk(chunk, system_prompt, model="gpt-4o-mini"):
    """
    Send one text chunk to the OpenAI chat completions API and return the reply.

    Any failure is reported on stdout and converted into an empty string, so
    a single bad chunk does not abort a longer run.

    Args:
        chunk (str): Text chunk to process.
        system_prompt (str): System prompt for the assistant.
        model (str, optional): OpenAI model to use. Defaults to "gpt-4o-mini".

    Returns:
        str: The reply text with surrounding whitespace stripped, or "" if
            the request failed or the reply carried no text.
    """
    try:
        response = openai.chat.completions.create(
            model=model,
            messages=create_messages(chunk, system_prompt),
            max_tokens=5000,  # Adjust as needed
            temperature=0.2,  # Adjust for creativity
        )
        content = response.choices[0].message.content
        # The message content can be None; report that explicitly instead of
        # relying on an AttributeError from None.strip() being swallowed below.
        if content is None:
            print("Error during summarization: response contained no text content.")
            return ""
        return content.strip()
    except Exception as e:
        # Deliberate best-effort: log and fall back to an empty result.
        print(f"Error during summarization: {e}")
        return ""
    
#def summarize(url):
#    website = Website(url)
#    response = openai.chat.completions.create(
#        model = "gpt-4o-mini",
#        messages = messages_for(website)
#    )
#    return response.choices[0].message.content




In [126]:
# In[11]:
import time  # Ensure time module is imported

def summarize_chunks(processed_chunks, system_prompt, model="gpt-4"):
    """
    Summarize every chunk in order and record how long each call takes.

    Args:
        processed_chunks (list): Cleaned text chunks to summarize.
        system_prompt (str): System prompt passed through to summarize_chunk.
        model (str, optional): OpenAI model to use. Defaults to "gpt-4".

    Returns:
        tuple: (summaries, chunk_times, total_time) where
            - summaries (list): One summary string per chunk.
            - chunk_times (list): Seconds spent on each chunk.
            - total_time (float): Seconds spent on the whole run.
    """
    overall_started = time.perf_counter()
    chunk_times = []
    summaries = []

    for number, text in enumerate(processed_chunks, start=1):
        print(f"🔄 Summarizing chunk {number}/{len(processed_chunks)}...")
        chunk_started = time.perf_counter()

        summaries.append(summarize_chunk(text, system_prompt, model))

        elapsed = time.perf_counter() - chunk_started
        chunk_times.append(elapsed)
        print(f"✅ Chunk {number} summarized in {elapsed:.2f} seconds.\n")

    total_duration = time.perf_counter() - overall_started
    print(f"🎉 Summarized {len(summaries)} chunks in {total_duration:.2f} seconds total.")

    return summaries, chunk_times, total_duration


In [119]:
# In[12]:
def assemble_summaries(summaries):
    """
    Join the individual summaries into one document.

    Consecutive summaries are separated by a blank line.

    Args:
        summaries (list): Summary strings, in output order.

    Returns:
        str: The combined text.
    """
    paragraph_break = '\n\n'
    return paragraph_break.join(summaries)


In [120]:
# In[13]:
def write_summaries(cleaned_summaries, output_file):
    """
    Store the assembled summaries on disk as UTF-8 text.

    The file is opened in write mode, so existing content is replaced.

    Args:
        cleaned_summaries (str): Assembled summary text.
        output_file (str): Path of the file to write.
    """
    with open(output_file, mode='w', encoding='utf-8') as out_handle:
        out_handle.write(cleaned_summaries)

In [95]:
# In[14]:
# Load environment variables from a .env file.
# The file location can be overridden by setting OPENAI_ENV_FILE in the
# process environment, so the notebook is not tied to one machine's layout;
# when it is unset, the original local path below is used.
env_path = os.getenv('OPENAI_ENV_FILE', 'C:/Users/MichaelJWirickJr/keys.env')  # Update this default if necessary
load_dotenv(env_path)

# Set the OpenAI API key
openai.api_key = os.getenv('OPENAI_API_KEY')

# Check if the API key is loaded correctly (optional for debugging).
# Only the presence of the key is reported; the key itself is never printed.
if openai.api_key:
    print("✅ OpenAI API Key loaded successfully.")
else:
    print("❌ Error: OpenAI API Key not found. Please check your .env file.")


✅ OpenAI API Key loaded successfully.


In [96]:
# In[15]:
# System prompt sent with every chunk. It asks the model to keep as much of
# the source text as possible rather than condense it.
# (A "Respond in markdown." instruction is currently disabled.)
system_prompt = (
    "You are an assistant that analyzes the contents of an emergency medicine book"
    " please do not summarize, instead, retain as much relevant text as possible,"
    " ignoring text that might be chapter or section related. "
)


In [97]:
# In[16]:
# Location of the source PDF (update to point at your local copy).
pdf_path = r'C:/Users/MichaelJWirickJr/Tintinallis_Emergency_Medicine/Tintinallis_Emergency_Medicine.pdf'

# First page of each numbered section, followed by the first page of the index.
# Every section runs from its own boundary to the next one, so the table of
# (start_page, end_page) pairs is derived from this single list.
# NOTE(review): each section's end page equals the next section's start page,
# and extract_section treats end_page as inclusive, so a boundary page is
# extracted for both neighbours - confirm that overlap is intended.
_section_boundaries = [
    20, 74, 88, 130, 186, 208, 270, 298, 332, 438, 508, 596,
    646, 664, 704, 758, 792, 806, 862, 896, 926, 938, 946,
]

sections = {
    f"Section {number}": (first_page, last_page)
    for number, (first_page, last_page) in enumerate(
        zip(_section_boundaries, _section_boundaries[1:]), start=1
    )
}
# None as the end page means "through the end of the document".
sections['Index'] = (_section_boundaries[-1], None)


In [98]:
# In[17]:
# Example: Process a specific section.
# section_name must be one of the keys of `sections`; the names bound here
# (section_name, start_page, end_page) are read by the cells that follow.
section_name = 'Section 1'  # Change as needed
start_page, end_page = sections[section_name]

# A falsy end_page (None for the index entry) is shown as 'end of document'.
print(f"Selected {section_name} with pages {start_page} to {end_page if end_page else 'end of document'}")


Selected Section 1 with pages 20 to 74


In [99]:
# In[18]:
# Step 1: Extract text for the selected section
# (uses pdf_path, start_page and end_page set in the earlier cells).
extracted_text = extract_section(pdf_path, start_page, end_page)

# Step 2: Save the extracted text to a plain text file.
# The file is named after the section and has no directory component, so it
# is written relative to the current working directory.
txt_file_name = f"{section_name}.txt"
save_to_text(txt_file_name, extracted_text)
print(f"✅ Extracted text saved to {txt_file_name}")


✅ Extracted text saved to Section 1.txt


In [100]:
# In[19]:
# Step 3: Read and chunk the extracted text file.
# No chunk_size is passed, so read_and_chunk_text uses its default size.
chunks = read_and_chunk_text(txt_file_name)
print(f"✅ Text split into {len(chunks)} chunks.")

✅ Text split into 26 chunks.


In [77]:
# In[20]:
# Step 4: Process each chunk (e.g., cleaning).
# process_chunks prints one progress line per chunk and returns the cleaned list.
processed_chunks = process_chunks(chunks)
print(f"✅ Processed {len(processed_chunks)} chunks.")

Processing chunk 1/26...
Processing chunk 2/26...
Processing chunk 3/26...
Processing chunk 4/26...
Processing chunk 5/26...
Processing chunk 6/26...
Processing chunk 7/26...
Processing chunk 8/26...
Processing chunk 9/26...
Processing chunk 10/26...
Processing chunk 11/26...
Processing chunk 12/26...
Processing chunk 13/26...
Processing chunk 14/26...
Processing chunk 15/26...
Processing chunk 16/26...
Processing chunk 17/26...
Processing chunk 18/26...
Processing chunk 19/26...
Processing chunk 20/26...
Processing chunk 21/26...
Processing chunk 22/26...
Processing chunk 23/26...
Processing chunk 24/26...
Processing chunk 25/26...
Processing chunk 26/26...
✅ Processed 26 chunks.


In [101]:
# Sanity check on the result of Step 4. This cell expects `processed_chunks`
# to exist already; the call that would recreate it is left disabled here:
#processed_chunks = process_chunks(chunks)
#print(f"✅ Processed {len(processed_chunks)} chunks.")

# Print the length of each processed chunk (numbered from 1).
for i, chunk in enumerate(processed_chunks, start=1):
    print(f"Length of chunk {i}: {len(chunk)}")


Length of chunk 1: 5000
Length of chunk 2: 5000
Length of chunk 3: 4999
Length of chunk 4: 5000
Length of chunk 5: 5000
Length of chunk 6: 4999
Length of chunk 7: 5000
Length of chunk 8: 5000
Length of chunk 9: 5000
Length of chunk 10: 5000
Length of chunk 11: 5000
Length of chunk 12: 5000
Length of chunk 13: 4999
Length of chunk 14: 5000
Length of chunk 15: 4999
Length of chunk 16: 4999
Length of chunk 17: 5000
Length of chunk 18: 5000
Length of chunk 19: 5000
Length of chunk 20: 5000
Length of chunk 21: 5000
Length of chunk 22: 5000
Length of chunk 23: 5000
Length of chunk 24: 5000
Length of chunk 25: 5000
Length of chunk 26: 3496


In [None]:
# In[21]:
# Optional: Print chunks for verification (prints first 1000 characters of each chunk).
# Purely informational; nothing is assigned, so later cells do not depend on it.
print_chunks(processed_chunks)


In [None]:
# In[22]:
# Step 5: Summarize each chunk using OpenAI with timing.
# summarize_chunks returns a 3-tuple, so the result is unpacked into
# summaries, chunk_times and total_time rather than bound to a single name.

# Start the summarization process and capture timing information.
# No model argument is passed, so summarize_chunks' default model is used.
# NOTE(review): the single-chunk test cell calls summarize_chunk directly,
# whose default model differs from summarize_chunks' - confirm which model
# is intended for the full run.
summaries, chunk_times, total_time = summarize_chunks(processed_chunks, system_prompt)

print(f"✅ Summarized {len(summaries)} chunks.")

# Optional: Display detailed timing information (chunks numbered from 1).
print("\n--- Timing Details ---\n")
for idx, duration in enumerate(chunk_times, start=1):
    print(f"Chunk {idx}: {duration:.2f} seconds")
print(f"\n🔔 Total summarization time: {total_time:.2f} seconds")


In [127]:
# In[23]:
# Step 6: Assemble all summaries into a single text.
# Requires `summaries` from Step 5; the result is one string.
assembled_summaries = assemble_summaries(summaries)
print("✅ Assembled all summaries into a single document.")


✅ Assembled all summaries into a single document.


In [128]:
# In[24]:
# Step 7: Save the assembled summaries to a new output file.
# The name is derived from the selected section and has no directory
# component, so the file is written relative to the current working directory.
output_file_name = f"{section_name}_summaries.md"
write_summaries(assembled_summaries, output_file_name)
print(f"✅ Summaries saved to {output_file_name}")


✅ Summaries saved to Section 1_summaries.md


In [None]:
# In[25]:
# Optional: Print the assembled summaries.
# This prints the entire assembled text, which can be long.
print("\n--- Assembled Summaries ---\n")
print(assembled_summaries)


In [123]:
def qc(chunk_index=0):
    """
    Summarize a single processed chunk as a quick quality check.

    Reads the notebook-level `processed_chunks` and `system_prompt`, sends the
    selected chunk through summarize_chunk (using that function's default
    model) and prints the result.

    Args:
        chunk_index (int, optional): Index into processed_chunks of the chunk
            to test. Defaults to 0 (the first chunk).

    Returns:
        tuple: (test_chunk, summary) - the chunk that was sent and the text
            that came back.
    """
    # Select the chunk for testing
    test_chunk = processed_chunks[chunk_index]

    # Summarize the selected chunk
    summary = summarize_chunk(test_chunk, system_prompt)

    # Display the summary
    print("✅ Summary of the Test Chunk:")
    print(summary)

    return test_chunk, summary


# Run the check on the first chunk. The results are bound to the same
# notebook-level names (test_chunk, summary) the cell has always produced.
test_chunk, summary = qc()


✅ Summary of the Test Chunk:
Control of the airway is the single most important task for emergency resuscitation.

The initial approach to airway management involves simultaneous assessment and management of airway patency and oxygenation and ventilation. 

1. Assess the patient’s color and respiratory rate; respiratory or cardiac arrest may indicate immediate intubation.
2. Open the airway with the head tilt–chin lift maneuver (use jaw thrust if C-spine injury is suspected). If needed, bag the patient with a bag-valve-mask device that includes an O2 reservoir. A good seal depends on proper mask size. This technique may require an oral or nasal airway or two rescuers (one to seal the mask with two hands and the other to bag the patient).
3. Provide continuous monitoring of vital signs, oxygen saturation, and end-tidal CO2 (if possible).
4. Determine the need for invasive airway management techniques. Do not wait for arterial blood gas analyses if the initial assessment indicates the ne