In [44]:
pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━[0m [32m174.1/232.6 kB[0m [31m5.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [52]:
import PyPDF2
import re

def extract_text_from_pdf(pdf_path):
    """
    Read a PDF file and return the text of its pages as one string.

    Args:
        pdf_path (str): Location of the PDF file to read.
    Returns:
        str: The text of every page that yielded text, each page followed by
        two newlines. None is returned (after printing the error) if opening
        or parsing the file raises any exception.
    """
    try:
        with open(pdf_path, 'rb') as pdf_handle:
            pdf_reader = PyPDF2.PdfReader(pdf_handle)
            # Lazy per-page extraction; it is consumed by the join below while
            # the file handle is still open.
            page_texts = (pdf_page.extract_text() for pdf_page in pdf_reader.pages)
            # Pages with no extractable text are skipped; the others get a
            # blank-line separator appended to help later segmentation.
            return "".join(
                page_text + "\n\n" for page_text in page_texts if page_text
            )
    except Exception as error:
        print(f"Error reading PDF: {error}")
        return None

def split_text_into_chunks_fixed_length(text, chunk_size=500, chunk_overlap=100):
    """
    Splits text into overlapping chunks of at most ``chunk_size`` characters.

    Whitespace is normalized first (every run of whitespace, including newlines,
    becomes a single space). Each chunk is then cut, in order of preference:
      1. right after the last sentence terminator (. ? ! and their full-width
         CJK forms) inside the current window,
      2. right after the last space inside the current window,
      3. at exactly ``chunk_size`` characters when neither exists.
    A cut is only accepted if it lies more than ``chunk_overlap`` characters past
    the chunk start, so every step moves forward. The next chunk starts
    ``chunk_overlap`` characters before the previous cut.

    Args:
        text (str): The complete text string.
        chunk_size (int): The maximum number of characters per text chunk.
        chunk_overlap (int): The number of overlapping characters between
            adjacent chunks. Must be smaller than ``chunk_size``.
    Returns:
        list: A list of non-empty, stripped strings, where each string is a text chunk.
    Raises:
        ValueError: If ``chunk_size`` is not positive or ``chunk_overlap`` is not
            in the range ``0 <= chunk_overlap < chunk_size`` (such values would
            prevent the splitting loop from ever advancing).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= chunk_overlap < chunk_size:
        raise ValueError("chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size")
    if not text:
        return []

    # Collapse every run of whitespace (spaces, newlines, form feeds, ...) into one space.
    cleaned_text = re.sub(r'\s+', ' ', text).strip()
    text_length = len(cleaned_text)
    sentence_endings = ('.', '?', '!', '。', '？', '！')

    chunks = []
    start_index = 0
    while start_index < text_length:
        end_index = start_index + chunk_size
        if end_index >= text_length:
            # The remainder fits in one chunk: take it whole and stop. Stopping
            # here also avoids emitting a trailing chunk made only of overlap.
            chunks.append(cleaned_text[start_index:].strip())
            break

        # Earliest acceptable cut position: anything at or before
        # start_index + chunk_overlap would not move the next start forward.
        min_split = start_index + chunk_overlap + 1

        # Prefer the last sentence terminator in the window ...
        split_point = max(
            cleaned_text.rfind(mark, min_split, end_index) for mark in sentence_endings
        )
        if split_point == -1:
            # ... otherwise fall back to the last space so words stay intact.
            split_point = cleaned_text.rfind(' ', min_split, end_index)

        # No usable boundary at all: hard cut at chunk_size.
        chunk_end = split_point + 1 if split_point != -1 else end_index

        chunks.append(cleaned_text[start_index:chunk_end].strip())
        # Step back by the overlap; the guards above guarantee this is > start_index.
        start_index = chunk_end - chunk_overlap

    # Filter out any potentially empty chunks
    return [chunk for chunk in chunks if chunk]

# Example usage: extract the PDF text, split it into overlapping chunks, and
# preview the first and last few chunks.
pdf_file_path = "AI Team2 7.4報告.pdf" # Use the PDF file name you provided
full_pdf_text = extract_text_from_pdf(pdf_file_path)

# extract_text_from_pdf returns None on error; an empty string (no extractable
# text) is also treated as a failure here.
if full_pdf_text:
    print("Successfully extracted text from PDF.")

    # Here we use fixed-length splitting and specify chunk size and overlap
    # You can adjust chunk_size and chunk_overlap based on your PDF content and needs
    chunks = split_text_into_chunks_fixed_length(full_pdf_text, chunk_size=300, chunk_overlap=50)

    print(f"Total {len(chunks)} text chunks split.")
    print("\n--- First 3 text chunks ---")
    for chunk_number, chunk in enumerate(chunks[:3], start=1):
        print(f"Text chunk {chunk_number}:\n{chunk}\n---")

    print("\n--- Last 3 text chunks ---")
    # Clamp at 0 so the printed chunk numbers stay correct when there are
    # fewer than three chunks in total.
    tail_start = max(len(chunks) - 3, 0)
    for chunk_number, chunk in enumerate(chunks[tail_start:], start=tail_start + 1):
        print(f"Text chunk {chunk_number}:\n{chunk}\n---")

    # You can choose to store these chunks in a variable for subsequent tasks
    # segmented_text_chunks = chunks
else:
    print("Failed to extract text from PDF.")



Successfully extracted text from PDF.
Total 9 text chunks split.

--- First 3 text chunks ---
Text chunk 1:
AI Identiﬁcation of Chinese Medicine Members : Hung Lung-Chen, Yu Pin-Yi, Hsieh Ching-Hung, Chen Kai-Jin Team2 Project : Introduction What is Chinese medicine? ●Uses herbs based on traditional Chinese medical theory. ●Comes from plants , animals , and minerals .
---
Text chunk 2:
ory. ●Comes from plants , animals , and minerals . ●Primary purpose is disease prevention and maintaining health . Introduction Our solution ●Use AI to identify Chinese medicinal herbs accurately . Why are we doing this project? ●Many types of Chinese medicinal herbs look very similar.
---
Text chunk 3:
ypes of Chinese medicinal herbs look very similar. ●Chinese medicine is becoming increasingly popular worldwide. ● Key Features Advantage ●Quickly and accurately obtain information about Chinese medicine. ●Provides an additional option to help protect your health and defend against diseases .
---

--- Las