In [3]:
pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m225.3/232.6 kB[0m [31m8.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [4]:
pip install google-generativeai



In [6]:
pip install chromadb


Collecting chromadb
  Downloading chromadb-1.0.13-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.4 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-6.0.1-py3-none-any.whl.metadata (6.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.22.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-api>=1.2.0 (from chromadb)
  Downloading opentelemetry_api-1.34.1-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.34.1-py3-none-any.whl.metadata (2.4 kB)
Collecting opentelemetry-sdk>=1.2.0 (from chromadb)
  Downloading opentelemetry_sdk-1.34.1-py3-none-any.whl.metadata (1.6 kB)
Coll

In [7]:
pip install "chromadb[recommended]" # 推薦安裝完整版本，包含更多依賴



In [20]:
import PyPDF2
import re
import chromadb
import google.generativeai as genai
from chromadb.utils import embedding_functions
import os

# The Gemini API key is read from the GOOGLE_API_KEY environment variable.
# A KeyError is raised here if the variable has not been set before this cell
# runs, so export it in the environment that launches the kernel.

genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

# Embedding function handed to ChromaDB so it can embed documents and queries.
gemini_ef = embedding_functions.GoogleGenerativeAiEmbeddingFunction(
    api_key=os.environ["GOOGLE_API_KEY"],
    model_name="models/embedding-001",
)

def extract_text_from_pdf(pdf_path):
    """
    Read a PDF and return the text of all of its pages as a single string.

    Args:
        pdf_path (str): Location of the PDF file on disk.
    Returns:
        str: The text of every page that yielded any text, each followed by a
            blank line. None if opening or parsing the file raised an error
            (the error is printed).
    """
    try:
        with open(pdf_path, 'rb') as pdf_stream:
            page_texts = (
                page.extract_text()
                for page in PyPDF2.PdfReader(pdf_stream).pages
            )
            # Pages for which nothing could be extracted are skipped.
            return "".join(
                page_text + "\n\n" for page_text in page_texts if page_text
            )
    except Exception as error:
        print(f"Error reading PDF: {error}")
        return None

def split_text_into_chunks_fixed_length(text, chunk_size=300, chunk_overlap=50):
    """
    Splits text into overlapping chunks of at most ``chunk_size`` characters.

    Whitespace runs (including newlines) are collapsed to single spaces first.
    Each chunk preferably ends at a sentence terminator ('.' then '。'), then at
    a space, and only falls back to a hard cut at ``chunk_size`` characters when
    none of those occurs after the overlap region of the current window.

    Args:
        text (str): The complete text string.
        chunk_size (int): The maximum number of characters per text chunk.
        chunk_overlap (int): The number of overlapping characters between adjacent text chunks.
    Returns:
        list: A list of non-empty strings, where each string is a text chunk.
    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``chunk_overlap`` is
            negative or not smaller than ``chunk_size`` (the window could then
            never advance and the loop would not terminate).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer.")
    if chunk_overlap < 0 or chunk_overlap >= chunk_size:
        raise ValueError("chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size.")

    if not text:
        return []

    # r'\s+' already covers '\n' and '\r', so one substitution normalises everything.
    cleaned_text = re.sub(r'\s+', ' ', text).strip()
    text_length = len(cleaned_text)

    chunks = []
    start_index = 0
    while start_index < text_length:
        end_index = start_index + chunk_size
        if end_index >= text_length:
            # The remainder fits in one chunk. Using >= avoids emitting an extra
            # tail chunk that would consist solely of overlap.
            chunks.append(cleaned_text[start_index:])
            break

        # Only accept split points past the overlap region; otherwise the next
        # window would not move meaningfully forward. Separators are tried in
        # priority order, and a separator that only occurs inside the overlap
        # no longer blocks the fallback to the next one.
        search_from = start_index + chunk_overlap + 1
        split_point = -1
        for separator in ('.', '。', ' '):
            split_point = cleaned_text.rfind(separator, search_from, end_index)
            if split_point != -1:
                break

        if split_point != -1:
            chunks.append(cleaned_text[start_index:split_point + 1].strip())
            start_index = split_point + 1 - chunk_overlap
        else:
            # No usable boundary in this window: hard cut at the size limit.
            chunks.append(cleaned_text[start_index:end_index].strip())
            start_index = end_index - chunk_overlap

    return [chunk for chunk in chunks if chunk]


def create_chroma_db(documents, db_path="./chroma_db", collection_name="pdf_qa_collection"):
    """
    Creates (or opens) a persistent ChromaDB collection and stores the documents in it.

    Documents are stored under the deterministic ids ``doc_0 … doc_{n-1}`` and
    written with ``upsert``, so re-running this on the same ``db_path`` refreshes
    the stored chunks instead of leaving earlier content under those ids.
    Ids beyond the current document count from an earlier, longer run are not
    removed.

    Args:
        documents (list): A list containing the text chunks to be stored.
        db_path (str): The path where the ChromaDB database will be stored.
        collection_name (str): The name of the database collection.
    Returns:
        chromadb.api.models.Collection.Collection: The created or retrieved
            ChromaDB collection, or None if ``documents`` is empty or any
            error occurred (the error is printed).
    """
    # Nothing to embed or store: report it clearly instead of surfacing a
    # low-level validation error from the database layer.
    if not documents:
        print("Error creating or populating ChromaDB: no documents were provided.")
        return None

    try:
        # Initialize ChromaDB client
        # PersistentClient stores the data on disk under `path`
        client = chromadb.PersistentClient(path=db_path)

        # Create or get the collection
        # The embedding_function parameter tells ChromaDB how to generate embedding vectors
        collection = client.get_or_create_collection(
            name=collection_name,
            embedding_function=gemini_ef # Use our defined Gemini embedding function
        )

        # Generate a positional ID for each document
        ids = [f"doc_{i}" for i in range(len(documents))]

        # Insert new ids and overwrite existing ones; embeddings are generated
        # by the collection's embedding_function.
        collection.upsert(
            documents=documents,
            ids=ids
        )
        print(f"Successfully added {len(documents)} documents to ChromaDB collection '{collection_name}'.")
        print(f"Database storage path: {db_path}")
        return collection

    except Exception as e:
        print(f"Error creating or populating ChromaDB: {e}")
        return None

# --- Integrate all steps and execute ---

if __name__ == "__main__":
    pdf_file_path = "AI Team2 7.4報告.pdf"

    # Step 1: pull the raw text out of the PDF.
    full_pdf_text = extract_text_from_pdf(pdf_file_path)

    if not full_pdf_text:
        print("Failed to extract text from PDF.")
    else:
        print("Successfully extracted text from PDF.")

        # Step 2: cut the text into overlapping chunks.
        chunks = split_text_into_chunks_fixed_length(full_pdf_text, chunk_size=300, chunk_overlap=50)
        print(f"Split into {len(chunks)} text chunks.")

        # Step 3: store the chunks in a persistent ChromaDB collection.
        # db_path and collection_name can be changed freely.
        chroma_collection = create_chroma_db(
            documents=chunks,
            db_path="./my_pdf_chroma_db",
            collection_name="ai_team2_report_qa",
        )

        if not chroma_collection:
            print("Failed to create ChromaDB collection.")
        else:
            print("ChromaDB collection is ready.")
            # Smoke test: ask one question and show the two closest chunks.
            print("\nPerforming a simple query to verify the database:")
            results = chroma_collection.query(
                query_texts=["What is Chinese medicine used for?"],
                n_results=2,
            )
            print(results)

ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


Successfully extracted text from PDF.
Split into 9 text chunks.
Successfully added 9 documents to ChromaDB collection 'ai_team2_report_qa'.
Database storage path: ./my_pdf_chroma_db
ChromaDB collection is ready.

Performing a simple query to verify the database:
{'ids': [['doc_0', 'doc_2']], 'embeddings': None, 'documents': [['AI Identiﬁcation of Chinese Medicine Members : Hung Lung-Chen, Yu Pin-Yi, Hsieh Ching-Hung, Chen Kai-Jin Team2 Project : Introduction What is Chinese medicine? ●Uses herbs based on traditional Chinese medical theory. ●Comes from plants , animals , and minerals .', 'ypes of Chinese medicinal herbs look very similar. ●Chinese medicine is becoming increasingly popular worldwide. ● Key Features Advantage ●Quickly and accurately obtain information about Chinese medicine. ●Provides an additional option to help protect your health and defend against diseases .']], 'uris': None, 'included': ['metadatas', 'documents', 'distances'], 'data': None, 'metadatas': [[None, None]