In [None]:
!pip install openai
!pip install faiss-cpu
!pip install scikit-learn

import os
import json
import nltk
from typing import List, Dict
from google.colab import files
from openai import OpenAI
import numpy as np
import faiss
from sklearn.preprocessing import normalize
import re

from google.colab import userdata

# Download the NLTK 'punkt' resource; nltk.sent_tokenize is called further
# down in this file (presumably it relies on this data — confirm for the
# installed NLTK version).
nltk.download('punkt')

# Set up the shared OpenAI client. The API key is not hard-coded: it is read
# through google.colab's userdata under the name 'openai_key'.
client = OpenAI(api_key=userdata.get('openai_key'))

# Reuse functions from the previous script
def upload_file() -> str:
    """Upload a file to Google Colab and return its name.

    Opens the Colab upload dialog through ``files.upload()`` and returns the
    first key of the mapping it produces.

    Returns:
        The name of the first uploaded file.

    Raises:
        ValueError: If nothing was uploaded (for example, the dialog was
            dismissed), rather than leaking a bare ``StopIteration``.
    """
    uploaded = files.upload()
    # A default keeps next() from raising StopIteration on an empty upload.
    file_name = next(iter(uploaded), None)
    if file_name is None:
        raise ValueError("No file was uploaded.")
    return file_name

def read_pdf(file_path: str) -> str:
    """Read text from a PDF file.

    PyPDF2 is imported lazily. If it is missing, it is installed once with
    pip for the running interpreter; this replaces the notebook-only
    ``!pip install`` shell escape, which re-ran on every call and is not
    valid Python outside IPython.

    Args:
        file_path: Path to the PDF file on disk.

    Returns:
        The extracted text of all pages, concatenated in page order.
    """
    try:
        import PyPDF2
    except ImportError:
        import importlib
        import subprocess
        import sys

        subprocess.check_call([sys.executable, "-m", "pip", "install", "PyPDF2"])
        # Make the freshly installed package visible to the import system.
        importlib.invalidate_caches()
        import PyPDF2

    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Join once instead of repeated `+=`; `or ''` guards against a page
        # whose extraction yields nothing usable.
        return ''.join(page.extract_text() or '' for page in reader.pages)

def clean_text(text: str) -> str:
    """Reduce extracted text to whitespace-normalized ASCII.

    Characters that cannot be encoded as ASCII are dropped (not
    transliterated), every run of whitespace becomes a single space, and
    leading/trailing whitespace is removed.
    """
    ascii_only = text.encode('ascii', errors='ignore').decode('ascii')
    # Splitting on whitespace and re-joining collapses runs and trims the ends.
    words = ascii_only.split()
    return ' '.join(words)

def split_into_sentences(text: str) -> List[str]:
    """Return the sentences of *text* as produced by ``nltk.sent_tokenize``."""
    sentence_list = nltk.sent_tokenize(text)
    return sentence_list

def create_chunks(sentences: List[str], chunk_size: int = 10) -> List[str]:
    """Group consecutive sentences into space-joined chunks.

    Each chunk holds up to ``chunk_size`` sentences; the final chunk may be
    shorter. An empty sentence list yields an empty list.
    """
    chunk_starts = range(0, len(sentences), chunk_size)
    return [
        ' '.join(sentences[start:start + chunk_size])
        for start in chunk_starts
    ]

def get_initial_context(text: str, max_chars: int = 10000) -> Dict:
    """Get initial context for the entire document.

    Sends the first ``max_chars`` characters of ``text`` to the chat model
    and asks for main topics, key entities, overall sentiment and a brief
    summary as JSON.

    Args:
        text: The document text to analyze.
        max_chars: Number of leading characters of ``text`` included in the
            prompt. Defaults to 10000.

    Returns:
        The model's reply parsed from JSON.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
    """
    # The truncation note used to live inside the f-string and was therefore
    # sent to the model as part of the prompt; it is a code comment now.
    prompt = f"""Analyze the following text and provide:
    1. Main topics (abstract concepts, themes, or subject areas)
    2. Key entities (specific people, companies, organizations, or locations)
    3. Overall sentiment
    4. Brief summary

    Ensure that the main topics and key entities are distinct categories without overlap.

    Text: {text[:max_chars]}

    Respond in JSON format. You are part of a bigger system, and you must always respond with a pure JSON response. The system will break if you don't. NEVER add a JSON code block syntax to your response"""

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant that analyzes text and extracts key information."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.3,
        # Ask the API for a JSON object so json.loads below does not trip
        # over code fences or surrounding prose.
        response_format={"type": "json_object"},
    )

    return json.loads(response.choices[0].message.content)


def process_chunk(chunk: str, global_context: Dict, previous_chunks_metadata: List[Dict]) -> Dict:
    """Process a single chunk and extract metadata.

    Builds a prompt from the chunk text, the JSON-serialized global context
    and the last three entries of ``previous_chunks_metadata``, then asks the
    chat model for a fixed set of metadata fields as JSON.

    Args:
        chunk: The text of the chunk to analyze.
        global_context: Document-level context; it is serialized with
            ``json.dumps`` into the prompt.
        previous_chunks_metadata: Earlier results; only the last three
            entries are included. ``None`` or an empty list is treated as no
            history.

    Returns:
        The model's reply parsed from JSON.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
    """
    # `or []` covers None; slicing an empty list already yields [].
    recent_metadata = (previous_chunks_metadata or [])[-3:]

    prompt = f"""Analyze the following text chunk and provide metadata according to the specified format.
    Use the provided global context and previous chunk metadata for consistency.

    Global Context: {json.dumps(global_context)}

    Previous Chunks Metadata: {json.dumps(recent_metadata)}

    Text Chunk: {chunk}

    Provide the following metadata in JSON format. Ensure that each category is distinct and there is no overlap between them:

    1. concepts: Abstract ideas or themes discussed in the text (max 3)
    2. subjects: Specific areas or fields of study mentioned (max 3)
    3. topics: Particular issues or matters being discussed (max 3)
    4. people: Names of individuals mentioned
    5. dates: Any dates or time periods referenced
    6. organizations: Names of companies, institutions, or groups
    7. locations: Any places or geographical areas mentioned
    8. title: The title of the article or document (if applicable)
    9. author: The name of the author(s) (if mentioned)
    10. citations: Any references, citations, or hyperlinks
    11. sentiment: Overall sentiment of the chunk (positive, negative, or neutral)
    12. key_sentences: 1-2 important sentences that summarize main points
    13. entity_relationships: Relationships between entities in the format {{"subject": "entity1", "relationship": "verb", "object": "entity2"}}

    You are part of a bigger system, and you must always respond with a pure JSON response. The system will break if you don't. NEVER add a JSON code block syntax to your response."""

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant that analyzes text chunks and extracts detailed metadata."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.3,
        # Ask the API for a JSON object so json.loads below does not trip
        # over code fences or surrounding prose.
        response_format={"type": "json_object"},
    )

    return json.loads(response.choices[0].message.content)

def process_document(text: str, chunk_size: int = 10) -> List[Dict]:
    """Process the entire document.

    Cleans the text, splits it into sentences, groups them into chunks of
    ``chunk_size`` sentences, obtains a document-level context, and then
    extracts metadata for every chunk in order.

    Args:
        text: Raw document text.
        chunk_size: Number of sentences per chunk.

    Returns:
        One dict per chunk with the keys ``"original_text"`` and
        ``"metadata"``. An empty list when the text yields no chunks.
    """
    cleaned_text = clean_text(text)
    sentences = split_into_sentences(cleaned_text)
    chunks = create_chunks(sentences, chunk_size)

    if not chunks:
        # Nothing to analyze; skip the document-level API call as well.
        return []

    global_context = get_initial_context(cleaned_text)

    processed_chunks = []
    # Only metadata is handed to process_chunk as history. Passing the full
    # records would also embed each previous chunk's original text in the
    # prompt under the label "Previous Chunks Metadata".
    metadata_history = []
    for chunk in chunks:
        chunk_metadata = process_chunk(chunk, global_context, metadata_history)
        metadata_history.append(chunk_metadata)
        processed_chunks.append({
            "original_text": chunk,
            "metadata": chunk_metadata
        })

    return processed_chunks


def get_embedding(text: str) -> np.ndarray:
    """Embed *text* with the OpenAI embeddings endpoint.

    Requests the "text-embedding-3-small" model through the module-level
    client and returns the first embedding of the response as a NumPy array.
    """
    api_result = client.embeddings.create(
        input=text,
        model="text-embedding-3-small",
    )
    first_item = api_result.data[0]
    return np.array(first_item.embedding)

def create_chunked_vector_db(processed_chunks: List[Dict]) -> tuple:
    """Create a vector database from the processed chunks.

    Every chunk dict is serialized in full with ``json.dumps`` (the whole
    record, not only a metadata sub-dict), embedded, and row-normalized
    before being added to a flat L2 FAISS index.

    Args:
        processed_chunks: The chunk records to index, one vector per record.

    Returns:
        A tuple ``(index, embeddings)`` where ``embeddings`` is the
        normalized float32 matrix that was added to ``index``.

    Raises:
        ValueError: If ``processed_chunks`` is empty.
    """
    if not processed_chunks:
        raise ValueError("processed_chunks is empty; nothing to index.")

    embeddings = np.array(
        [get_embedding(json.dumps(chunk)) for chunk in processed_chunks]
    )
    # FAISS works on contiguous float32 matrices; convert explicitly rather
    # than relying on the wrapper to cast.
    embeddings = np.ascontiguousarray(normalize(embeddings), dtype=np.float32)

    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)

    return index, embeddings

def create_fulltext_vector_db(text: str) -> tuple:
    """Create a vector database from the full text.

    The text is split into sentences with ``nltk.sent_tokenize``; each
    sentence is embedded, row-normalized and added to a flat L2 FAISS index.

    Args:
        text: The document text to index sentence by sentence.

    Returns:
        A tuple ``(index, embeddings, sentences)`` where row ``i`` of
        ``embeddings`` belongs to ``sentences[i]``.

    Raises:
        ValueError: If the text contains no sentences.
    """
    sentences = nltk.sent_tokenize(text)
    if not sentences:
        raise ValueError("text contains no sentences; nothing to index.")

    embeddings = np.array([get_embedding(sentence) for sentence in sentences])
    # FAISS works on contiguous float32 matrices; convert explicitly rather
    # than relying on the wrapper to cast.
    embeddings = np.ascontiguousarray(normalize(embeddings), dtype=np.float32)

    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)

    return index, embeddings, sentences

# Search a FAISS index; hits carry chunk metadata when `data` is given, otherwise only the index and distance
def perform_search(query: str, index: faiss.IndexFlatL2, embeddings: np.ndarray, data: List[Dict] = None, k: int = 5) -> List[tuple]:
    """Search ``index`` for the vectors nearest to the embedded ``query``.

    Args:
        query: The search text; it is embedded and normalized the same way
            as the indexed vectors.
        index: The FAISS index to search.
        embeddings: The matrix the index was built from. It is not read
            here and is kept for interface compatibility.
        data: Optional chunk records aligned with the index rows. When given
            (and non-empty), each hit also carries ``data[idx]['metadata']``.
        k: Maximum number of hits to request.

    Returns:
        ``(idx, distance, metadata)`` tuples when ``data`` is given, else
        ``(idx, distance)`` tuples, in the order FAISS returned them. Fewer
        than ``k`` tuples are returned when the index holds fewer vectors.
    """
    query_vector = get_embedding(query)
    query_vector = normalize(query_vector.reshape(1, -1))

    distances, indices = index.search(query_vector, k)
    results = []
    for idx, distance in zip(indices[0], distances[0]):
        if idx < 0:
            # FAISS fills missing neighbours with -1 when the index holds
            # fewer than k vectors; indexing with -1 would silently return
            # the last entry as a bogus hit.
            continue
        if data:  # For chunked DB
            metadata = data[idx]['metadata']
            results.append((idx, distance, metadata))
        else:  # For full-text DB
            results.append((idx, distance))
    return results

# Run every query against both vector databases and print the hits for each
def compare_performance(processed_chunks: List[Dict], full_text: str, queries: List[str]):
    """Print search hits from the chunked and full-text databases per query.

    Builds both vector databases once, then for each query prints the chunked
    hits (chunk index, score, concepts and key sentences) followed by the
    full-text hits (sentence index, score and the sentence itself).
    """
    chunk_index, chunk_vectors = create_chunked_vector_db(processed_chunks)
    text_index, text_vectors, sentences = create_fulltext_vector_db(full_text)

    divider = "-" * 50
    print("Comparison Results:")
    print(divider)

    for query in queries:
        print(f"Query: {query}")

        # Chunk-level hits carry the metadata extracted for that chunk.
        chunk_hits = perform_search(query, chunk_index, chunk_vectors, processed_chunks)
        print("Chunked DB - Top 5 results:")
        for hit in chunk_hits:
            idx, score, metadata = hit
            print(f"Chunk {idx} (Score: {score:.4f}):")
            print(f"Concepts: {metadata['concepts']}")
            print(f"Key Sentences: {metadata['key_sentences']}")

        # Sentence-level hits are resolved back to the sentence text.
        sentence_hits = perform_search(query, text_index, text_vectors)
        print("\nFull-text DB - Top 5 results:")
        for hit in sentence_hits:
            idx, score = hit
            print(f"Sentence {idx} (Score: {score:.4f}): {sentences[idx]}")

        print(divider)
# Entry point: upload a PDF, process it into chunks, save the results, and run the comparison

def main():
    """Run the notebook workflow end to end.

    Uploads a PDF, extracts and processes its text, writes the processed
    chunks to ``<name>_processed.json``, offers that file for download, and
    finally compares the chunked and full-text vector databases on a fixed
    set of queries.
    """
    print("Please upload a PDF file.")
    file_name = upload_file()

    print(f"Processing {file_name}...")
    text = read_pdf(file_name)

    processed_chunks = process_document(text)

    # Derive the output name from the upload's name without its extension.
    base_name, _extension = os.path.splitext(file_name)
    output_file = f"{base_name}_processed.json"
    with open(output_file, 'w') as out_handle:
        json.dump(processed_chunks, out_handle, indent=2)

    print(f"Processing complete. Results saved to {output_file}")
    files.download(output_file)

    # Perform vector database comparison
    comparison_queries = [
        "What is the main topic of the document?",
        "Who are the key people mentioned?",
        "What are the main conclusions or findings?",
    ]

    compare_performance(processed_chunks, text, comparison_queries)


if __name__ == "__main__":
    main()

Collecting openai
  Downloading openai-1.44.1-py3-none-any.whl.metadata (22 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.5-py3-none-any.whl.metadata (20 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading openai-1.44.1-py3-none-any.whl (373 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m373.5/373.5 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpx-0.27.2-py3-none-any.whl (76 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K   [90m━━

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Please upload a PDF file.


Saving graham.pdf to graham.pdf
Processing graham.pdf...
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Processing complete. Results saved to graham_processed.json


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Comparison Results:
--------------------------------------------------
Query: What is the main topic of the document?
Chunked DB - Top 5 results:
Chunk 0 (Score: 1.4893):
Concepts: ['Founder Mode', 'Manager Mode', 'Leadership']
Key Sentences: ["The theme of Brian's talk was that the conventional wisdom about how to run larger companies is mistaken.", 'He had to figure out a better way on his own, which he did partly by studying how Steve Jobs ran Apple.']
Chunk 5 (Score: 1.5068):
Concepts: ['Founder Mode', 'Delegation', 'Company Growth']
Key Sentences: ["So is it a good idea, or a bad one? We still don't know.", 'Founder mode will be more complicated than manager mode.']
Chunk 7 (Score: 1.5189):
Concepts: ['Founder Mode', 'CEO Impact', 'Modular Approach']
Key Sentences: ['The modular approach does at least limit the damage a bad CEO can do.', 'Thanks to Brian Chesky, Patrick Collison, Ron Conway, Jessica Livingston, Elon Musk, Ryan Petersen, Harj Taggar, and Garry Tan for reading draft