In [None]:
import pandas as pd
import anthropic
import time
import re
from tqdm import tqdm
import json
import PyPDF2
from langchain.text_splitter import RecursiveCharacterTextSplitter

# 1️⃣ Claude API client
# Module-level Anthropic client used by query_claude() below.
# NOTE(review): the api_key value appears to be a redacted placeholder —
# confirm a real key is supplied before running.
client = anthropic.Anthropic(
    api_key="*****"
)

# 2️⃣ PDF Processing Function
def extract_text_from_pdf(pdf_path):
    """Read every page of a PDF and return the text as a single string.

    The text of each page is preceded by a ``--- Page N ---`` marker line
    (N is 1-based) so page boundaries stay visible in the combined text.
    """
    page_sections = []
    with open(pdf_path, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        for page_number, pdf_page in enumerate(reader.pages, start=1):
            extracted = pdf_page.extract_text()
            page_sections.append(f"\n--- Page {page_number} ---\n{extracted}")

    return "".join(page_sections)

# 3️⃣ Text Chunking Function  
def chunk_text(text, chunk_size=1000, chunk_overlap=200, doc_id="research_paper"):
    """Split text into overlapping chunks and return them as a DataFrame.

    Args:
        text: The full text to split.
        chunk_size: Forwarded to RecursiveCharacterTextSplitter as
            ``chunk_size``.
        chunk_overlap: Forwarded to RecursiveCharacterTextSplitter as
            ``chunk_overlap``.
        doc_id: Value written to the ``doc_ID`` column of every row. Defaults
            to ``"research_paper"``, the value that was previously hard-coded.

    Returns:
        A DataFrame with one row per chunk and the columns ``doc_ID``,
        ``chunk_ID`` (``chunk_001``, ``chunk_002``, ...), ``chunk`` and
        ``chunk_length``. The columns are present even when the splitter
        returns no chunks, so callers can index them unconditionally.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ".", "!", "?", ";", ":", ",", " ", ""]
    )

    chunks = text_splitter.split_text(text)

    chunk_data = [
        {
            "doc_ID": doc_id,
            "chunk_ID": f"chunk_{number:03d}",
            "chunk": chunk,
            "chunk_length": len(chunk),
        }
        for number, chunk in enumerate(chunks, start=1)
    ]

    # Explicit columns keep the schema stable for an empty chunk list.
    return pd.DataFrame(
        chunk_data,
        columns=["doc_ID", "chunk_ID", "chunk", "chunk_length"],
    )

# 4️⃣ Updated prompt for research papers
def build_primary_prompt(text_chunk: str) -> str:
    """Build the first-pass extraction prompt for one text chunk.

    The returned prompt asks the model to list named entities and
    subject-predicate-object triples, gives worked examples for several
    special cases, specifies the ``#### ENTITIES:`` / ``#### TRIPLES:`` output
    layout, embeds ``text_chunk`` under ``TEXT:`` and ends with an
    ``#### ENTITIES:`` header line.
    """
    return f"""
You are a research paper knowledge extraction assistant specialized in comprehensive triple extraction. Your task is to extract ALL relevant information from the provided research paper CHUNK.

1. Named Entities:
Extract all distinct entities mentioned in the text, including:
- Authors and researchers
- Institutions and organizations
- Research methodologies and techniques
- Algorithms and models
- Datasets and data sources
- Software tools and frameworks
- Technical concepts and terms
- Performance metrics and measurements
- Experimental results and findings
- Related work and citations
- Mathematical concepts and formulas
- System architectures and components
- Evaluation criteria and benchmarks
- Statistical measures (accuracy, precision, recall, F1-score, p-values, etc.)
- Research contributions and innovations
- Limitations and challenges
- Future work directions
- Any other domain-specific entities

2. Triples:
Extract ALL subject–predicate–object relationships between those entities.
BE EXHAUSTIVE - don't miss any relationships, even subtle ones.

---
## Special Handling Rules:
- **Coreference**:
    - Resolve pronouns or references to the correct entity.
    - Example:  
      Text: "The algorithm improves accuracy. This method outperforms baselines."  
      → Triples:
        - ("The algorithm", "improves", "accuracy")
        - ("The algorithm", "outperforms", "baselines")

- **Apposition**:
    - Create an "is" triple for appositive structures.
    - Example:  
      Text: "BERT, a transformer model, achieves state-of-the-art results."  
      → Triples:
        - ("BERT", "is", "transformer model")
        - ("BERT", "achieves", "state-of-the-art results")

- **Multiple Subjects**:
    - Split coordinated subjects into individual triples.
    - Example:  
      Text: "CNN and RNN models were evaluated on the dataset."  
      → Triples:
        - ("CNN", "was evaluated on", "dataset")
        - ("RNN", "was evaluated on", "dataset")

- **Implicit Relationships**:
    - Extract relationships that are implied but not directly stated.
    - Example:
      Text: "We use PyTorch to implement the neural network which processes image data."
      → Triples:
        - ("Authors", "use", "PyTorch")
        - ("PyTorch", "implements", "neural network")
        - ("Neural network", "processes", "image data")
        - ("Authors", "implement", "neural network")

- **Negations and Qualifiers**:
    - Preserve negations in the predicate.
    - Example:
      Text: "The model does not require labeled data."
      → Triple:
        - ("Model", "does not require", "labeled data")
        
- **Performance and Results**:
    - Extract precise performance relationships and metrics.
    - Example:
      Text: "Our method achieved 95% accuracy on MNIST dataset, outperforming the baseline by 5%."
      → Triples:
        - ("Our method", "achieved", "95% accuracy")
        - ("Our method", "evaluated on", "MNIST dataset")
        - ("Our method", "outperforms", "baseline")
        - ("Our method", "outperforms baseline by", "5%")

- **Research Relationships**:
    - Extract relationships about research process, contributions, and comparisons.
    - Example:
      Text: "Smith et al. (2020) proposed a similar approach, but our work extends it by adding attention mechanisms."
      → Triples:
        - ("Smith et al. (2020)", "proposed", "similar approach")
        - ("Our work", "extends", "Smith et al. approach")
        - ("Our work", "adds", "attention mechanisms")

---

## Output Format:
#### ENTITIES:
- Entity 1
- Entity 2
...

#### TRIPLES:
("Subject", "Predicate", "Object")
("Subject", "Predicate", "Object")
...

---

TEXT:
{text_chunk}

#### ENTITIES:
"""

# 5️⃣ Updated verification prompt for research papers
def build_verification_prompt(text_chunk: str, entities: str, triples: str) -> str:
    """Build the second-pass prompt that asks for triples missed earlier.

    ``text_chunk``, ``entities`` and ``triples`` are interpolated verbatim
    into the prompt, so the latter two are expected to be pre-formatted text.
    The prompt asks the model to output only new triples under an
    ``#### ADDITIONAL TRIPLES:`` header, or the sentence
    "No additional triples identified." when there are none.
    """
    return f"""
You are an expert research paper knowledge extraction verification system. Review this text and the already extracted entities and triples to identify ANY MISSED relationships.

TEXT:
{text_chunk}

ALREADY EXTRACTED ENTITIES:
{entities}

ALREADY EXTRACTED TRIPLES:
{triples}

YOUR TASK:
1. Carefully analyze if ANY possible relationship between entities was missed
2. Look specifically for:
   - Author-contribution relationships
   - Method-performance relationships
   - Dataset-evaluation relationships
   - Comparison relationships between methods/models
   - Temporal relationships (before/after, previous work)
   - Causal relationships (causes, enables, improves)
   - Implementation relationships (uses, implements, builds upon)
   - Experimental relationships (tested on, evaluated with, compared against)
   - Citation relationships (referenced by, builds on, extends)
   - Technical relationships (component of, part of, consists of)
   - Performance relationships (achieves, scores, measures)
   - Research contribution relationships (proposes, introduces, develops)

Output ONLY additional triples that were missed. Format as:
#### ADDITIONAL TRIPLES:
("Subject", "Predicate", "Object")
("Subject", "Predicate", "Object")
...

If no additional triples can be found, respond with "No additional triples identified."
"""

# 6️⃣ Claude call with retry logic (unchanged)
def query_claude(prompt, model="claude-3-haiku-20240307", retries=2, backoff=3):
    """Send ``prompt`` to the model and return the reply text, retrying on failure.

    Args:
        prompt: User message content.
        model: Model identifier passed to ``client.messages.create``.
        retries: Number of additional attempts after the first one. Negative
            values are treated as 0 so that at least one attempt is made.
        backoff: Base delay in seconds; the wait before retry ``k`` (1-based)
            is ``backoff * k``.

    Returns:
        The stripped text of the first content block of the response. If
        every attempt raises, a string of the form
        ``"Error (<ExceptionClass>): <message>"`` is returned instead of
        raising, so the result is always a ``str``. The exception class name
        is included because callers detect failures by looking for markers
        such as "error" or "RateLimit" in the returned text, and the bare
        exception message does not always contain them.
    """
    total_attempts = max(retries, 0) + 1
    failure = "Error (Unknown): no attempt was made"

    for attempt in range(total_attempts):
        try:
            response = client.messages.create(
                model=model,
                max_tokens=4000,
                temperature=0.0,
                messages=[{"role": "user", "content": prompt}]
            )
            return response.content[0].text.strip()
        except Exception as e:
            # Deliberately broad: this helper is best-effort and reports
            # failures through its return value rather than by raising.
            failure = f"Error ({type(e).__name__}): {e}"
            if attempt < total_attempts - 1:
                print(f"Retry {attempt+1}/{retries} after error: {str(e)}")
                time.sleep(backoff * (attempt + 1))

    return failure

# 7️⃣ Parse functions (unchanged)
def parse_claude_output(output):
    """Parse first-pass model output into entities and triples.

    Args:
        output: Raw model text, expected to contain an ``#### ENTITIES:``
            section with one ``- name`` item per line and triples written as
            ``("Subject", "Predicate", "Object")``.

    Returns:
        ``(entities, triples)`` where ``entities`` is a list of strings and
        ``triples`` is a list of ``(subject, predicate, object)`` tuples with
        surrounding whitespace removed. Both are empty when nothing matches.
    """
    entities = []

    entities_section = re.search(r'#### ENTITIES:(.*?)(?=####|$)', output, re.DOTALL)
    if entities_section:
        for line in entities_section.group(1).splitlines():
            # Remove only the leading list marker. Stripping '-' from both
            # ends would also eat trailing hyphens and leading minus signs
            # that belong to the entity itself (e.g. "-0.5 BLEU").
            name = re.sub(r'^\s*-\s*', '', line).strip()
            if name:
                entities.append(name)

    # Whitespace around the quotes and commas is optional so that variants
    # such as ("A","B","C") are not silently dropped.
    triple_pattern = r'\(\s*"([^"]+)"\s*,\s*"([^"]+)"\s*,\s*"([^"]+)"\s*\)'
    triples = [
        (subject.strip(), predicate.strip(), obj.strip())
        for subject, predicate, obj in re.findall(triple_pattern, output)
    ]

    return entities, triples

def parse_verification_output(output):
    """Extract triples from the verification-pass model output.

    Args:
        output: Raw model text containing zero or more triples written as
            ``("Subject", "Predicate", "Object")``.

    Returns:
        A list of ``(subject, predicate, object)`` tuples with surrounding
        whitespace removed; empty when no triple is found.
    """
    # Whitespace around the quotes and commas is optional so that variants
    # such as ("A","B","C") are not silently dropped.
    triple_pattern = r'\(\s*"([^"]+)"\s*,\s*"([^"]+)"\s*,\s*"([^"]+)"\s*\)'
    return [
        (subject.strip(), predicate.strip(), obj.strip())
        for subject, predicate, obj in re.findall(triple_pattern, output)
    ]

# 🔥 MAIN PROCESSING FUNCTION
def _looks_like_failed_call(raw_output, parsed_triples):
    """Guess whether ``raw_output`` is an error message rather than model output.

    query_claude() reports failures through its return value, so failures can
    only be recognised heuristically. Text that yielded triples or contains a
    ``####`` section header is treated as genuine output even if it mentions
    the word "error" (research text frequently does, e.g. "error rate").
    """
    if parsed_triples or "####" in raw_output:
        return False
    return "error" in raw_output.lower() or "RateLimit" in raw_output


def process_research_paper(pdf_path, output_dir="C:/Users/theya/Downloads/"):
    """
    Main function to process research paper PDF and extract triples.

    The PDF text is chunked, each chunk is sent through a primary extraction
    pass and a verification pass, and the de-duplicated triples are written
    to ``output_dir``.

    Args:
        pdf_path: Path of the PDF to process.
        output_dir: Directory for the result files. It is created if missing;
            a trailing path separator is optional.

    Files written to ``output_dir``:
        research_paper_triples.csv: one row per unique triple.
        extraction_errors.csv: only when at least one call looked failed.
        extraction_metrics.csv: per-chunk counts and ratios.
        neo4j_triples.txt: one ``("Subject", "Predicate", "Object")`` line
            per triple.

    Returns:
        ``(structured_results, coverage_metrics)``: the list of triple dicts
        and the list of per-chunk metric dicts.
    """
    import os  # function-scope import keeps this block self-contained

    def output_path(filename):
        # os.path.join works with or without a trailing separator.
        return os.path.join(output_dir, filename)

    # Create the target directory up front so a bad path fails before any
    # API calls are made rather than after all of them.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    print("📄 Extracting text from PDF...")
    full_text = extract_text_from_pdf(pdf_path)

    print("✂️ Chunking text...")
    df = chunk_text(full_text, chunk_size=1000, chunk_overlap=200)
    print(f"Created {len(df)} chunks")

    # Optional: TEST with first few chunks only
    # df = df.head(3)  # Uncomment for testing

    print("🧠 Starting triple extraction...")

    structured_results = []
    error_logs = []
    coverage_metrics = []

    for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing chunks"):
        doc_id = str(row["doc_ID"])
        chunk_id = str(row["chunk_ID"])
        text_chunk = str(row["chunk"])

        # First pass - extract entities and triples
        primary_output = query_claude(build_primary_prompt(text_chunk))
        entities, primary_triples = parse_claude_output(primary_output)

        if _looks_like_failed_call(primary_output, primary_triples):
            error_logs.append({
                "doc_id": doc_id,
                "chunk_id": chunk_id,
                "error_message": primary_output,
                "phase": "primary_extraction"
            })
            time.sleep(1.2)  # Rate limiting applies after a failure too
            continue

        # Format first-pass results for the verification prompt
        formatted_triples = "\n".join(
            f'("{s}", "{p}", "{o}")' for s, p, o in primary_triples
        )
        formatted_entities = "\n".join(f"- {entity}" for entity in entities)

        # Second pass - verification for missed relationships
        verification_output = query_claude(
            build_verification_prompt(text_chunk, formatted_entities, formatted_triples)
        )

        additional_triples = []
        if "no additional triples" not in verification_output.lower():
            additional_triples = parse_verification_output(verification_output)
            if _looks_like_failed_call(verification_output, additional_triples):
                # Keep the primary triples but record that verification failed.
                error_logs.append({
                    "doc_id": doc_id,
                    "chunk_id": chunk_id,
                    "error_message": verification_output,
                    "phase": "verification"
                })

        # Combine and deduplicate triples (case-insensitive, first one wins)
        primary_lookup = set(primary_triples)
        unique_triples = []
        seen = set()

        for s, p, o in primary_triples + additional_triples:
            triple_key = (s.lower(), p.lower(), o.lower())
            if triple_key in seen:
                continue
            seen.add(triple_key)
            unique_triples.append((s, p, o))
            structured_results.append({
                "doc_id": doc_id,
                "chunk_id": chunk_id,
                "Subject": s,
                "Predicate": p,
                "Object": o,
                "source": "primary" if (s, p, o) in primary_lookup else "verification"
            })

        coverage_metrics.append({
            "doc_id": doc_id,
            "chunk_id": chunk_id,
            "text_length": len(text_chunk),
            "entity_count": len(entities),
            "primary_triple_count": len(primary_triples),
            "additional_triple_count": len(additional_triples),
            "final_triple_count": len(unique_triples),
            "triple_entity_ratio": len(unique_triples) / max(len(entities), 1),
            "triples_per_100_chars": (len(unique_triples) * 100) / max(len(text_chunk), 1)
        })

        time.sleep(1.2)  # Rate limiting

    print("💾 Saving results...")

    # Explicit columns keep the CSV header even when nothing was extracted,
    # so the files can always be read back.
    output_df = pd.DataFrame(
        structured_results,
        columns=["doc_id", "chunk_id", "Subject", "Predicate", "Object", "source"],
    )
    output_df.to_csv(output_path("research_paper_triples.csv"), index=False)
    print(f"✅ Saved {len(structured_results)} triples to research_paper_triples.csv")

    if error_logs:
        error_df = pd.DataFrame(error_logs)
        error_df.to_csv(output_path("extraction_errors.csv"), index=False)
        print(f"⚠️ {len(error_logs)} errors saved to extraction_errors.csv")

    metrics_df = pd.DataFrame(
        coverage_metrics,
        columns=[
            "doc_id", "chunk_id", "text_length", "entity_count",
            "primary_triple_count", "additional_triple_count",
            "final_triple_count", "triple_entity_ratio",
            "triples_per_100_chars",
        ],
    )
    metrics_df.to_csv(output_path("extraction_metrics.csv"), index=False)
    print("✅ Metrics saved to extraction_metrics.csv")

    # One ("Subject", "Predicate", "Object") line per triple.
    with open(output_path("neo4j_triples.txt"), "w", encoding="utf-8") as f:
        f.writelines(
            f'("{item["Subject"]}", "{item["Predicate"]}", "{item["Object"]}")\n'
            for item in structured_results
        )
    print("✅ Neo4j-ready triples saved to neo4j_triples.txt")

    return structured_results, coverage_metrics

# 🚀 USAGE EXAMPLE
if __name__ == "__main__":
    # UPDATE THIS PATH TO YOUR PDF
    pdf_path = "Paper.pdf"

    # Process the paper
    triples, metrics = process_research_paper(pdf_path)

    print(f"\n📊 SUMMARY:")
    print(f"Total triples extracted: {len(triples)}")
    # metrics is empty when no chunk was processed successfully; guard the
    # division instead of crashing after the files were already written.
    if metrics:
        print(f"Average triples per chunk: {len(triples)/len(metrics):.2f}")
    else:
        print("Average triples per chunk: n/a (no chunks processed)")

    # Show sample triples
    print(f"\n🔍 SAMPLE TRIPLES:")
    for position, triple in enumerate(triples[:10], start=1):
        print(f"{position}. ({triple['Subject']}) --[{triple['Predicate']}]--> ({triple['Object']})")

    print(f"\n✅ DONE! Check the output files in your Downloads folder.")

📄 Extracting text from PDF...
✂️ Chunking text...
Created 49 chunks
🧠 Starting triple extraction...


Processing chunks: 100%|██████████| 49/49 [04:50<00:00,  5.94s/it]

💾 Saving results...
✅ Saved 1060 triples to research_paper_triples.csv
✅ Metrics saved to extraction_metrics.csv
✅ Neo4j-ready triples saved to neo4j_triples.txt

📊 SUMMARY:
Total triples extracted: 1060
Average triples per chunk: 21.63

🔍 SAMPLE TRIPLES:
1. (Ashish Vaswani) --[is affiliated with]--> (Google Brain)
2. (Noam Shazeer) --[is affiliated with]--> (Google Brain)
3. (Niki Parmar) --[is affiliated with]--> (Google Research)
4. (Jakob Uszkoreit) --[is affiliated with]--> (Google Research)
5. (Llion Jones) --[is affiliated with]--> (Google Research)
6. (Aidan N. Gomez) --[is affiliated with]--> (University of Toronto)
7. (Łukasz Kaiser) --[is affiliated with]--> (Google Brain)
8. (Illia Polosukhin) --[is affiliated with]--> (Google Research)
9. (Transformer) --[is a]--> (new simple network architecture)
10. (Transformer) --[is based solely on]--> (attention mechanisms)

✅ DONE! Check the output files in your Downloads folder.





In [2]:
pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
import pandas as pd
import PyPDF2

def verify_chunk_coverage(pdf_path="Paper.pdf", 
                         output_dir="C:/Users/theya/Downloads/",
                         chunk_size=1000,
                         chunk_overlap=200,
                         expected_chunks=49):
    """
    Re-run the chunking on a PDF and report how well the chunks cover it.

    Args:
        pdf_path: Path of the PDF to analyse.
        output_dir: Directory holding research_paper_triples.csv and
            extraction_metrics.csv from a previous extraction run. A trailing
            path separator is optional.
        chunk_size: Splitter ``chunk_size``; should match the extraction run.
        chunk_overlap: Splitter ``chunk_overlap``; also used for the coverage
            estimate.
        expected_chunks: Chunk count reported by the extraction run, compared
            against the count produced here.

    Returns:
        A dict with ``total_chunks``, ``coverage_ratio``,
        ``avg_chunk_length`` and ``chunks_match``. When no chunks are
        produced the numeric values are 0.
    """
    import os  # function-scope import keeps this block self-contained

    print("🔍 VERIFYING CHUNK COVERAGE...")

    # 1. Extract original PDF text
    page_sections = []
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        for page_num, page in enumerate(pdf_reader.pages):
            page_text = page.extract_text()
            page_sections.append(f"\n--- Page {page_num + 1} ---\n{page_text}")
        # Read the page count while the file is still open.
        page_count = len(pdf_reader.pages)
    full_text = "".join(page_sections)

    # 2. Recreate the chunking process
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ".", "!", "?", ";", ":", ",", " ", ""]
    )

    chunks = text_splitter.split_text(full_text)
    chunks_match = len(chunks) == expected_chunks

    # 3. Basic coverage verification
    print(f"📄 ORIGINAL PDF:")
    print(f"   Total pages: {page_count}")
    print(f"   Total characters: {len(full_text):,}")
    print(f"   Word count (approx): {len(full_text.split()):,}")

    print(f"\n✂️ CHUNKING RESULTS:")
    print(f"   Total chunks created: {len(chunks)}")
    print(f"   Your processing result: {expected_chunks} chunks")
    print(f"   ✅ Match: {'YES' if chunks_match else 'NO - MISMATCH!'}")

    if not chunks:
        # Nothing to analyse; the statistics below would divide by zero.
        print("   ⚠️ WARNING: No chunks were produced; nothing to analyze")
        return {
            "total_chunks": 0,
            "coverage_ratio": 0.0,
            "avg_chunk_length": 0.0,
            "chunks_match": chunks_match
        }

    # 4. Chunk size analysis
    chunk_lengths = [len(chunk) for chunk in chunks]
    total_chunk_chars = sum(chunk_lengths)
    avg_chunk_length = total_chunk_chars / len(chunk_lengths)

    print(f"\n📏 CHUNK SIZE ANALYSIS:")
    print(f"   Average chunk length: {avg_chunk_length:.0f} chars")
    print(f"   Minimum chunk length: {min(chunk_lengths)} chars")
    print(f"   Maximum chunk length: {max(chunk_lengths)} chars")
    print(f"   Total chars in chunks: {total_chunk_chars:,}")
    # This figure counts overlapping text more than once, so it can exceed 100%.
    print(f"   Coverage: {(total_chunk_chars/len(full_text)*100):.1f}%")

    # 5. Check for potential content loss
    # Estimate only: assumes every chunk after the first repeats exactly
    # chunk_overlap characters of its predecessor.
    reconstructed_length = total_chunk_chars - chunk_overlap * (len(chunks) - 1)
    coverage_ratio = reconstructed_length / len(full_text)

    print(f"\n🎯 CONTENT COVERAGE CHECK:")
    print(f"   Estimated coverage: {coverage_ratio*100:.1f}%")
    if coverage_ratio > 0.95:
        print("   ✅ EXCELLENT: Very high coverage, minimal content loss")
    elif coverage_ratio > 0.85:
        print("   ✅ GOOD: Good coverage, acceptable overlap")
    else:
        print("   ⚠️ WARNING: Potential content loss detected")

    # 6. Load your extraction results for quality check
    try:
        triples_df = pd.read_csv(os.path.join(output_dir, "research_paper_triples.csv"))
        metrics_df = pd.read_csv(os.path.join(output_dir, "extraction_metrics.csv"))
    except FileNotFoundError:
        print("   ❌ Could not find extraction results files")
    except pd.errors.EmptyDataError:
        print("   ❌ Extraction results files are empty")
    else:
        print(f"\n📊 EXTRACTION QUALITY:")
        print(f"   Chunks processed: {len(metrics_df)}")
        print(f"   Total triples: {len(triples_df)}")
        if len(metrics_df) > 0:
            print(f"   Avg triples/chunk: {len(triples_df)/len(metrics_df):.1f}")

            # Check for empty or low-quality chunks
            low_quality_chunks = metrics_df[metrics_df['final_triple_count'] < 5]
            print(f"   Low-quality chunks (<5 triples): {len(low_quality_chunks)}")

            if len(low_quality_chunks) > 0:
                print(f"   ⚠️ Check these chunks: {low_quality_chunks['chunk_id'].tolist()}")

            # Show chunk distribution
            print(f"\n📈 TRIPLE DISTRIBUTION:")
            print(f"   Min triples per chunk: {metrics_df['final_triple_count'].min()}")
            print(f"   Max triples per chunk: {metrics_df['final_triple_count'].max()}")
            print(f"   Median triples per chunk: {metrics_df['final_triple_count'].median()}")

    # 7. Sample chunk analysis (first, middle, last; de-duplicated so short
    # documents do not print the same chunk twice)
    print(f"\n🔍 SAMPLE CHUNKS:")
    for index in sorted({0, len(chunks) // 2, len(chunks) - 1}):
        print(f"   Chunk {index+1}: {len(chunks[index])} chars")
        print(f"   Preview: {chunks[index][:100]}...")
        print()

    return {
        "total_chunks": len(chunks),
        "coverage_ratio": coverage_ratio,
        "avg_chunk_length": avg_chunk_length,
        "chunks_match": chunks_match
    }

# 🚀 RUN THE VERIFICATION
if __name__ == "__main__":
    # Run the coverage check with its default arguments and print a verdict.
    results = verify_chunk_coverage()

    count_ok = results["chunks_match"]
    all_good = count_ok and results["coverage_ratio"] > 0.95

    print("=" * 60)
    if not all_good:
        print("⚠️ VERDICT: Consider adjusting chunk settings")
        if not count_ok:
            print("   - Chunk count mismatch detected")
        if results["coverage_ratio"] <= 0.95:
            print("   - Potential content loss")
    else:
        for verdict_line in (
            "✅ VERDICT: Your 49 chunks look GOOD!",
            "   - Chunk count matches",
            "   - High content coverage",
            "   - Reasonable chunk sizes",
        ):
            print(verdict_line)

🔍 VERIFYING CHUNK COVERAGE...
📄 ORIGINAL PDF:
   Total pages: 15
   Total characters: 39,718
   Word count (approx): 6,058

✂️ CHUNKING RESULTS:
   Total chunks created: 49
   Your processing result: 49 chunks
   ✅ Match: YES

📏 CHUNK SIZE ANALYSIS:
   Average chunk length: 960 chars
   Minimum chunk length: 836 chars
   Maximum chunk length: 999 chars
   Total chars in chunks: 47,048
   Coverage: 118.5%

🎯 CONTENT COVERAGE CHECK:
   Estimated coverage: 94.3%
   ✅ GOOD: Good coverage, acceptable overlap

📊 EXTRACTION QUALITY:
   Chunks processed: 49
   Total triples: 1060
   Avg triples/chunk: 21.6
   Low-quality chunks (<5 triples): 2
   ⚠️ Check these chunks: ['chunk_048', 'chunk_049']

📈 TRIPLE DISTRIBUTION:
   Min triples per chunk: 0
   Max triples per chunk: 87
   Median triples per chunk: 17.0

🔍 SAMPLE CHUNKS:
   Chunk 1: 996 chars
   Preview: --- Page 1 ---
Provided proper attribution is provided, Google hereby grants permission to
reproduce...

   Chunk 25: 954 chars
   Previ