In [3]:
import json
import os
from typing import List, Dict, Any
from openai import OpenAI
import logging
from dotenv import load_dotenv
from tqdm import tqdm

# Configure root logging: timestamped records at DEBUG level and above
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,
)
logger = logging.getLogger(__name__)

# Load environment variables before OPENAI_API_KEY is read from the environment
load_dotenv()

class EmbeddingGenerator:
    """Create OpenAI embeddings for relationship documents read from JSON.

    Each document is a dict that must carry every key in ``REQUIRED_FIELDS``;
    the text stored under ``original_text`` is what gets embedded.
    """

    # Keys a document must contain to be embedded; they are copied to the
    # output record in this order, followed by 'embedding'.
    REQUIRED_FIELDS = ('subject', 'predicate', 'object', 'source', 'original_text')

    def __init__(self, model: str = "text-embedding-ada-002"):
        """Initialize OpenAI client.

        Args:
            model: Name of the embedding model sent with every embeddings
                request.
        """
        self.model = model
        self.openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    def get_embedding(self, text: str) -> List[float]:
        """Get embedding for a text using OpenAI's API.

        Args:
            text: The text to embed.

        Returns:
            The embedding of the first item in the API response.

        Raises:
            Exception: Any error raised by the API call is logged and
                re-raised unchanged.
        """
        try:
            response = self.openai_client.embeddings.create(
                model=self.model,
                input=text
            )
            return response.data[0].embedding
        except Exception as e:
            logger.error(f"Error getting embedding: {str(e)}")
            raise

    def process_batch(self, batch: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Process a batch of documents and create their embeddings.

        Processing is best-effort: a document that lacks a required field is
        skipped with a warning, and a document whose processing raises is
        logged and skipped, so one bad record never aborts the batch.

        Args:
            batch: Documents to embed.

        Returns:
            One record per successfully embedded document, holding the
            required fields plus an 'embedding' entry.
        """
        processed_docs = []

        for doc in batch:
            try:
                # Log the document structure
                logger.debug(f"Processing document: {doc.keys()}")

                if not all(key in doc for key in self.REQUIRED_FIELDS):
                    logger.warning(f"Document missing required fields. Found keys: {doc.keys()}")
                    continue

                embedding = self.get_embedding(doc['original_text'])

                processed_doc = {key: doc[key] for key in self.REQUIRED_FIELDS}
                processed_doc['embedding'] = embedding

                processed_docs.append(processed_doc)
                logger.debug(f"Successfully processed document for: {doc['subject']}")

            except Exception as e:
                # Deliberate best-effort: log and move on to the next document
                logger.error(f"Error processing document: {str(e)}")
                continue

        return processed_docs

    def process_json_file(self, input_path: str, output_path: str, batch_size: int = 10) -> Dict[str, Any]:
        """Process JSON file and create embeddings.

        The input may be a dict (documents are read from its 'relationships'
        key, defaulting to an empty list) or a plain list of documents. All
        embedded documents are written to ``output_path`` as one JSON list
        after every batch has been processed.

        Args:
            input_path: Path of the JSON file to read.
            output_path: Path of the JSON file to write; its parent directory
                is created when the path has one.
            batch_size: Number of documents handed to ``process_batch`` at a
                time; must be at least 1.

        Returns:
            A dict with 'success', 'documents_processed', 'total_documents'
            and 'output_path'.

        Raises:
            ValueError: If ``batch_size`` is below 1 or the JSON root is
                neither a dict nor a list.
            Exception: Any other failure is logged with its traceback and
                re-raised.
        """
        try:
            # A zero step would crash range(); a negative one would silently
            # process nothing, so reject both up front.
            if batch_size < 1:
                raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

            # Load and validate input data
            logger.info(f"Loading data from {input_path}")
            with open(input_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # Debug print the structure, then handle both possible data structures
            logger.debug("Input data structure:")
            if isinstance(data, dict):
                logger.debug(f"Keys in data: {list(data.keys())}")
                logger.info("Processing data from dictionary structure")
                documents = data.get('relationships', [])
            elif isinstance(data, list):
                logger.debug(f"Data type: {type(data)}")
                logger.info("Processing data from list structure")
                documents = data
            else:
                logger.debug(f"Data type: {type(data)}")
                raise ValueError(f"Unexpected data structure: {type(data)}")

            logger.info(f"Found {len(documents)} documents to process")

            # Process in batches
            processed_docs = []
            for i in tqdm(range(0, len(documents), batch_size), desc="Processing documents"):
                batch = documents[i:i + batch_size]
                batch_results = self.process_batch(batch)
                processed_docs.extend(batch_results)
                logger.debug(f"Processed batch {i//batch_size + 1}, got {len(batch_results)} results")

            # Create output directory if needed; a bare file name has an empty
            # dirname, which os.makedirs would reject.
            output_dir = os.path.dirname(output_path)
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)

            # Save processed documents
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(processed_docs, f, ensure_ascii=False, indent=2)

            logger.info(f"Saved {len(processed_docs)} documents with embeddings to {output_path}")

            return {
                "success": True,
                "documents_processed": len(processed_docs),
                "total_documents": len(documents),
                "output_path": output_path
            }

        except Exception as e:
            # logger.exception records the message at ERROR level together
            # with the active traceback.
            logger.exception(f"Error processing JSON file: {str(e)}")
            raise

def main():
    """Main function to generate embeddings.

    Logs the structure of the input JSON file, then runs
    ``EmbeddingGenerator.process_json_file`` on it.

    Returns:
        The result dict returned by ``process_json_file``.

    Raises:
        Exception: Any failure is logged with its traceback and re-raised.
    """
    try:
        generator = EmbeddingGenerator()

        input_path = r"D:\Dropbox\29. Ampelos\24_PED\PED_PITT_Aaron\backend\PDFs_Share\pdf_json_output\scd_entities_relationships_total_deduplicated.json"
        output_path = r"D:\Dropbox\29. Ampelos\24_PED\PED_PITT_Aaron\backend\PDFs_Share\pdf_json_output\scd_entities_relationships_total_deduplicated_with_embeddings.json"

        # Read the file once for inspection; the handle is closed before logging
        with open(input_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Print input file structure
        logger.info("Input file structure:")
        if isinstance(data, dict):
            logger.info(f"Dictionary with keys: {list(data.keys())}")
            if 'relationships' in data:
                logger.info(f"Number of relationships: {len(data['relationships'])}")
                if data['relationships']:
                    logger.info(f"Sample relationship keys: {list(data['relationships'][0].keys())}")
        else:
            logger.info(f"Data type: {type(data)}, Length: {len(data) if isinstance(data, list) else 'N/A'}")

        result = generator.process_json_file(input_path, output_path)
        logger.info(f"Processing complete: {result}")
        return result

    except Exception as e:
        # logger.exception records the message at ERROR level together with
        # the active traceback.
        logger.exception(f"Error in main: {str(e)}")
        raise

# Run the embedding pipeline only when this code is executed as the main module
if __name__ == "__main__":
    main()

2025-01-03 09:55:19,224 - INFO - Input file structure:
2025-01-03 09:55:19,225 - INFO - Dictionary with keys: ['entities', 'relationships']
2025-01-03 09:55:19,226 - INFO - Number of relationships: 2134
2025-01-03 09:55:19,227 - INFO - Sample relationship keys: ['subject', 'predicate', 'object', 'source', 'original_text']
2025-01-03 09:55:19,228 - INFO - Loading data from D:\Dropbox\29. Ampelos\24_PED\PED_PITT_Aaron\backend\PDFs_Share\pdf_json_output\scd_entities_relationships_total_deduplicated.json
2025-01-03 09:55:19,300 - INFO - Processing data from dictionary structure
2025-01-03 09:55:19,301 - INFO - Found 2134 documents to process
Processing documents:   0%|          | 0/214 [00:00<?, ?it/s]2025-01-03 09:55:20,086 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-01-03 09:55:20,537 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-01-03 09:55:20,745 - INFO - HTTP Request: POST https://api.openai.com/v1/e

### 2. 检查刚刚是否所有的 input 文件都做了 embeddings

In [2]:
import json
from pathlib import Path
import logging

# Configure root logging: timestamped records at INFO level and above
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

def check_embedding_progress(input_path: str, output_path: str):
    """
    Check the progress of embedding generation by comparing input and output files.

    Prints a summary to stdout: totals, completion percentage, up to five
    sample relationships that lack an embedding, and the dimension and type
    of the first embedding found.

    Args:
        input_path: JSON file whose root is a dict with a 'relationships' list.
        output_path: JSON file written by embedding generation; its root is
            either a list of relationships or a dict with a 'relationships'
            list.

    Returns:
        A dict with 'total_input', 'total_processed',
        'total_with_embeddings', 'completion_percentage' and 'remaining',
        or None when the input structure is invalid, the output file does
        not exist, or the output structure is invalid.

    Raises:
        Exception: Any error (unreadable file, malformed JSON, ...) is logged
            and re-raised.
    """
    try:
        # Load input file
        logger.info(f"Loading input file: {input_path}")
        with open(input_path, 'r', encoding='utf-8') as f:
            input_data = json.load(f)

        # Get input relationships
        if isinstance(input_data, dict) and 'relationships' in input_data:
            input_relationships = input_data['relationships']
        else:
            logger.error("Invalid input file structure")
            return

        total_input = len(input_relationships)
        logger.info(f"Total relationships in input file: {total_input}")

        # Check if output file exists (kept in a separate local so the
        # caller-supplied parameter is not rebound)
        output_file = Path(output_path)
        if not output_file.exists():
            logger.warning("Output file does not exist yet")
            return

        # Load output file
        logger.info(f"Loading output file: {output_file}")
        with open(output_file, 'r', encoding='utf-8') as f:
            output_data = json.load(f)

        # Get output relationships
        if isinstance(output_data, list):
            processed_relationships = output_data
        elif isinstance(output_data, dict) and 'relationships' in output_data:
            processed_relationships = output_data['relationships']
        else:
            logger.error("Invalid output file structure")
            return

        total_processed = len(processed_relationships)

        # Split the processed relationships by presence of an embedding in one pass
        relationships_with_embeddings = []
        relationships_without_embeddings = []
        for rel in processed_relationships:
            if 'embedding' in rel:
                relationships_with_embeddings.append(rel)
            else:
                relationships_without_embeddings.append(rel)
        total_with_embeddings = len(relationships_with_embeddings)

        # Calculate statistics; an empty input has nothing left to embed, so
        # report it as complete rather than dividing by zero.
        if total_input:
            completion_percentage = (total_with_embeddings / total_input) * 100
        else:
            completion_percentage = 100.0
        remaining = total_input - total_with_embeddings

        # Print detailed analysis
        print("\nEmbedding Generation Progress Analysis:")
        print("======================================")
        print(f"Total relationships in input file: {total_input}")
        print(f"Total relationships processed: {total_processed}")
        print(f"Relationships with embeddings: {total_with_embeddings}")
        print(f"Completion percentage: {completion_percentage:.2f}%")
        print(f"Remaining relationships: {remaining}")

        # Check for any relationships without embeddings
        if relationships_without_embeddings:
            missing_embeddings = len(relationships_without_embeddings)
            print(f"\nWarning: {missing_embeddings} processed relationships are missing embeddings")

            print("\nSample relationships missing embeddings:")
            # Show up to 5 examples; .get avoids a KeyError on records that
            # are missing other fields as well.
            for rel in relationships_without_embeddings[:5]:
                print(f"- Subject: {rel.get('subject', 'N/A')}, Predicate: {rel.get('predicate', 'N/A')}")

        # Verify embedding structure
        if relationships_with_embeddings:
            sample_embedding = relationships_with_embeddings[0]['embedding']
            print(f"\nEmbedding verification:")
            print(f"Embedding dimension: {len(sample_embedding)}")
            print(f"Embedding type: {type(sample_embedding)}")

        return {
            "total_input": total_input,
            "total_processed": total_processed,
            "total_with_embeddings": total_with_embeddings,
            "completion_percentage": completion_percentage,
            "remaining": remaining
        }

    except Exception as e:
        logger.error(f"Error checking embedding progress: {str(e)}")
        raise

if __name__ == "__main__":
    # Locate the input and output JSON files inside the shared output directory
    base_dir = Path(r"D:\Dropbox\29. Ampelos\24_PED\PED_PITT_Aaron\backend\PDFs_Share\pdf_json_output")
    input_file = "scd_entities_relationships_total.json"
    output_file = "scd_entities_relationships_total_with_embeddings.json"
    input_path = base_dir / input_file
    output_path = base_dir / output_file

    # Compare the two files and print the progress summary
    result = check_embedding_progress(str(input_path), str(output_path))

    # Suggest follow-up actions only when a summary came back and work remains
    if result and result["remaining"] > 0:
        print("\nRecommended next steps:")
        print("\n".join((
            "1. Resume embedding generation for remaining relationships",
            "2. Check for any errors in the log files",
            "3. Consider processing in smaller batches if encountering timeout issues",
        )))

2025-01-03 00:14:06,290 - INFO - Loading input file: D:\Dropbox\29. Ampelos\24_PED\PED_PITT_Aaron\backend\PDFs_Share\pdf_json_output\scd_entities_relationships_total.json
2025-01-03 00:14:06,553 - INFO - Total relationships in input file: 10776
2025-01-03 00:14:06,554 - INFO - Loading output file: D:\Dropbox\29. Ampelos\24_PED\PED_PITT_Aaron\backend\PDFs_Share\pdf_json_output\scd_entities_relationships_total_with_embeddings.json



Embedding Generation Progress Analysis:
Total relationships in input file: 10776
Total relationships processed: 10764
Relationships with embeddings: 10764
Completion percentage: 99.89%
Remaining relationships: 12

Embedding verification:
Embedding dimension: 1536
Embedding type: <class 'list'>

Recommended next steps:
1. Resume embedding generation for remaining relationships
2. Check for any errors in the log files
3. Consider processing in smaller batches if encountering timeout issues
