In [1]:
from elasticsearch import Elasticsearch, helpers

In [3]:
import os
from getpass import getpass

from elasticsearch import NotFoundError

# Credentials: take the password from the ES_PASSWORD environment variable and,
# if it is not set, prompt for it interactively. A real password must never be
# embedded as a fallback default -- it leaks through version control and every
# shared copy of the notebook.
# NOTE: the password that used to be hard-coded here should be rotated.
# In non-interactive runs (no stdin available) set ES_PASSWORD instead.
es_password = os.getenv('ES_PASSWORD') or getpass("Password for the 'elastic' user: ")

# Client for the local single-node cluster, authenticated as the built-in
# `elastic` user.
es = Elasticsearch(
    hosts=["http://localhost:9200"],
    basic_auth=('elastic', es_password)
)

# Prints the client repr only; no request is sent to the cluster here.
print(f"the es status is {es}")

# Define the role with full access
role_name = "aaronlu_full_access"
try:
    # First check if role exists
    existing_role = es.security.get_role(name=role_name)
    print(f"Role already exists: {existing_role}")
except NotFoundError:
    # Only a 404 means the role is missing. Authentication or network failures
    # are reported below instead of triggering creation of an all-privileges role.
    role_body = {
        "cluster": ["all"],
        "indices": [
            {
                "names": ["*"],
                "privileges": ["all"]
            }
        ],
        "applications": [
            {
                "application": "*",
                "privileges": ["*"],
                "resources": ["*"]
            }
        ]
    }

    try:
        response = es.security.put_role(name=role_name, body=role_body)
        print(f"Role creation response: {response}")
    except Exception as e:
        print(f"Error creating role: {e}")
except Exception as e:
    print(f"Error checking role: {e}")


the es status is <Elasticsearch(['http://localhost:9200'])>
Role already exists: {'aaronlu_full_access': {'cluster': ['all'], 'indices': [{'names': ['*'], 'privileges': ['all'], 'allow_restricted_indices': False}], 'applications': [{'application': '*', 'privileges': ['*'], 'resources': ['*']}], 'run_as': [], 'metadata': {}, 'transient_metadata': {'enabled': True}}}


In [18]:
# Drop the 'default' index if it is present; any client error is reported
# rather than raised so the cell always completes.
try:
    index_present = es.indices.exists(index='default')
    if not index_present:
        print("Index 'default' does not exist")
    else:
        es.indices.delete(index='default')
        print("Successfully deleted index 'default'")
except Exception as err:
    print(f"Error deleting index: {str(err)}")

INFO:elastic_transport.transport:HEAD http://localhost:9200/default [status:200 duration:0.005s]
INFO:elastic_transport.transport:DELETE http://localhost:9200/default [status:200 duration:0.228s]


Successfully deleted index 'default'


In [13]:
import logging

def debug_document_structure(es, index_name, patent_index):
    """
    Debug helper to check document structure in Elasticsearch.

    Runs a ``term`` query on the ``patent_index`` field and returns the first
    matching hit.

    Args:
        es: Client object exposing ``search(index=..., body=...)``.
        index_name: Name of the index to search.
        patent_index: Exact value to match against the ``patent_index`` field.

    Returns:
        The first raw hit (including ``_source``) when the reported total is
        greater than zero; ``None`` when nothing matched or the search raised.
    """
    # Resolve the dependencies locally so this cell works on a fresh kernel:
    # the notebook-level `logger` and `json` names are only created by cells
    # that appear further down the file.
    import json

    log = logging.getLogger(__name__)

    query = {
        "query": {
            "term": {
                "patent_index": patent_index
            }
        }
    }

    try:
        response = es.search(index=index_name, body=query)
        if response['hits']['total']['value'] > 0:
            doc = response['hits']['hits'][0]
            # Serialising the whole hit is only worth doing when the debug
            # output will actually be emitted.
            if log.isEnabledFor(logging.DEBUG):
                log.debug(f"Document structure for {patent_index}:")
                log.debug(json.dumps(doc, indent=2))
            return doc
    except Exception as e:
        # Best-effort helper: report the failure and fall through to None.
        log.error(f"Error checking document structure: {str(e)}")
    return None

In [14]:
# Check a specific document: look it up in the "default" index by its exact
# `patent_index` value. As the last expression of the cell, the returned hit
# (or None when nothing is found) is what the notebook displays.
debug_document_structure(es, "default", "US20140000892A1.pdf")

INFO:elastic_transport.transport:POST http://localhost:9200/default/_search [status:200 duration:0.011s]


{'_index': 'default',
 '_id': 'NUiQ8JIB8CzkWQZxYp9k',
 '_score': 0.6931471,
 '_ignored': ['chunks.text.keyword'],
 '_source': {'patent_index': 'US20140000892A1.pdf',
  'chunks': [{'chunk_index': 1,
    'text': 'US 201400.00892A1 \n (19) United States \n (12) Patent Application Publication (10) Pub. No.: US 2014/0000892 A1 \n de La Roij (43) Pub. Date: Jan. 2, 2014 \n (54) USE OF AN ADDITIVE COMPOSTION FOR (52) U.S. Cl. \n CEMENTING BORE WELLS CPC ...................................... E21B33/13 (2013.01) \n USPC ............................ 166/293; 106/801: 106/638 (71) Applicant: MEGA-TECH HOLDING B.V. \n (72) Inventor: Robin de La Roij. Zwijndrecht (NL) (57) ABSTRACT \n (73) Assignee: MEGA-tech Holding B.V. \n (21) Appl. No.: 13/654,920 The present invention relates to the use of a composition for reinforcing cement, which comprises one or more com \n (22) Filed: Oct. 18, 2012 pounds selected from group: a) sodium chloride, potassium \n chloride, magnesium chloride, calcium chloride

In [4]:
# Import required libraries
from elasticsearch import Elasticsearch
import json
import pandas as pd
from pprint import pprint
import logging

# Configure logging
# INFO level on the root logger also surfaces the per-request lines emitted by
# `elastic_transport`, as seen in this notebook's cell outputs.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by the helper functions defined in this notebook.
logger = logging.getLogger(__name__)


# Fetch the field mapping of an index
def get_index_mapping(es, index_name):
    """Return the mapping response for `index_name`, or None if the call fails.

    Any exception raised by the client is logged through the notebook-level
    `logger` and swallowed.
    """
    try:
        return es.indices.get_mapping(index=index_name)
    except Exception as exc:
        logger.error(f"Error getting mapping: {exc}")
    return None

# Fetch index-level statistics
def get_index_stats(es, index_name):
    """Return the stats response for `index_name`, or None if the call fails.

    Any exception raised by the client is logged through the notebook-level
    `logger` and swallowed.
    """
    try:
        return es.indices.stats(index=index_name)
    except Exception as exc:
        logger.error(f"Error getting stats: {exc}")
    return None

# Pull a handful of documents from an index
def get_sample_documents(es, index_name, size=5):
    """Return up to `size` raw hits from a match-all search, or None on failure.

    Any exception raised while searching or unpacking the response is logged
    through the notebook-level `logger` and swallowed.
    """
    match_all_body = {
        "query": {"match_all": {}},
        "size": size
    }
    try:
        search_result = es.search(index=index_name, body=match_all_body)
        return search_result['hits']['hits']
    except Exception as exc:
        logger.error(f"Error getting sample documents: {exc}")
    return None

# Analyze document structure
def analyze_document_structure(doc):
    """Summarise the chunk layout of one document `_source` dict.

    Args:
        doc: Mapping with an optional "chunks" entry holding a list of chunk
            dicts. A missing or null "chunks" entry is treated as no chunks.

    Returns:
        dict with:
            fields: list of the document's top-level keys.
            chunks_count: number of chunks.
            has_embeddings: True if any chunk has an "embedding" key.
            sample_chunk: the first chunk, or None when there are none.
            empty_chunks: chunks whose "text" is missing, null, not a string,
                or whitespace-only.
            valid_chunks: chunks whose "text" is a non-blank string.
    """
    # `or []` also covers an explicit `"chunks": None`, which would otherwise
    # break both `len()` and the iteration below.
    chunks = doc.get("chunks") or []

    def has_text(chunk):
        # A stored null "text" comes back as None; guard before `.strip()`.
        text = chunk.get("text")
        return isinstance(text, str) and bool(text.strip())

    valid_chunks = sum(1 for chunk in chunks if has_text(chunk))

    return {
        "fields": list(doc.keys()),
        "chunks_count": len(chunks),
        "has_embeddings": any("embedding" in chunk for chunk in chunks),
        "sample_chunk": chunks[0] if chunks else None,
        "empty_chunks": len(chunks) - valid_chunks,
        "valid_chunks": valid_chunks
    }

# Top-level inspection routine
def inspect_elasticsearch_data(es, index_name):
    """Collect mapping, document count and per-sample analysis for an index.

    Each stage is optional: a key is added to the returned dict only when the
    corresponding helper returned a truthy value.

    Returns:
        dict that may contain "mapping", "doc_count" and "samples"; each
        "samples" entry holds the document's "patent_index" and its
        `analyze_document_structure` result under "analysis".
    """
    report = {}

    # Stage 1: mapping
    logger.info("Getting index mapping...")
    index_mapping = get_index_mapping(es, index_name)
    if index_mapping:
        report["mapping"] = index_mapping

    # Stage 2: primary-shard document count taken from the stats response
    logger.info("Getting index stats...")
    index_stats = get_index_stats(es, index_name)
    if index_stats:
        report["doc_count"] = index_stats["_all"]["primaries"]["docs"]["count"]

    # Stage 3: structural analysis of a few sample documents
    logger.info("Getting sample documents...")
    sample_hits = get_sample_documents(es, index_name)
    if sample_hits:
        def summarise(hit):
            source = hit["_source"]
            structure = analyze_document_structure(source)
            return {
                "patent_index": source.get("patent_index"),
                "analysis": structure
            }

        report["samples"] = [summarise(hit) for hit in sample_hits]

    return report

# Human-readable dump of an inspection report
def print_inspection_results(results):
    """Print the sections of an inspection report that are present.

    `results` is the dict built by `inspect_elasticsearch_data`; the "mapping",
    "doc_count" and "samples" sections are each printed only if their key exists.
    """
    print("\n=== Elasticsearch Data Inspection Results ===\n")

    if "mapping" in results:
        print("Index Mapping:")
        pprint(results["mapping"])
        print("\n")

    if "doc_count" in results:
        print(f"Total Documents: {results['doc_count']}\n")

    # Nothing more to show without a samples section.
    if "samples" not in results:
        return

    print("Sample Document Analysis:")
    for entry in results["samples"]:
        print(f"\nDocument: {entry['patent_index']}")
        info = entry["analysis"]
        print(f"Fields: {info['fields']}")
        print(f"Total Chunks: {info['chunks_count']}")
        print(f"Valid Chunks: {info['valid_chunks']}")
        print(f"Empty Chunks: {info['empty_chunks']}")
        print(f"Has Embeddings: {info['has_embeddings']}")
        first_chunk = info["sample_chunk"]
        if first_chunk:
            print("\nSample Chunk Structure:")
            pprint(first_chunk)

# Tabulate the per-sample analysis
def create_analysis_dataframe(results):
    """Build a DataFrame with one row per entry of `results["samples"]`.

    Columns: patent_index, total_chunks, valid_chunks, empty_chunks,
    has_embeddings. A report without a "samples" key yields an empty frame.
    """
    def to_row(entry):
        stats = entry["analysis"]
        return {
            "patent_index": entry["patent_index"],
            "total_chunks": stats["chunks_count"],
            "valid_chunks": stats["valid_chunks"],
            "empty_chunks": stats["empty_chunks"],
            "has_embeddings": stats["has_embeddings"]
        }

    rows = [to_row(entry) for entry in results.get("samples", [])]
    return pd.DataFrame(rows)

# Running the inspection
# Executes when the cell runs as the top-level program (in this notebook
# `__name__` is "__main__", as the "INFO:__main__" log lines show) and the
# `es` client object created in the connection cell is truthy. No new
# connection is opened here.
if __name__ == "__main__" and es:
    # Gather mapping, count and sample analysis for the index
    results = inspect_elasticsearch_data(es, "default")  # Replace "default" with your index name

    # Human-readable report
    print_inspection_results(results)

    # Tabular summary of the sampled documents
    df = create_analysis_dataframe(results)
    print("\nAnalysis Summary:")
    print(df)

INFO:__main__:Getting index mapping...
INFO:elastic_transport.transport:GET http://localhost:9200/default/_mapping [status:200 duration:0.004s]
INFO:__main__:Getting index stats...
INFO:elastic_transport.transport:GET http://localhost:9200/default/_stats [status:200 duration:0.049s]
INFO:__main__:Getting sample documents...
INFO:elastic_transport.transport:POST http://localhost:9200/default/_search [status:200 duration:0.100s]



=== Elasticsearch Data Inspection Results ===

Index Mapping:
ObjectApiResponse({'default': {'mappings': {'properties': {'chunks': {'type': 'nested', 'properties': {'chunk_index': {'type': 'integer'}, 'embedding': {'type': 'dense_vector', 'dims': 1536, 'index': True, 'similarity': 'cosine'}, 'is_abstract': {'type': 'boolean'}, 'is_claims': {'type': 'boolean'}, 'is_fto': {'type': 'boolean'}, 'is_patentability': {'type': 'boolean'}, 'text': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}}}, 'patent_index': {'type': 'keyword'}}}}})


Total Documents: 3069

Sample Document Analysis:

Document: ColoradoCancerCoalitionResourceDirectory2014.pdf
Fields: ['patent_index', 'chunks']
Total Chunks: 64
Valid Chunks: 64
Empty Chunks: 0
Has Embeddings: True

Sample Chunk Structure:
{'chunk_index': 0,
 'embedding': [0.00181927892845124,
               0.003492350922897458,
               -0.0076891533099114895,
               -0.05053441971540451,
               -0.01

In [12]:
def inspect_document(es, index_name, patent_id):
    """Print chunk statistics for the document whose `patent_index` equals `patent_id`.

    Runs a ``term`` query and, for the first hit, prints the total, valid and
    empty chunk counts followed by the first chunk that has non-blank text.
    Prints a "not found" message when the search returns no hits.

    Args:
        es: Client object exposing ``search(index=..., body=...)``.
        index_name: Name of the index to search.
        patent_id: Exact value to match against the ``patent_index`` field.

    Returns:
        None; all results are printed.
    """
    query = {
        "query": {
            "term": {
                "patent_index": patent_id
            }
        }
    }

    response = es.search(index=index_name, body=query)
    hits = response['hits']['hits']
    if not hits:
        print(f"Document {patent_id} not found")
        return

    # `or []` also covers a stored `"chunks": null`.
    chunks = hits[0]['_source'].get('chunks') or []

    def has_text(chunk):
        # A stored null "text" comes back as None; guard before `.strip()`.
        text = chunk.get('text')
        return isinstance(text, str) and bool(text.strip())

    chunks_with_text = [chunk for chunk in chunks if has_text(chunk)]

    print(f"\nDocument: {patent_id}")
    print(f"Total chunks: {len(chunks)}")
    print(f"Valid chunks: {len(chunks_with_text)}")
    print(f"Empty chunks: {len(chunks) - len(chunks_with_text)}")

    # Print the header only when there is a chunk to show under it.
    if chunks_with_text:
        print("\nFirst valid chunk:")
        pprint(chunks_with_text[0])

# Inspect the problematic document: print chunk counts and the first chunk with
# non-blank text for the document stored under this `patent_index` in 'default'.
inspect_document(es, 'default', 'US11111181.pdf')

INFO:elastic_transport.transport:POST http://localhost:9200/default/_search [status:200 duration:0.084s]



Document: US11111181.pdf
Total chunks: 18
Valid chunks: 18
Empty chunks: 0

First valid chunk:
{'chunk_index': 0,
 'embedding': [-0.025140712037682533,
               0.02507183328270912,
               0.004211930092424154,
               2.7981956009170972e-05,
               0.016062503680586815,
               0.025402450934052467,
               -0.03647813946008682,
               -0.018624790012836456,
               -0.029011692851781845,
               -0.02649073302745819,
               0.00881646852940321,
               0.0364505872130394,
               -0.006078541744500399,
               0.02155902050435543,
               0.0007111721788533032,
               0.012129532173275948,
               0.026738695800304413,
               0.018555911257863045,
               0.01007006037980318,
               -0.011461409740149975,
               0.005468965973705053,
               -0.002644940512254834,
               -0.019795726984739304,
               -0.003282068297

In [11]:
print("Checking index contents...")
# Only the total is needed here:
#   - "size": 0 avoids transferring full documents (each carries large
#     embedding vectors) just to read a count;
#   - "track_total_hits": True requests an exact total; by default the reported
#     value stops being exact once 10,000 matches are reached.
response = es.search(
    index="default",
    body={
        "query": {"match_all": {}},
        "size": 0,
        "track_total_hits": True
    }
)
print(f"Total documents in index: {response['hits']['total']['value']}")

Checking index contents...


INFO:elastic_transport.transport:POST http://localhost:9200/default/_search [status:200 duration:0.376s]


Total documents in index: 105


In [14]:
def print_nested_keys(data, prefix=''):
    """Print every key of a nested dict/list structure, indented by depth.

    Dict keys are printed at the current `prefix`; dict and list values are
    descended into with two extra spaces of indentation. For a non-empty list
    only its first element is examined (again two spaces deeper). Any other
    value prints nothing.
    """
    if isinstance(data, dict):
        for name, child in data.items():
            print(f"{prefix}{name}")
            if not isinstance(child, (dict, list)):
                continue
            print_nested_keys(child, prefix + '  ')
        return

    if not isinstance(data, list) or len(data) == 0:
        return

    # Lists are represented by the structure of their first element.
    print_nested_keys(data[0], prefix + '  ')

# Show which keys a stored document actually contains
print("Checking document keys...")
match_everything = {"query": {"match_all": {}}}
response = es.search(index="default", body=match_everything)

# Only the first returned hit is examined; nothing is printed if there are none.
for hit in response['hits']['hits'][:1]:
    print("\nDocument keys structure:")
    print_nested_keys(hit['_source'])

DEBUG:urllib3.connectionpool:http://localhost:9200 "POST /default/_search HTTP/11" 200 None


INFO:elastic_transport.transport:POST http://localhost:9200/default/_search [status:200 duration:0.024s]


Checking document keys...

Document keys structure:
patent_index
chunks
    chunk_index
    text
    embedding
    is_claims
    is_abstract
    is_patentability
    is_fto


In [11]:
# Import required libraries
from elasticsearch import Elasticsearch
import logging
from operator import itemgetter
import json

# Set up logging
# DEBUG level also surfaces urllib3 connection-pool messages (see this cell's output).
# NOTE(review): `logging.basicConfig` is documented to do nothing when the root
# logger already has handlers, so on a kernel where an earlier cell already
# configured logging this level may not take effect -- confirm.
logging.basicConfig(level=logging.DEBUG)
# Module-level logger used by the test helpers below.
logger = logging.getLogger(__name__)

# Test data to insert
# One synthetic patent with two chunks. Every chunk sets all three flags
# (is_abstract, is_patentability, is_claims) to True and carries a constant
# 1536-element vector as its embedding.
test_document = {
    "patent_index": "TEST123",
    "chunks": [
        {
            "chunk_index": 0,
            "text": "This is a test patent about machine learning technology.",
            "is_abstract": True,
            "is_patentability": True,
            "is_claims": True,
            "embedding": [0.1] * 1536  # Assuming your embedding size is 1536 (must equal the mapping's `dims`)
        },
        {
            "chunk_index": 1,
            "text": "The invention relates to artificial intelligence systems.",
            "is_abstract": True,
            "is_patentability": True,
            "is_claims": True,
            "embedding": [0.2] * 1536  # Different magnitude, same direction as chunk 0's vector
        }
    ]
}

# Build a throwaway index holding the test document
def setup_test_index():
    """(Re)create "test_patent_index" and load `test_document` into it.

    Uses the notebook-level `es` client and `test_document`. An existing index
    of the same name is deleted first. The document is indexed under id 1 and
    the index is refreshed before returning.

    Returns:
        The name of the index that was created.
    """
    index_name = "test_patent_index"

    # Start from a clean slate
    index_already_there = es.indices.exists(index=index_name)
    if index_already_there:
        es.indices.delete(index=index_name)

    # Per-chunk fields; chunks are stored as nested objects
    embedding_field = {
        "type": "dense_vector",
        "dims": 1536,
        "index": True,
        "similarity": "cosine"
    }
    chunk_properties = {
        "chunk_index": {"type": "integer"},
        "text": {"type": "text"},
        "is_abstract": {"type": "boolean"},
        "is_patentability": {"type": "boolean"},
        "is_claims": {"type": "boolean"},
        "embedding": embedding_field
    }
    mapping = {
        "mappings": {
            "properties": {
                "patent_index": {"type": "keyword"},
                "chunks": {
                    "type": "nested",
                    "properties": chunk_properties
                }
            }
        }
    }

    es.indices.create(index=index_name, body=mapping)
    logger.info(f"Created index: {index_name}")

    # Load the fixture and refresh the index before returning
    es.index(index=index_name, id=1, body=test_document)
    es.indices.refresh(index=index_name)
    logger.info("Inserted test document")

    return index_name

# Test BM25 search
def test_bm25_search(index_name, query_text):
    logger.info(f"Testing BM25 search with query: {query_text}")
    
    query = {
        "query": {
            "nested": {
                "path": "chunks",
                "query": {
                    "bool": {
                        "must": [
                            {"match": {"chunks.text": query_text}},
                            {"term": {"chunks.is_abstract": True}},
                            {"term": {"chunks.is_patentability": True}},
                            {"term": {"chunks.is_claims": True}}
                        ]
                    }
                },
                "inner_hits": {
                    "size": 5
                }
            }
        },
        "_source": ["patent_index"]
    }
    
    response = es.search(index=index_name, body=query)
    logger.info(f"BM25 search returned {len(response['hits']['hits'])} hits")
    
    results = []
    for hit in response['hits']['hits']:
        patent_index = hit['_source']['patent_index']
        for inner_hit in hit['inner_hits']['chunks']['hits']['hits']:
            chunk = inner_hit['_source']
            results.append({
                "patent_id": str(patent_index),
                "chunk_index": chunk['chunk_index'],
                "text": chunk['text'],
                "score": inner_hit['_score']
            })
    
    return results

# Test semantic search
def test_semantic_search(index_name, query_embedding):
    logger.info("Testing semantic search")
    
    query = {
        "query": {
            "nested": {
                "path": "chunks",
                "query": {
                    "script_score": {
                        "query": {
                            "bool": {
                                "must": [
                                    {"term": {"chunks.is_abstract": True}},
                                    {"term": {"chunks.is_patentability": True}},
                                    {"term": {"chunks.is_claims": True}}
                                ]
                            }
                        },
                        "script": {
                            "source": "cosineSimilarity(params.query_vector, 'chunks.embedding') + 1.0",
                            "params": {"query_vector": query_embedding}
                        }
                    }
                },
                "inner_hits": {
                    "size": 5
                }
            }
        },
        "_source": ["patent_index"]
    }
    
    response = es.search(index=index_name, body=query)
    logger.info(f"Semantic search returned {len(response['hits']['hits'])} hits")
    
    results = []
    for hit in response['hits']['hits']:
        patent_index = hit['_source']['patent_index']
        for inner_hit in hit['inner_hits']['chunks']['hits']['hits']:
            chunk = inner_hit['_source']
            results.append({
                "patent_id": str(patent_index),
                "chunk_index": chunk['chunk_index'],
                "text": chunk['text'],
                "score": hit['_score']
            })
    
    return results

# Run tests
if __name__ == "__main__":
    # Fresh index containing only the synthetic document
    test_index = setup_test_index()

    # Inputs for the two search flavours
    test_query = "machine learning"
    test_embedding = [0.1] * 1536  # Simple test embedding

    # Keyword search
    print("\nTesting BM25 Search:")
    bm25_results = test_bm25_search(test_index, test_query)
    bm25_dump = json.dumps(bm25_results, indent=2)
    print(f"BM25 Results: {bm25_dump}")

    # Vector search
    print("\nTesting Semantic Search:")
    semantic_results = test_semantic_search(test_index, test_embedding)
    semantic_dump = json.dumps(semantic_results, indent=2)
    print(f"Semantic Results: {semantic_dump}")

DEBUG:urllib3.connectionpool:http://localhost:9200 "HEAD /test_patent_index HTTP/11" 404 0
INFO:elastic_transport.transport:HEAD http://localhost:9200/test_patent_index [status:404 duration:0.005s]


DEBUG:urllib3.connectionpool:http://localhost:9200 "PUT /test_patent_index HTTP/11" 200 0
INFO:elastic_transport.transport:PUT http://localhost:9200/test_patent_index [status:200 duration:0.711s]
INFO:__main__:Created index: test_patent_index
DEBUG:urllib3.connectionpool:http://localhost:9200 "PUT /test_patent_index/_doc/1 HTTP/11" 201 0
INFO:elastic_transport.transport:PUT http://localhost:9200/test_patent_index/_doc/1 [status:201 duration:0.034s]
DEBUG:urllib3.connectionpool:http://localhost:9200 "POST /test_patent_index/_refresh HTTP/11" 200 0
INFO:elastic_transport.transport:POST http://localhost:9200/test_patent_index/_refresh [status:200 duration:0.103s]
INFO:__main__:Inserted test document
INFO:__main__:Testing BM25 search with query: machine learning
DEBUG:urllib3.connectionpool:http://localhost:9200 "POST /test_patent_index/_search HTTP/11" 200 None
INFO:elastic_transport.transport:POST http://localhost:9200/test_patent_index/_search [status:200 duration:0.012s]
INFO:__main__:


Testing BM25 Search:
BM25 Results: [
  {
    "patent_id": "TEST123",
    "chunk_index": 0,
    "text": "This is a test patent about machine learning technology.",
    "score": 1.8658177
  }
]

Testing Semantic Search:
Semantic Results: [
  {
    "patent_id": "TEST123",
    "chunk_index": 0,
    "text": "This is a test patent about machine learning technology.",
    "score": 1.999991
  },
  {
    "patent_id": "TEST123",
    "chunk_index": 1,
    "text": "The invention relates to artificial intelligence systems.",
    "score": 1.999991
  }
]
