In [2]:
from elasticsearch import Elasticsearch, helpers

In [10]:
import getpass
import os

from elasticsearch import NotFoundError

# Credentials must never live in the notebook: read the password from the
# ES_PASSWORD environment variable and, when it is unset or empty, ask for it
# interactively (the prompt does not echo and is not stored in the notebook).
es_password = os.getenv('ES_PASSWORD') or getpass.getpass("Password for Elasticsearch user 'elastic': ")

es = Elasticsearch(
    hosts=["http://localhost:9200"],
    basic_auth=('elastic', es_password)
)

# NOTE: this prints only the client's repr; it does not contact the cluster.
print(f"the es status is {es}")

# Define the role with full access
role_name = "aaronlu_full_access"
role_body = {
    "cluster": ["all"],
    "indices": [
        {
            "names": ["*"],
            "privileges": ["all"]
        }
    ],
    "applications": [
        {
            "application": "*",
            "privileges": ["*"],
            "resources": ["*"]
        }
    ]
}

try:
    # First check if role exists
    existing_role = es.security.get_role(name=role_name)
except NotFoundError:
    # Only a 404 means "role is missing". Authentication or connection
    # failures must not be mistaken for a missing role and trigger creation.
    try:
        response = es.security.put_role(name=role_name, body=role_body)
        print(f"Role creation response: {response}")
    except Exception as e:
        print(f"Error creating role: {e}")
except Exception as e:
    # Any other failure is reported as a lookup problem, not hidden behind
    # an attempt to create the role.
    print(f"Error checking role: {e}")
else:
    print(f"Role already exists: {existing_role}")


DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): localhost:9200
DEBUG:urllib3.connectionpool:http://localhost:9200 "GET /_security/role/aaronlu_full_access HTTP/11" 200 0


INFO:elastic_transport.transport:GET http://localhost:9200/_security/role/aaronlu_full_access [status:200 duration:0.029s]


the es status is <Elasticsearch(['http://localhost:9200'])>
Role already exists: {'aaronlu_full_access': {'cluster': ['all'], 'indices': [{'names': ['*'], 'privileges': ['all'], 'allow_restricted_indices': False}], 'applications': [{'application': '*', 'privileges': ['*'], 'resources': ['*']}], 'run_as': [], 'metadata': {}, 'transient_metadata': {'enabled': True}}}


In [12]:
print("Checking index contents...")

# Match every document in the "default" index and report the hit total
# that Elasticsearch returns alongside the results.
match_all_body = {"query": {"match_all": {}}}
response = es.search(index="default", body=match_all_body)
total_documents = response['hits']['total']['value']
print(f"Total documents in index: {total_documents}")

DEBUG:urllib3.connectionpool:http://localhost:9200 "POST /default/_search HTTP/11" 200 None
INFO:elastic_transport.transport:POST http://localhost:9200/default/_search [status:200 duration:0.135s]


Checking index contents...
Total documents in index: 2


In [14]:
def print_nested_keys(data, prefix=''):
    """Print the key outline of a nested dict/list structure.

    Each dictionary key is printed on its own line, preceded by ``prefix``.
    Values that are dicts or lists are descended into with two extra spaces
    of indentation. A list is represented by its first element only, and
    stepping into that element adds a further two spaces. Empty lists and
    non-container values print nothing.

    Args:
        data: The dict, list, or leaf value to outline.
        prefix: Text printed before every key at the current level.
    """
    deeper = prefix + '  '

    if isinstance(data, dict):
        for name, child in data.items():
            print(f"{prefix}{name}")
            if not isinstance(child, (dict, list)):
                continue
            print_nested_keys(child, deeper)
        return

    if not isinstance(data, list) or len(data) == 0:
        return

    # Only the first item is sampled to describe the list's structure
    print_nested_keys(data[0], deeper)

# Check document structure and keys
print("Checking document keys...")
response = es.search(index="default", body={"query": {"match_all": {}}})

# Outline the fields of the first returned document only (nothing is
# printed when the search comes back empty).
first_hits = response['hits']['hits'][:1]
for hit in first_hits:
    print("\nDocument keys structure:")
    print_nested_keys(hit['_source'])

DEBUG:urllib3.connectionpool:http://localhost:9200 "POST /default/_search HTTP/11" 200 None


INFO:elastic_transport.transport:POST http://localhost:9200/default/_search [status:200 duration:0.024s]


Checking document keys...

Document keys structure:
patent_index
chunks
    chunk_index
    text
    embedding
    is_claims
    is_abstract
    is_patentability
    is_fto


In [11]:
# Import required libraries
from elasticsearch import Elasticsearch
import logging
from operator import itemgetter
import json

# Set up logging
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

# Test data to insert: one patent with two chunks. Each chunk carries a
# constant-valued embedding (text, fill value) and has all three flags set.
_chunk_specs = [
    ("This is a test patent about machine learning technology.", 0.1),
    ("The invention relates to artificial intelligence systems.", 0.2),
]

test_document = {
    "patent_index": "TEST123",
    "chunks": [
        {
            "chunk_index": position,
            "text": chunk_text,
            "is_abstract": True,
            "is_patentability": True,
            "is_claims": True,
            # Assuming your embedding size is 1536
            "embedding": [fill_value] * 1536,
        }
        for position, (chunk_text, fill_value) in enumerate(_chunk_specs)
    ],
}

# Function to create test index and insert test data
def setup_test_index():
    """Recreate the ``test_patent_index`` index and store ``test_document`` in it.

    Any existing index with that name is deleted first. The index is then
    created with a mapping whose ``chunks`` field is ``nested`` and whose
    chunk ``embedding`` is a 1536-dimension cosine ``dense_vector``. The
    module-level ``test_document`` is indexed under id 1 and the index is
    refreshed before returning.

    Returns:
        The name of the index that was created.
    """
    index_name = "test_patent_index"

    # Start from a clean slate: drop whatever a previous run left behind
    index_present = es.indices.exists(index=index_name)
    if index_present:
        es.indices.delete(index=index_name)

    # Mapping, assembled inside-out: vector field -> chunk fields -> document
    embedding_field = {
        "type": "dense_vector",
        "dims": 1536,
        "index": True,
        "similarity": "cosine",
    }
    chunk_fields = {
        "chunk_index": {"type": "integer"},
        "text": {"type": "text"},
        "is_abstract": {"type": "boolean"},
        "is_patentability": {"type": "boolean"},
        "is_claims": {"type": "boolean"},
        "embedding": embedding_field,
    }
    index_body = {
        "mappings": {
            "properties": {
                "patent_index": {"type": "keyword"},
                "chunks": {"type": "nested", "properties": chunk_fields},
            }
        }
    }

    es.indices.create(index=index_name, body=index_body)
    logger.info(f"Created index: {index_name}")

    # Store the sample document, then refresh so it is searchable right away
    es.index(index=index_name, id=1, body=test_document)
    es.indices.refresh(index=index_name)
    logger.info("Inserted test document")

    return index_name

# Test BM25 search
def test_bm25_search(index_name, query_text):
    logger.info(f"Testing BM25 search with query: {query_text}")
    
    query = {
        "query": {
            "nested": {
                "path": "chunks",
                "query": {
                    "bool": {
                        "must": [
                            {"match": {"chunks.text": query_text}},
                            {"term": {"chunks.is_abstract": True}},
                            {"term": {"chunks.is_patentability": True}},
                            {"term": {"chunks.is_claims": True}}
                        ]
                    }
                },
                "inner_hits": {
                    "size": 5
                }
            }
        },
        "_source": ["patent_index"]
    }
    
    response = es.search(index=index_name, body=query)
    logger.info(f"BM25 search returned {len(response['hits']['hits'])} hits")
    
    results = []
    for hit in response['hits']['hits']:
        patent_index = hit['_source']['patent_index']
        for inner_hit in hit['inner_hits']['chunks']['hits']['hits']:
            chunk = inner_hit['_source']
            results.append({
                "patent_id": str(patent_index),
                "chunk_index": chunk['chunk_index'],
                "text": chunk['text'],
                "score": inner_hit['_score']
            })
    
    return results

# Test semantic search
def test_semantic_search(index_name, query_embedding):
    logger.info("Testing semantic search")
    
    query = {
        "query": {
            "nested": {
                "path": "chunks",
                "query": {
                    "script_score": {
                        "query": {
                            "bool": {
                                "must": [
                                    {"term": {"chunks.is_abstract": True}},
                                    {"term": {"chunks.is_patentability": True}},
                                    {"term": {"chunks.is_claims": True}}
                                ]
                            }
                        },
                        "script": {
                            "source": "cosineSimilarity(params.query_vector, 'chunks.embedding') + 1.0",
                            "params": {"query_vector": query_embedding}
                        }
                    }
                },
                "inner_hits": {
                    "size": 5
                }
            }
        },
        "_source": ["patent_index"]
    }
    
    response = es.search(index=index_name, body=query)
    logger.info(f"Semantic search returned {len(response['hits']['hits'])} hits")
    
    results = []
    for hit in response['hits']['hits']:
        patent_index = hit['_source']['patent_index']
        for inner_hit in hit['inner_hits']['chunks']['hits']['hits']:
            chunk = inner_hit['_source']
            results.append({
                "patent_id": str(patent_index),
                "chunk_index": chunk['chunk_index'],
                "text": chunk['text'],
                "score": hit['_score']
            })
    
    return results

# Run tests
if __name__ == "__main__":
    # Build a fresh index that holds the sample document
    test_index = setup_test_index()

    # Inputs for the two search flavours
    test_query = "machine learning"
    test_embedding = [0.1] * 1536  # Simple constant-valued test embedding

    # Keyword search over the chunk text
    print("\nTesting BM25 Search:")
    bm25_results = test_bm25_search(test_index, test_query)
    bm25_report = json.dumps(bm25_results, indent=2)
    print(f"BM25 Results: {bm25_report}")

    # Vector search over the chunk embeddings
    print("\nTesting Semantic Search:")
    semantic_results = test_semantic_search(test_index, test_embedding)
    semantic_report = json.dumps(semantic_results, indent=2)
    print(f"Semantic Results: {semantic_report}")

DEBUG:urllib3.connectionpool:http://localhost:9200 "HEAD /test_patent_index HTTP/11" 404 0
INFO:elastic_transport.transport:HEAD http://localhost:9200/test_patent_index [status:404 duration:0.005s]


DEBUG:urllib3.connectionpool:http://localhost:9200 "PUT /test_patent_index HTTP/11" 200 0
INFO:elastic_transport.transport:PUT http://localhost:9200/test_patent_index [status:200 duration:0.711s]
INFO:__main__:Created index: test_patent_index
DEBUG:urllib3.connectionpool:http://localhost:9200 "PUT /test_patent_index/_doc/1 HTTP/11" 201 0
INFO:elastic_transport.transport:PUT http://localhost:9200/test_patent_index/_doc/1 [status:201 duration:0.034s]
DEBUG:urllib3.connectionpool:http://localhost:9200 "POST /test_patent_index/_refresh HTTP/11" 200 0
INFO:elastic_transport.transport:POST http://localhost:9200/test_patent_index/_refresh [status:200 duration:0.103s]
INFO:__main__:Inserted test document
INFO:__main__:Testing BM25 search with query: machine learning
DEBUG:urllib3.connectionpool:http://localhost:9200 "POST /test_patent_index/_search HTTP/11" 200 None
INFO:elastic_transport.transport:POST http://localhost:9200/test_patent_index/_search [status:200 duration:0.012s]
INFO:__main__:


Testing BM25 Search:
BM25 Results: [
  {
    "patent_id": "TEST123",
    "chunk_index": 0,
    "text": "This is a test patent about machine learning technology.",
    "score": 1.8658177
  }
]

Testing Semantic Search:
Semantic Results: [
  {
    "patent_id": "TEST123",
    "chunk_index": 0,
    "text": "This is a test patent about machine learning technology.",
    "score": 1.999991
  },
  {
    "patent_id": "TEST123",
    "chunk_index": 1,
    "text": "The invention relates to artificial intelligence systems.",
    "score": 1.999991
  }
]
