In [1]:
# !pip install -r ../requirements.txt
import os

import numpy as np
import pandas as pd
import torch
from neo4j import GraphDatabase
from sentence_transformers import SentenceTransformer
from transformers import pipeline


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Initialize models
ner_model = pipeline('ner', model='dslim/bert-base-NER', aggregation_strategy="simple")
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')  # Efficient embedding model

# Neo4j connection settings.
# Each value is read from the environment so real credentials never have to be
# committed with the notebook; the fallbacks are the local development defaults
# this notebook has always used. Export NEO4J_URI / NEO4J_USER / NEO4J_PASSWORD
# before starting the kernel to point at a different database.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "12345678")

def get_named_entities(user_input):
    """Run the NER pipeline on the input text and collect the 'word' of every hit."""
    words = []
    for hit in ner_model(user_input):
        words.append(hit['word'])
    return words

def get_query_embedding(query):
    """Encode the query with the embedding model and return the vector as a list."""
    vector = embedding_model.encode(query)
    # The encoder output is converted to a plain list before being handed to Neo4j.
    return vector.tolist()


def fetch_documents(entities, query_embedding, top_k=10, candidate_limit=50,
                    doc_weight=0.7, entity_weight=0.3, verbose=True):
    """Fetch relevant documents from Neo4j, ranked by a weighted blend of similarities.

    Two vector-index lookups are run with the same query embedding: one against
    'document_embedding_index' and one against 'entity_embedding_index'. Entity
    hits are mapped to documents through (doc:Document)-[:HAS_ENTITY]->(entity),
    and every hit's similarity is added to that document's entity score, so a
    document reached through several hits can accumulate an entity score above 1.

    Args:
        entities: Named entities extracted from the query. Kept for interface
            compatibility; the lookups below do not use it.
        query_embedding: Query vector as a list of floats.
        top_k: Maximum number of results to return.
        candidate_limit: Number of nearest neighbours requested from each index.
        doc_weight: Weight of the document similarity in the combined score.
        entity_weight: Weight of the accumulated entity similarity.
        verbose: When true, print the per-document score breakdown.

    Returns:
        A list of (text, combined_score) tuples sorted by descending combined
        score, at most ``top_k`` entries long.
    """
    # Step 1: similarity between the query and Document embeddings.
    doc_query = """
    CALL db.index.vector.queryNodes('document_embedding_index', $candidate_limit, $query_embedding)
    YIELD node AS doc, score AS doc_similarity
    RETURN doc.id AS doc_id, doc.text AS text, doc_similarity
    """

    # Step 2: similarity between the query and NamedEntity embeddings.
    # doc.text is returned here as well so that a document found only through
    # its entities still carries its text instead of an empty string.
    entity_query = """
    CALL db.index.vector.queryNodes('entity_embedding_index', $candidate_limit, $query_embedding)
    YIELD node AS entity, score AS entity_similarity
    MATCH (entity)<-[:HAS_ENTITY]-(doc:Document)
    RETURN doc.id AS doc_id, doc.text AS text, entity_similarity
    """

    doc_scores = {}
    with GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD)) as driver:
        with driver.session() as session:
            doc_results = session.run(
                doc_query,
                query_embedding=query_embedding,
                candidate_limit=candidate_limit,
            )
            for record in doc_results:
                doc_scores[record["doc_id"]] = {
                    # A missing text property becomes "" so callers can format it safely.
                    "text": record["text"] or "",
                    "doc_similarity": record["doc_similarity"],
                    "entity_similarity": 0,
                }

            entity_results = session.run(
                entity_query,
                query_embedding=query_embedding,
                candidate_limit=candidate_limit,
            )
            for record in entity_results:
                # Documents absent from the document-index hits start with a
                # document similarity of 0 and are scored on entities alone.
                entry = doc_scores.setdefault(record["doc_id"], {
                    "text": record["text"] or "",
                    "doc_similarity": 0,
                    "entity_similarity": 0,
                })
                entry["entity_similarity"] += record["entity_similarity"]

    # Step 3: blend both similarities into one score per document.
    combined_scores = []
    for doc_id, scores in doc_scores.items():
        if verbose:
            print(f'Document ID: {doc_id}')
            print(f'Document Similarity: {scores["doc_similarity"]}')
            print(f'Entity Similarity: {scores["entity_similarity"]}')
        combined_score = (doc_weight * scores["doc_similarity"]) + (entity_weight * scores["entity_similarity"])
        combined_scores.append((scores["text"], combined_score))

    # Highest combined score first; keep only the requested number of results.
    combined_scores.sort(key=lambda item: item[1], reverse=True)
    return combined_scores[:top_k]


Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use mps:0


In [16]:

def semantic_search(user_query):
    """Run the end-to-end search: extract entities, embed the query, fetch ranked documents."""
    entities_found = get_named_entities(user_query)
    print(f"Named Entities: {entities_found}")

    # Coerce every component to a built-in float before it is sent as a query parameter.
    query_vector = list(map(float, get_query_embedding(user_query)))
    return fetch_documents(entities_found, query_vector)

# Example query: run one search and list the ranked hits, numbered from 1.
query = "show images of Queen Victoria Road in 1942"
search_results = semantic_search(query)
# Each result is a (text, combined score) pair; the text is padded to at least 10 characters.
for rank, (doc_text, score) in enumerate(search_results, start=1):
    print(f"{rank}. {doc_text:10} (Score: {score:.4f})")


Named Entities: ['Queen Victoria Road']
Document ID: 7d0313d1-eba0-39e8-9956-24ac4dabaaff
Document Similarity: 0.8216197490692139
Entity Similarity: 0.8427379131317139
Document ID: a7bb9917-95ff-3f55-a640-4c5afcec25f2
Document Similarity: 0.8111860752105713
Entity Similarity: 1.6854758262634277
Document ID: 517460db-6a44-3837-8220-dc395ec7cbe7
Document Similarity: 0.7912728786468506
Entity Similarity: 0.7304196357727051
Document ID: c377ea96-b3f9-3ff5-ad46-38709f2553f6
Document Similarity: 0.7899045944213867
Entity Similarity: 0
Document ID: 8c93eb18-15a2-38b6-935b-645d9402dad0
Document Similarity: 0.782139778137207
Entity Similarity: 1.4231476783752441
Document ID: 8e634403-9597-3427-aa69-85bbc0cc9411
Document Similarity: 0.7818660736083984
Entity Similarity: 0.718273401260376
Document ID: 4715f4d7-623d-350f-bfd3-84591ae08136
Document Similarity: 0.7784600257873535
Entity Similarity: 1.4231476783752441
Document ID: 56c8cc10-fb58-33aa-bf6f-fb9dc6a78824
Document Similarity: 0.7784354686