In [None]:
# !pip install -r ../requirements.txt
from transformers import pipeline
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from sentence_transformers import SentenceTransformer
from neo4j import GraphDatabase
import numpy as np
import pandas as pd
import torch


In [None]:
# Initialize models
ner_model = pipeline('ner', model='dslim/bert-base-NER', aggregation_strategy="simple")
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')  # Efficient embedding model

# Neo4j connection (modify with your credentials)
NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "12345678"

def get_named_entities(user_input):
    """Extract named entities and return them as a list."""
    entities = ner_model(user_input)
    return [entity['word'] for entity in entities]

def get_query_embedding(query):
    """Generate query embedding."""
    return embedding_model.encode(query).tolist()  # Convert NumPy array to list for Neo4j


def fetch_documents(entities, query_embedding):
    """Fetch relevant documents using Neo4j, considering both named entities and embeddings."""
    with GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD)) as driver:
        with driver.session() as session:
            # Step 1: Find documents mentioning extracted entities
            entity_query = """
            MATCH (d:Document)-[:HAS_ENTITY]->(e:NamedEntity)
            WHERE any(entity in $entities WHERE e.text CONTAINS entity)
            RETURN d.id AS doc_id, d.text AS text, d.embedding AS embedding, COUNT(e) AS entity_score
            ORDER BY entity_score DESC
            LIMIT 50
            """
            results = session.run(entity_query, entities=entities)

            # Step 2: Use Neo4j GDS to compute cosine similarity
            vector_query = """
            CALL db.index.vector.queryNodes('document_embedding_index', 10, $query_embedding)
            YIELD node, score
            RETURN node.text AS text, score
            """
            vector_results = session.run(vector_query, query_embedding=query_embedding)

            # Step 3: Merge Entity + Vector Scores
            docs = []
            entity_scores = {record["text"]: record["entity_score"] for record in results}

            for record in vector_results:
                doc_text = record["text"]
                similarity_score = record["score"]
                entity_score = entity_scores.get(doc_text, 0)

                # Weighted Score: 70% cosine similarity, 30% entity match count
                final_score = 0.7 * similarity_score + 0.3 * (entity_score / max(1, len(entities)))
                docs.append((doc_text, final_score))

            # Sort documents by final score
            docs.sort(key=lambda x: x[1], reverse=True)
            return docs[:10]  # Return top 10 results


Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use mps:0


In [19]:
def semantic_search(user_query):
    """Perform full semantic search combining NER and embeddings."""
    named_entities = get_named_entities(user_query)
    print(f"Named Entities: {named_entities}")
    query_embedding = get_query_embedding(user_query)
    query_embedding = [float(x) for x in query_embedding]  # Ensure all values are floats

    print(f"Query Embedding: {len(query_embedding)}")  
    print(query_embedding)  
    results = fetch_documents(named_entities, query_embedding)

    return results

# Example Query
query = "Show images of Queen Victoria Road, High Wycombe"
search_results = semantic_search(query)
print(search_results)
# Display results
for i, (doc_text, score) in enumerate(search_results):
    print(f"{i+1}. {doc_text} (Score: {score:.4f})")

Named Entities: ['Queen Victoria Road', 'High Wycombe']
Query Embedding: 384
[0.047259919345378876, -0.0050254100933671, 0.009446999058127403, 0.011550040915608406, 0.0034488989040255547, 0.04406385123729706, -0.06571777909994125, -0.03887023776769638, -0.06889384239912033, 0.014063204638659954, -0.008429056964814663, -0.040087293833494186, 0.036964550614356995, 0.013171747326850891, -0.014457369223237038, 0.061966486275196075, 0.009221840649843216, 0.008497585542500019, 0.021502012386918068, 0.02512035146355629, 0.049040861427783966, -0.017483001574873924, 0.015357885509729385, -0.03550645709037781, -0.10474926978349686, 0.03017442114651203, -0.014687899500131607, 0.10720934718847275, 0.013164877891540527, -0.0346587672829628, -0.020656226202845573, -0.008129528723657131, 0.03356825187802315, 0.03567840903997421, 0.06149589270353317, 0.02192855067551136, 0.04903529956936836, -0.02640468254685402, 0.038084451109170914, -0.02150961011648178, -0.00877613015472889, -0.02761322818696499, 0