In [None]:
import requests

url = "https://www.ncbi.nlm.nih.gov/research/pubtator3-api/publications/pmc_export/biocxml?pmcids=PMC9128899,PMC2927683"

response = requests.get(url)

if response.status_code == 200:
    with open("./dataset/paper.txt", "w", encoding="utf-8") as file:
        file.write(response.text)
    print("Data successfully saved to output.txt")
else:
    print(f"Failed to fetch data. Status code: {response.status_code}")

In [1]:
import os
import json
from minirag.utils import xml_to_json
from neo4j import GraphDatabase
from sentence_transformers import SentenceTransformer
from sklearn.neighbors import NearestNeighbors
import Levenshtein
import networkx as nx
from collections import defaultdict
from functools import lru_cache
import concurrent.futures
from transformers import pipeline
from concurrent.futures import ThreadPoolExecutor
from transformers import BioGptTokenizer, BioGptForCausalLM, AutoModelForCausalLM, AutoTokenizer

# Constants
WORKING_DIR = "./input"
BATCH_SIZE_NODES = 500
BATCH_SIZE_EDGES = 100

# Neo4j connection credentials (consider using environment variables for security)
NEO4J_URI = "neo4j+s://5117636b.databases.neo4j.io"
NEO4J_USERNAME = "neo4j"
NEO4J_PASSWORD = "VcSOoWioxodkP0VXM-JVkYbn6SN39bCsfkJbwMeoXSc"
def load_graph_data(json_path):
    """Load JSON data from a file."""
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data.get('nodes', []), data.get('edges', [])

def normalize_name(name):
    """Normalize entity names for consistency."""
    return name.lower().replace('-', ' ').replace('_', ' ')

def compute_embeddings(entities):
    model = SentenceTransformer('FremyCompany/BioLORD-2023') #FremyCompany/BioLORD-2023  pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb
    texts = []
    for e in entities:
        # Use only the normalized name, omitting the description
        name = normalize_name(e['id'])
        desc = e.get('description', '')
        combined_text = f'{name} {desc}' if desc else name
        texts.append(name)
    embeddings = model.encode(texts, convert_to_numpy=True)
    return embeddings

def build_knn_graph(entities, embeddings, k=5, similarity_threshold=0.9):
    """
    Construct a k-NN graph using cosine similarity.
    Uses NearestNeighbors to avoid computing the full similarity matrix.
    """
    G = nx.Graph()
    # Add nodes with their attributes
    for entity in entities:
        G.add_node(entity['id'], entity_type=entity['entity_type'])
    
    # Use cosine distance (note: similarity = 1 - distance)
    nbrs = NearestNeighbors(n_neighbors=k+1, metric='cosine').fit(embeddings)
    distances, indices = nbrs.kneighbors(embeddings)

    for i, entity in enumerate(entities):
        for j, dist in zip(indices[i][1:], distances[i][1:]):  # Skip self (first element)
            score = 1 - dist
            if score > similarity_threshold:
                neighbor_id = entities[j]['id']
                G.add_edge(entity['id'], neighbor_id, weight=score)
    
    return G

def find_merge_candidates(G):
    """Identify merge candidate groups based on connected components."""
    components = list(nx.connected_components(G))
    return [list(component) for component in components if len(component) > 1]

def are_entities_similar_lev(entity1, entity2):
    """Dynamically adjust the Levenshtein threshold based on entity length."""
    len_avg = (len(normalize_name(entity1)) + len(normalize_name(entity2))) / 2
    threshold = 0.9 if len_avg < 10 else 0.8
    sim = Levenshtein.ratio(normalize_name(entity1), normalize_name(entity2))
    return sim >= threshold


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def log_decision(component, decision, explanation, log_file="llm_decisions_copy4.log"):
    """Log LLM decisions for review and feedback."""
    with open(log_file, "a") as f:
        f.write(f"\n\nComponent: {component}\nDecision: {decision}\nExplanation: {explanation}\n{'-'*50}\n")
        
model_name = "Horizon6957/DeepSeek-bio-vlarge-qna-cot"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
# Set pad token if missing
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Initialize generator pipeline
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)  # Explicitly use GPU 0

@lru_cache(maxsize=128)
def cached_llm_decision(component_tuple, entities_dict):
    """Cached version of LLM decision to avoid redundant calls."""
    component = list(component_tuple)
    return llm_decision(component, entities_dict)


def llm_decision(component, entities_dict):

    prompt = f"""
You are an expert biologist specializing in entity resolution within biological networks. Your task is to determine whether the following entities represent the same biological concept and should therefore be merged. 
Consider the following criteria:
1. No dublicate should be extract like entity full name and its abbreviation that should be merge, no seperate extraction for them.
2. Do NOT group sub-types under broader categories. Instead, create separate nodes and establish relationships between them.
3. Spelling variations or formatting differences like plural or singular forms should be merge
Entities to evaluate:
"""
    for e in component:
        prompt += f"\nEntity: {e}\nDescription: {entities_dict[e].get('description', 'No description')}\nType: {entities_dict[e].get('entity_type', 'Unknown type')}\n"
    prompt += "\nShould these entities be merged? Please answer YES or NO and explain why."
    
    response = generator(prompt, max_new_tokens=512, truncation=True, do_sample=True)

    
    generated_txt = response[0]["generated_text"].strip().lower()

    decision = "yes" if "yes" in generated_txt else "no"

    return decision == "yes"

Device set to use cuda:0


In [3]:

# def parallel_llm_decisions(components, entities_dict, max_workers=5):
#     """Parallelize LLM calls to speed up processing."""
#     decisions = {}
#     with ThreadPoolExecutor(max_workers=max_workers) as executor:
#         future_to_component = {
#             executor.submit(cached_llm_decision, tuple(str(e) for e in comp), entities_dict): comp
#             for comp in components
#         }
#         for future in concurrent.futures.as_completed(future_to_component):
#             component = future_to_component[future]
#             try:
#                 decisions[tuple(str(e) for e in component)] = future.result()
#             except Exception as exc:
#                 print(f'Component {component} generated an exception: {exc}')
#     return decisions

def resolve_entities(entities, merge_candidates):
    entities_dict = {e['id']: e for e in entities}
    merged_entities = {}
    remaining_entities = set(e['id'] for e in entities)
    print("Using LLM for merging candidates...")
    
    # Use LLM to decide on each merge candidate group
    for group in merge_candidates:
        # Only process groups whose entities haven’t been merged yet
        if not any(entity in merged_entities.get(rep, []) for rep in merged_entities for entity in group):
            if llm_decision(group, entities_dict):
                print(f"LLM merge: {group}")
                main_entity = group[1]
                merged_entities[main_entity] = group
                remaining_entities -= set(group)
    
    print("Comparing merged entity representatives with one another using Levenshtein...")
    compared_pairs = set()
    
    # Compare merged entity representatives with one another using Levenshtein
    representatives = list(merged_entities.keys())
    for i in range(len(representatives)):
        for j in range(i + 1, len(representatives)):
            pair = tuple(sorted([representatives[i], representatives[j]]))
            if pair not in compared_pairs:
                compared_pairs.add(pair)
                if are_entities_similar_lev(entities_dict[representatives[i]]['id'], entities_dict[representatives[j]]['id']):
                    print(f"Levenshtein merge representatives: {representatives[j]} -> {representatives[i]}")
                    merged_entities[representatives[i]].extend(merged_entities[representatives[j]])
                    del merged_entities[representatives[j]]
    
    # Now, use Levenshtein distance for remaining entities
    print("Using Levenshtein distance for remaining entities...")
    representatives = list(merged_entities.keys())  # Update list again
    for rep in representatives:
        for entity in list(remaining_entities):
            if are_entities_similar_lev(entities_dict[rep]['id'], entities_dict[entity]['id']):
                print(f"Levenshtein merge remaining: {entity} -> {rep}")
                merged_entities[rep].append(entity)
                remaining_entities.remove(entity)
    
    return merged_entities

def create_entity_mapping(resolved_entities):
    """
    Create a mapping for every entity in a merged group to its representative.
    This mapping is then used to update both nodes and edges.
    """
    entity_mapping = {}
    for representative, group in resolved_entities.items():
        for entity in group:
            entity_mapping[entity] = representative
    return entity_mapping
def update_original_data(original_entities, entity_mapping):
    """
    Replace all occurrences of merged entities with their representative entity.
    """
    updated_entities = []
    for entity in original_entities:
        entity_id = entity['id']
        if entity_id in entity_mapping:
            entity['id'] = entity_mapping[entity_id]
        updated_entities.append(entity)
    return updated_entities
def save_updated_data(updated_nodes, updated_edges, json_path):
    """
    Save the resolved nodes and edges back to a JSON file.
    """
    updated_data = {
        "nodes": updated_nodes,
        "edges": updated_edges
    }
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(updated_data, f, ensure_ascii=False, indent=2)
    print(f"Updated data saved to {json_path}")
def convert_xml_to_json(xml_path, output_path):
    """
    Converts an XML file to JSON and saves the output.
    """
    if not os.path.exists(xml_path):
        print(f"Error: File not found - {xml_path}")
        return None
    json_data = xml_to_json(xml_path)
    if json_data:
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(json_data, f, ensure_ascii=False, indent=2)
        print(f"JSON file created: {output_path}")
        return json_data
    else:
        print("Failed to create JSON data")
        return None
def process_in_batches(tx, query, data, batch_size):
    """
    Process data in batches and execute the given Neo4j query.
    """
    for i in range(0, len(data), batch_size):
        batch = data[i : i + batch_size]
        if "nodes" in query:
            tx.run(query, {"nodes": batch})
        else:
            tx.run(query, {"edges": batch})
xml_file = os.path.join(WORKING_DIR, "graph_chunk_entity_relation.graphml")
json_file = os.path.join(WORKING_DIR, "graph_data.json")
    
# Convert XML to JSON
json_data = convert_xml_to_json(xml_file, json_file)

# 3. Load nodes and edges from JSON
nodes = json_data.get("nodes", [])
edges = json_data.get("edges", [])
# 4. Compute embeddings and build the k-NN graph for de-duplication
embeddings = compute_embeddings(nodes)
G = build_knn_graph(nodes, embeddings)
merge_candidates = find_merge_candidates(G)
resolved_entities = resolve_entities(nodes, merge_candidates)
entity_mapping = create_entity_mapping(resolved_entities)



Root element: {http://graphml.graphdrawing.org/xmlns}graphml
Root attributes: {'{http://www.w3.org/2001/XMLSchema-instance}schemaLocation': 'http://graphml.graphdrawing.org/xmlns http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd'}
Found 335 nodes and 291 edges
JSON file created: ./input/graph_data.json
Using LLM for merging candidates...
LLM merge: ['DIFFUSE SYSTEMIC SCLEROSIS', 'SYSTEMIC SCLEROSIS', 'SYSTEMIC SCLEROSIS (SSC)']
LLM merge: ['SYSTEMIC LUPUS ERYTHEMATOSUS (SLE)', 'SYSTEMIC LUPUS ERYTHEMATOSUS']
LLM merge: ['T-CELL', 'T CELLS', 'T CELL']
LLM merge: ['AUTOIMMUNE DISEASE', 'AUTOIMMUNE DISEASES', 'AUTOIMMUNITY', 'AUTOIMMUNE-LIKE DISEASES', 'AUTOIMMUNE AND INFLAMMATORY CONDITIONS']
LLM merge: ['OX40L', 'OX40', 'SOLUBLE OX40L', 'OX40-OX40L', 'OXL40']
LLM merge: ['RS944648', 'RS855648', 'RS2004640', 'RS844665', 'RS10912580', 'RS2205960', 'RS1234214', 'RS844648', 'RS1234315', 'RS844644', 'RS1234314', 'RS12039904', 'RS2795288']
LLM merge: ['OX40 LIGAND', 'OX40/OX40 LIGAND']
LL

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


LLM merge: ['ANTI-TOPOISOMERASE I AUTOANTIBODIES', 'AUTOANTIBODIES TO TOPOISOMERASE I (ATA)', 'ANTI-TOPOISOMERASE I AUTOANTIBODY', 'ANTI-TOPOISOMERASE I ANTIBODIES', 'ANTI-TOPOISOMERASE I ANTIBODY']
LLM merge: ['ANTI-RNA POLYMERASE III AUTOANTIBODIES', 'ANTI-RNA POLYMERASE III ANTIBODIES', 'ANTI-RNA POLYMERASE III ANTIBODIES (ARA)']
LLM merge: ['SKIN INVOLVEMENT', 'CUTANEOUS INVOLVEMENT']
LLM merge: ['B CELLS', 'B CELL']
LLM merge: ['CD4+ T CELL', 'CD4 T CELL', 'CD4+ T CELLS']
LLM merge: ['CD8+ T CELLS', 'CD8+ T CELL']
LLM merge: ['PLASMACYTOID DENDRITIC CELLS', 'PLASMACYTOID DENDRITIC CELL']
LLM merge: ['SNPS', 'SNP']
LLM merge: ['TYPE I DIABETES', 'TYPE 1A DIABETES']
LLM merge: ['ASTHMA', 'ALLERGIC ASTHMA']
LLM merge: ['REGULATORY T-CELL', 'REGULATORY T-CELLS']
LLM merge: ['MEMORY T-CELLS', 'MEMORY T CELLS']
LLM merge: ['IL-4', 'IL-17']
LLM merge: ['SKIN INFLAMMATION', 'INFLAMMATORY SKIN DISEASE', 'DERMATITIS', 'ECZEMA', 'INFLAMMATORY SKIN DISEASES']
LLM merge: ['MAST CELLS', 'MAST C

In [4]:
# 5. Update nodes and edges using the entity mapping
updated_nodes = update_original_data(nodes, entity_mapping)
updated_edges = [
    {
        "source": entity_mapping.get(edge["source"], edge["source"]),
        "target": entity_mapping.get(edge["target"], edge["target"]),
        "weight": edge.get("weight", 1.0),
        "description": edge.get("description", ""),
        "keywords": edge.get("keywords", ""),
        "source_id": edge.get("source_id", "")
    }
    for edge in edges
]

# 6. Remove duplicate nodes based on their id
unique_nodes = list({node['id']: node for node in updated_nodes}.values())
    
# 7. Remove duplicate edges and aggregate properties
edge_dict = defaultdict(lambda: {"weight": 0, "description": "", "keywords": "", "source_id": ""})
for edge in updated_edges:
    key = (str(edge['source']), str(edge['target']))
    edge_dict[key]["weight"] += edge.get("weight", 1.0)
    edge_dict[key]["description"] = edge.get("description", "")
    edge_dict[key]["keywords"] = edge.get("keywords", "")
    edge_dict[key]["source_id"] = edge.get("source_id", "")
unique_edges = [
    {"source": source, "target": target, **properties}
    for (source, target), properties in edge_dict.items()
]
print(f"Final number of nodes after de-duplication: {len(unique_nodes)}")
print(f"Final number of edges after de-duplication: {len(unique_edges)}")
save_updated_data(unique_nodes, unique_edges, json_file)

Final number of nodes after de-duplication: 243
Final number of edges after de-duplication: 231
Updated data saved to ./input/graph_data.json


In [None]:
create_nodes_query = """
UNWIND $nodes AS node
MERGE (e:Entity {id: node.id})
SET e.entity_type = node.entity_type,
    e.description = node.description,
    e.source_id = node.source_id,
    e.displayName = node.id
REMOVE e:Entity
WITH e, node
CALL apoc.create.addLabels(e, [node.id]) YIELD node AS labeledNode
RETURN count(*)
"""
create_edges_query = """
UNWIND $edges AS edge
MATCH (source {id: edge.source})
MATCH (target {id: edge.target})
WITH source, target, edge,
        CASE
        WHEN edge.keywords CONTAINS 'lead' THEN 'lead'
        WHEN edge.keywords CONTAINS 'participate' THEN 'participate'
        WHEN edge.keywords CONTAINS 'uses' THEN 'uses'
        WHEN edge.keywords CONTAINS 'located' THEN 'located'
        WHEN edge.keywords CONTAINS 'occurs' THEN 'occurs'
        ELSE REPLACE(SPLIT(edge.keywords, ',')[0], '\"', '')
        END AS relType
CALL apoc.create.relationship(source, relType, {
    weight: edge.weight,
    description: edge.description,
    keywords: edge.keywords,
    source_id: edge.source_id
}, target) YIELD rel
RETURN count(*)
"""
set_displayname_and_labels_query = """
MATCH (n)
SET n.displayName = n.id
WITH n
CALL apoc.create.setLabels(n, [n.entity_type]) YIELD node
RETURN count(*)
"""
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))
try:
    with driver.session() as session:
            # Insert nodes in batches
        session.execute_write(process_in_batches, create_nodes_query, unique_nodes, BATCH_SIZE_NODES)
            # Insert edges in batches
        session.execute_write(process_in_batches, create_edges_query, unique_edges, BATCH_SIZE_EDGES)
            # Update displayName and labels
        session.run(set_displayname_and_labels_query)
except Exception as e:
        print(f"Error occurred: {e}")
finally:
    driver.close()