## A simple ontology alignment framework in python: using lexical similarity and LLMs

All Transformers Pipeline options

pipeline = [
    'audio-classification', 'automatic-speech-recognition', 'conversational', 'depth-estimation', 'document-question-answering', 'feature-extraction', 'fill-mask', 'image-classification', 'image-feature-extraction', 'image-segmentation', 'image-to-image', 'image-to-text', 'mask-generation', 'ner', 'object-detection', 'question-answering', 'sentiment-analysis', 'summarization', 'table-question-answering', 'text-classification', 'text-generation', 'text-to-audio', 'text-to-speech', 'text2text-generation', 'token-classification', 'translation', 'video-classification', 'visual-question-answering', 'vqa', 'zero-shot-audio-classification', 'zero-shot-classification', 'zero-shot-image-classification', 'zero-shot-object-detection', 'translation_XX_to_YY']

## Packages:

- pip3 install torch torchvision torchaudio
- pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
- pip3 install transformers
- pip3 install ipywidgets
- pip3 install rdflib


Embedding, labelling

Tasks: conference, anatomy

In [204]:
import ipywidgets
import os
import torch
import rdflib
from rdflib.namespace import RDFS
from collections import Counter
from transformers import BertModel, AutoTokenizer, pipeline

In [205]:
# Directories (under the current working directory) where the pretrained model
# and tokenizer files are saved. os.path.join picks the right separator for the
# host OS; a hard-coded "\\" only produces a sub-directory on Windows.
models_path = os.path.join(os.getcwd(), "models")
tokens_path = os.path.join(os.getcwd(), "tokens")

In [206]:

# Load the pretrained "bert-base-uncased" encoder and save a copy under models_path.
# NOTE(review): the variable name says "cased" but the checkpoint loaded is the
# uncased one; the name is left as-is because later cells reference it.
bert_base_cased_model = BertModel.from_pretrained("bert-base-uncased")
bert_base_cased_model.save_pretrained(models_path)

# Disabled experiment, kept for reference.
# NOTE(review): "BertForSequenceClassification" looks like a transformers class
# name rather than a checkpoint id, and the save path is machine-specific --
# confirm both before re-enabling.
# bert_sequence_classification = BertModel.from_pretrained("BertForSequenceClassification")
# bert_sequence_classification.save_pretrained("C:\\Faculdade\\Redes_De_Conhecimento\\RDProject\\models")


# Load the tokenizer matching the checkpoint above and save its files under tokens_path.
bert_base_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
bert_base_tokenizer.save_pretrained(tokens_path)

# Disabled experiment, kept for reference (same caveats as the model variant above).
# bert_sequence_classification_tokenizer = AutoTokenizer.from_pretrained("BertForSequenceClassification")
# bert_sequence_classification_tokenizer.save_pretrained("C:\\Faculdade\\Redes_De_Conhecimento\\RDProject\\tokens")

('c:\\Users\\tomas\\Documents\\faculdade\\mestrado\\2ºsemestre\\Redes_de_conhecimento\\RDProject\\tokens\\tokenizer_config.json',
 'c:\\Users\\tomas\\Documents\\faculdade\\mestrado\\2ºsemestre\\Redes_de_conhecimento\\RDProject\\tokens\\special_tokens_map.json',
 'c:\\Users\\tomas\\Documents\\faculdade\\mestrado\\2ºsemestre\\Redes_de_conhecimento\\RDProject\\tokens\\vocab.txt',
 'c:\\Users\\tomas\\Documents\\faculdade\\mestrado\\2ºsemestre\\Redes_de_conhecimento\\RDProject\\tokens\\added_tokens.json',
 'c:\\Users\\tomas\\Documents\\faculdade\\mestrado\\2ºsemestre\\Redes_de_conhecimento\\RDProject\\tokens\\tokenizer.json')

In [207]:
# Step 2: Load Ontologies
def load_ontology(file_path):
    """Parse the RDF document at ``file_path`` into a fresh rdflib graph.

    No explicit ``format`` is passed to ``parse``, so rdflib chooses the
    parser itself.

    Args:
        file_path: Location of the ontology file to read.

    Returns:
        The populated ``rdflib.Graph``.
    """
    graph = rdflib.Graph()
    graph.parse(file_path)
    return graph

# Load the two anatomy ontologies to be aligned. The paths are built with
# os.path.join so they resolve on any OS, not only where "\\" is the separator.
ontology1 = load_ontology(
    os.path.join(os.getcwd(), "anatomy-dataset", "anatomy-dataset", "human.owl")
)
ontology2 = load_ontology(
    os.path.join(os.getcwd(), "anatomy-dataset", "anatomy-dataset", "mouse.owl")
)

In [208]:
def _first_literal_label(graph, subject):
    """Return the first non-empty rdfs:label literal of ``subject`` in ``graph``, or None."""
    for candidate in graph.objects(subject=subject, predicate=RDFS.label):
        if isinstance(candidate, rdflib.Literal) and str(candidate):
            return str(candidate)
    return None


# Function to extract labels from ontology
def extract_labels(ontology):
    """Collect a label for every ``owl:Class`` declared in ``ontology``.

    For each class the first non-empty literal ``rdfs:label`` found in the
    ontology itself is used. When the class has none, every document
    referenced through ``rdfs:seeAlso`` is parsed in turn and searched for a
    label of the same class; the search stops at the first document that
    provides one.

    Args:
        ontology: An ``rdflib.Graph`` holding the ontology.

    Returns:
        dict mapping the class URI (as ``str``) to a list of label strings.
        Classes for which no label could be found are omitted.

    Raises:
        Whatever ``rdflib.Graph.parse`` raises when an ``rdfs:seeAlso``
        target cannot be fetched or parsed.
    """
    labels_dict = {}
    owl_class = rdflib.namespace.OWL.Class
    rdf_type = rdflib.namespace.RDF.type

    for s in ontology.subjects(predicate=rdf_type, object=owl_class):
        # Prefer a label stated locally in the ontology.
        label = _first_literal_label(ontology, s)

        if not label:
            # Fall back to the external documents linked via rdfs:seeAlso.
            for see_also in ontology.objects(subject=s, predicate=rdflib.namespace.RDFS.seeAlso):
                g_external = rdflib.Graph()
                g_external.parse(str(see_also))
                # Scan all label objects, not just the first one, so a
                # non-literal first object does not hide a usable label.
                label = _first_literal_label(g_external, s)
                if label:
                    break  # Stop following seeAlso links once a label is found

        if label:
            labels_dict.setdefault(str(s), []).append(label)
    return labels_dict

# Extract labels for ontology1 and ontology2.
# Each result maps a class URI string to the list of label strings found for it.
labels_dict1 = extract_labels(ontology1)
labels_dict2 = extract_labels(ontology2)

# Function to select primary label for each class
def select_primary_label(labels):
    """Pick the label that occurs most often in ``labels``.

    Ties go to the label that appears first. Returns ``None`` when there are
    no labels to choose from.
    """
    tally = Counter(labels)
    if not tally:
        return None
    (winner, _occurrences), = tally.most_common(1)
    return winner

# Reduce each class's candidate labels to a single primary label, per ontology
primary_labels_dict1 = {
    uri: select_primary_label(candidates)
    for uri, candidates in labels_dict1.items()
}
primary_labels_dict2 = {
    uri: select_primary_label(candidates)
    for uri, candidates in labels_dict2.items()
}

In [209]:
# Define a function to generate BERT embeddings for a list of labels
def generate_bert_embeddings(labels, model, tokenizer, batch_size=None):
    """Embed each label as the mean of its token vectors from ``model``.

    Padding positions are excluded from the mean by weighting with the
    tokenizer's attention mask. Without that, a label's embedding would
    change with the length of the longest label it happens to be batched
    with.

    Args:
        labels: dict whose values are the label strings to embed; the rows
            of the result follow the dict's iteration order.
        model: Callable accepting the tokenizer output as keyword arguments
            and returning an object with a ``last_hidden_state`` tensor.
        tokenizer: Callable turning a list of strings into a mapping that
            contains at least ``attention_mask`` (called with
            ``padding=True, truncation=True, return_tensors="pt"``).
        batch_size: Number of labels sent through the model at once.
            ``None`` (the default) processes all labels in a single batch.

    Returns:
        Tensor with one row per label.

    Raises:
        ValueError: If ``labels`` is empty or ``batch_size`` is not positive.
    """
    labels_list = list(labels.values())
    if not labels_list:
        raise ValueError("labels must contain at least one entry")
    if batch_size is None:
        batch_size = len(labels_list)
    elif batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    pooled_batches = []
    with torch.no_grad():
        for start in range(0, len(labels_list), batch_size):
            batch = labels_list[start:start + batch_size]

            # Tokenize the labels
            encoded = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
            hidden = model(**encoded).last_hidden_state

            # Mean pooling over real tokens only: zero out padded positions
            # and divide by the number of unmasked tokens per label.
            mask = encoded["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1)
            pooled_batches.append((hidden * mask).sum(dim=1) / token_counts)

    return torch.cat(pooled_batches, dim=0)

# Embed the primary labels of both ontologies with the BERT model and tokenizer
# loaded earlier (no batch size is passed in these calls).
embeddings_ontology1 = generate_bert_embeddings(primary_labels_dict1, bert_base_cased_model, bert_base_tokenizer)
embeddings_ontology2 = generate_bert_embeddings(primary_labels_dict2, bert_base_cased_model, bert_base_tokenizer)