In [1]:
from ontology_generation.Evaluation import Evaluation
from ontology_generation.OntologyGen import OntologyGen
from ontology_generation.OntologyEncap import OntologyEncap
import re
import pandas as pd
from sentence_transformers import SentenceTransformer
import os
from rdflib import Namespace, Graph

In [2]:
# Domain whose concepts are compared in this notebook.
topic_name = "natural language processing"

# Generator client; only referenced by the (currently disabled) LLM-based
# deduplication cell further down.
ontogen = OntologyGen(
    model_name="gpt-4-1106-preview",
    deployment_name="gpt_chat_test_preview",
)

#### CSO

In [3]:
# Evaluation: pull the reference concepts out of the CSO dump, starting
# from the CSO topic given by `parent_topic_uri`.
parent_topic_uri = "https://cso.kmi.open.ac.uk/topics/natural_language_processing"
file_path = "data/CSO.3.3.ttl"
deduplicated_topics = Evaluation.extract_concepts_and_deduplicate(
    file_path,
    parent_topic_uri,
)

In [4]:
# Convert the extracted CSO topics into cleaned concept names
# (presumably URI -> readable label; confirm in Evaluation.clean_concept_names)
# and report how many there are before any further deduplication.
cso_concepts = Evaluation.clean_concept_names(deduplicated_topics)
print("Number of concepts in CSO: ", len(cso_concepts))

Number of concepts in CSO:  168


In [5]:
from difflib import SequenceMatcher

# Function to normalize topics
def normalize_topic(topic):
    """Reduce a topic label to a canonical key used for duplicate detection.

    Steps, in order:
      1. lowercase the label and turn hyphens into underscores;
      2. drop parenthesised qualifiers such as ``"(nlp)"``;
      3. drop every character outside ``[a-z0-9_]`` (this removes spaces too);
      4. drop a single trailing ``s`` as a crude plural strip.

    The plural strip deliberately runs last: if it ran before steps 2-3, a
    trailing parenthetical, space or punctuation mark would hide the final
    ``s`` and e.g. ``"neural networks (nn)"`` would not collapse onto
    ``"neural network"``.

    Args:
        topic: Topic label as a string.

    Returns:
        The normalized key. It is only meant for comparing labels with each
        other, not for display (e.g. ``"analysis"`` becomes ``"analysi"``).
    """
    # Convert to lowercase and replace hyphens with underscores
    topic = topic.lower().replace("-", "_")
    # Remove parentheses and their contents
    topic = re.sub(r'\([^)]*\)', '', topic)
    # Remove any remaining special characters and whitespace
    topic = re.sub(r'[^a-z0-9_]', '', topic)
    # Remove trailing 's' for plurals (after cleanup, so nothing can mask it)
    topic = re.sub(r's$', '', topic)
    return topic

# Fuzzy-match predicate for two topic strings
def are_similar(a, b, threshold=0.95):
    """Return True when `a` and `b` are near-duplicates.

    Similarity is the ``difflib.SequenceMatcher`` ratio of the two strings
    (no junk heuristic); the pair counts as similar only when that ratio is
    strictly greater than `threshold`.
    """
    matcher = SequenceMatcher(None, a, b)
    similarity = matcher.ratio()
    return similarity > threshold

# Set for normalized topics to track seen ones
seen = set()
# List to store unique topics
unique_topics = []

# First pass: Remove exact duplicates using normalization.
# The first spelling encountered for each normalized key is the one kept.
for topic in cso_concepts:
    normalized = normalize_topic(topic)
    if normalized not in seen:
        seen.add(normalized)
        unique_topics.append(topic)

# Second pass: Check for similar topics using SequenceMatcher.
# A topic is kept only if it is not a near-duplicate of any topic kept so far.
final_topics = []
# Normalized key of every entry in final_topics (same order), cached so each
# kept topic is normalized once instead of once per comparison.
final_normalized = []
for topic in unique_topics:
    normalized = normalize_topic(topic)
    if not any(are_similar(normalized, kept) for kept in final_normalized):
        final_topics.append(topic)
        final_normalized.append(normalized)

# Print the number of unique topics and the topics themselves
print(len(final_topics))
final_topics

159


['word embeddings',
 'translation quality',
 'web information extraction',
 'synsets',
 'text processing',
 'n-gram language models',
 'information extraction systems',
 'text clustering',
 'computational grammars',
 'semantic distance',
 'textual entailment',
 'word segmentation',
 'treebanks',
 'text feature',
 'information extraction techniques',
 'unit selection',
 'grammatical inferences',
 'information retrieval technology',
 'translation process',
 'translation systems',
 'cross language information retrieval',
 'computing with words',
 'part-of-speech tagging',
 'dialogue manager',
 'text representation',
 'text data',
 'text categorization',
 'product reviews',
 'free texts',
 'sentiment analysis',
 'sequence labeling',
 'bilingual corpora',
 'dependency parsing',
 'text classification methods',
 'electronic document',
 'source language',
 'topic model',
 'word processing',
 'pos tagging',
 'natural language generation',
 'inverse document frequency',
 'sentiment classificatio

In [6]:
# role = "You are an ontology engineer"
# prompt = f'''You are a model tasked with deleting duplicate topics from a list for ontology creation for the {topic_name} domain. 
# The list: ''' + ', '.join(unique_topics) + '''
# Return the response only in the dictionary format. Do not add new topics. Ensure proper use of quotations:
# {
# 'topic1',
# 'topic2'
# }
# '''
# cso_list = ontogen.prompt_extract(role, prompt)

In [7]:
# print("Number of concepts in CSO: ", len(cso_list))
# cso_list

#### AutOnto with OA

In [8]:
# concept_uri = "http://fraunhofer.de/example/Natural_Language_Processing"
# graph = Graph()
# graph.parse("output/taxonomy_withOA.ttl", format="ttl")  # Load your RDF data

# descendants_withOA = Evaluation.get_descendants(concept_uri, graph)
# concepts_onto_withOA = Evaluation.clean_concept_names(descendants_withOA)

#### AutOnto w/o OA

In [9]:
# Root concept of the generated taxonomy (variant without OA) and the
# Turtle file that holds it.
concept_uri = "http://fraunhofer.de/example/Natural_language_processing"
file_path = "output/taxonomy.ttl"

# Load your RDF data
graph = Graph()
graph.parse(file_path, format="ttl")  # reuse file_path instead of repeating the literal
print(len(graph))  # number of triples loaded

# Collect the concepts below the root and display them as the cell output.
descendants_withoutOA = Evaluation.get_descendants(concept_uri, graph)
descendants_withoutOA

187
Total descendants found: 44


{'http://fraunhofer.de/example/Amazon_Reviews',
 'http://fraunhofer.de/example/Arabic_Language_Analysis',
 'http://fraunhofer.de/example/Clinical_Text_Analysis',
 'http://fraunhofer.de/example/Dialog_Systems',
 'http://fraunhofer.de/example/Ensemble-Hybrid_Model',
 'http://fraunhofer.de/example/Feature_Selection',
 'http://fraunhofer.de/example/Fuzzy_Integral_Classifier_Fusion',
 'http://fraunhofer.de/example/Fuzzy_Quantification_in_Hotel_Reviews',
 'http://fraunhofer.de/example/Handwritten_Word_Recognition',
 'http://fraunhofer.de/example/Image_Captioning',
 'http://fraunhofer.de/example/Language_Acquisition',
 'http://fraunhofer.de/example/Language_Analysis',
 'http://fraunhofer.de/example/Lexicon_Development',
 'http://fraunhofer.de/example/Linguistic_Analysis',
 'http://fraunhofer.de/example/Logic_Rules_in_Sentiment_Classification',
 'http://fraunhofer.de/example/Machine_Learning_Models_and_Approaches',
 'http://fraunhofer.de/example/Machine_Translation',
 'http://fraunhofer.de/exa

In [10]:
# Clean the descendant URIs of the taxonomy built WITHOUT OA into concept names.
concepts_onto__withoutOA = Evaluation.clean_concept_names(descendants_withoutOA)
# Label corrected: this count belongs to the without-OA taxonomy
# (output/taxonomy.ttl); it previously read "with OA".
print("Number of concepts in OntoNLP without OA: ", len(concepts_onto__withoutOA))

Number of concepts in OntoNLP with OA:  44


### Comparison

In [11]:
evalinst = Evaluation()

# Run both concept lists through the same preprocessing, CSO reference first
# and the AutOnto (without OA) list second.
cso_list_processed, concepts_onto_withoutOA_processed = [
    evalinst.preprocess_list(concept_names)
    for concept_names in (final_topics, concepts_onto__withoutOA)
]
# concepts_onto_withOA_processed = evalinst.preprocess_list(concepts_onto_withOA)

In [12]:
# Phrase-level embedding model shared by every comparison below
whaleloops_model = SentenceTransformer("whaleloops/phrase-bert")

# Embedding of the domain label that each concept list is measured against
reference_embedding = whaleloops_model.encode('natural-language-processing')

# Metrics for the CSO reference concepts
phrase_embeddings1 = whaleloops_model.encode(cso_list_processed)
metrics1 = Evaluation.calculate_metrics(phrase_embeddings1, reference_embedding)

# # Metrics for the AutOnto concepts with OA (disabled)
# phrase_embeddings2 = whaleloops_model.encode(concepts_onto_withOA_processed)
# metrics2 = Evaluation.calculate_metrics(phrase_embeddings2, reference_embedding)

# Metrics for the AutOnto concepts without OA
phrase_embeddings3 = whaleloops_model.encode(concepts_onto_withoutOA_processed)
metrics3 = Evaluation.calculate_metrics(phrase_embeddings3, reference_embedding)

In [13]:
# One record per concept list: its label, its size, then its metric values.
cso_row = {
    "List": "CSO",
    "Number of Terms": len(cso_list_processed),
    **metrics1,
}
# autonto_with_oa_row = {
#     "List": "AutOnto with OA concepts",
#     "Number of Terms": len(concepts_onto_withOA_processed),
#     **metrics2,
# }
autonto_without_oa_row = {
    "List": "AutOnto without OA concepts",
    "Number of Terms": len(concepts_onto_withoutOA_processed),
    **metrics3,
}
data = [cso_row, autonto_without_oa_row]

# Tabulate the records, one row per concept list
comparison = pd.DataFrame(data)

# Persist the table without the index column
comparison.to_csv("output/metrics_comparison.csv", index=False)