# 1: load and explore the TTL file

In [9]:
from rdflib import Graph

# Load the TTL file
g = Graph()
g.parse("data/Industry_Demos_-_Energy_Objects_NEN2660_2_2024-10-01_1354.ttl", format="ttl")

# Print a few triples to understand the structure
for s, p, o in list(g)[:10]:
    print(f"Subject: {s}, Predicate: {p}, Object: {o}")


Subject: nd73dc6f484db4f83b5620b8f6b802449b254, Predicate: http://www.w3.org/ns/shacl#maxCount, Object: 1
Subject: nd73dc6f484db4f83b5620b8f6b802449b63, Predicate: http://www.w3.org/2002/07/owl#onClass, Object: http://hub.laces.tech/semmtech/consultancy/demonstrations/industries/energy/otl/industry-demos---energy-objects-nen2660/6b6bea4b-2d9b-39f5-aeb9-8614a1a1d191
Subject: http://hub.laces.tech/semmtech/consultancy/demonstrations/industries/energy/otl/industry-demos---energy-objects-nen2660/77f5c4fd-0e13-3b55-af57-25b2cb256d0c, Predicate: http://www.w3.org/2000/01/rdf-schema#subClassOf, Object: http://hub.laces.tech/semmtech/consultancy/demonstrations/industries/energy/otl/industry-demos---energy-objects-nen2660/4cc6225d-0cbe-34b3-a646-c9149f4ddce7
Subject: http://hub.laces.tech/semmtech/consultancy/demonstrations/industries/energy/otl/industry-demos---energy-objects-nen2660/, Predicate: http://purl.org/dc/terms/date, Object: 2024-08-01182
Subject: nd73dc6f484db4f83b5620b8f6b802449b44

# 2: Generate synthetic NLQ-SPARQL pairs

In [10]:
import rdflib

# Function to generate synthetic NLQ-SPARQL pairs
def generate_synthetic_data(graph, num_samples=100):
    nlq_sparql_pairs = []

    for s, p, o in list(graph)[:num_samples]:
        subject_label = s.split("/")[-1] if isinstance(s, rdflib.URIRef) else "Entity"
        predicate_label = p.split("#")[-1]

        # Create questions based on observed predicates
        if "prefLabel" in predicate_label:
            nlq = f"What is the label of {subject_label}?"
            sparql = f"SELECT ?label WHERE {{ <{s}> <{p}> ?label }}"
        elif "type" in predicate_label:
            nlq = f"What type is {subject_label}?"
            sparql = f"SELECT ?type WHERE {{ <{s}> <{p}> ?type }}"
        elif "hasPart" in predicate_label:
            nlq = f"What parts does {subject_label} have?"
            sparql = f"SELECT ?part WHERE {{ <{s}> <{p}> ?part }}"
        else:
            continue

        nlq_sparql_pairs.append((nlq, sparql))

    return nlq_sparql_pairs

# Generate synthetic data
synthetic_data = generate_synthetic_data(g)
print(synthetic_data[:5])


[('What type is def#RealObject?', 'SELECT ?type WHERE { <https://w3id.org/nen2660/def#RealObject> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?type }'), ('What type is 3ad24e46-1418-360b-8ea1-4662a300bd1d-Shape?', 'SELECT ?type WHERE { <http://hub.laces.tech/semmtech/consultancy/demonstrations/industries/energy/otl/industry-demos---energy-objects-nen2660/3ad24e46-1418-360b-8ea1-4662a300bd1d-Shape> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?type }'), ('What type is Entity?', 'SELECT ?type WHERE { <nd73dc6f484db4f83b5620b8f6b802449b51> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?type }'), ('What type is Entity?', 'SELECT ?type WHERE { <nd73dc6f484db4f83b5620b8f6b802449b96> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?type }'), ('What type is 2bb708d3-e303-36c3-8d74-9d4b3b12cda5?', 'SELECT ?type WHERE { <http://hub.laces.tech/semmtech/consultancy/demonstrations/industries/energy/otl/industry-demos---energy-objects-nen2660/2bb708d3-e303-36c3-8d74-9d4b3b12cda5> <h

In [11]:
synthetic_data

[('What type is def#RealObject?',
  'SELECT ?type WHERE { <https://w3id.org/nen2660/def#RealObject> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?type }'),
 ('What type is 3ad24e46-1418-360b-8ea1-4662a300bd1d-Shape?',
  'SELECT ?type WHERE { <http://hub.laces.tech/semmtech/consultancy/demonstrations/industries/energy/otl/industry-demos---energy-objects-nen2660/3ad24e46-1418-360b-8ea1-4662a300bd1d-Shape> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?type }'),
 ('What type is Entity?',
  'SELECT ?type WHERE { <nd73dc6f484db4f83b5620b8f6b802449b51> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?type }'),
 ('What type is Entity?',
  'SELECT ?type WHERE { <nd73dc6f484db4f83b5620b8f6b802449b96> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?type }'),
 ('What type is 2bb708d3-e303-36c3-8d74-9d4b3b12cda5?',
  'SELECT ?type WHERE { <http://hub.laces.tech/semmtech/consultancy/demonstrations/industries/energy/otl/industry-demos---energy-objects-nen2660/2bb708d3-e303-36c3-8d74-9d

# 3: mask entities in NLQs

In [12]:
import re

# Function to mask entities in NLQs
def mask_entities(nlq, entities):
    masked_nlq = nlq
    for i, entity in enumerate(entities):
        masked_nlq = re.sub(entity, f"[ENT{i+1}]", masked_nlq)
    return masked_nlq

# Example usage
entities = ["Agent", "Convertor"]
nlq = "What is the label of Agent?"
masked_nlq = mask_entities(nlq, entities)
print(masked_nlq)  # Output: "What is the label of [ENT1]?"


What is the label of [ENT1]?


# 4: fine-tune a GPT-2 model

In [14]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments

# Load pre-trained GPT-2
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Tokenize the synthetic data
train_data = [{"input_text": mask_entities(nlq, ["Agent", "Convertor"]), "output_text": sparql}
              for nlq, sparql in synthetic_data]

# Tokenization function
def tokenize_function(example):
    return tokenizer(example["input_text"], text_target=example["output_text"], truncation=True)

# Fine-tuning setup
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    num_train_epochs=3,
    save_steps=10_000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`

In [15]:
import transformers
import torch
import accelerate

print(transformers.__version__)
print(torch.__version__)
print(accelerate.__version__)



4.47.0
2.5.1
1.2.1


# 5: post-process generated SPARQL queries

In [None]:
def replace_masked_entities(query, entity_map):
    for mask, entity in entity_map.items():
        query = query.replace(mask, entity)
    return query

# Example usage
generated_query = "SELECT ?label WHERE { [ENT1] <http://www.w3.org/2004/02/skos/core#prefLabel> ?label }"
entity_map = {"[ENT1]": "<http://hub.laces.tech/semmtech/consultancy/demonstrations/industries/energy/otl/industry-demos---energy-objects-nen2660/Agent>"}
final_query = replace_masked_entities(generated_query, entity_map)
print(final_query)
