This is a rule-based model intended to tackle some of the issues that arise when working with LLMs, such as hallucinations and unreliable reasoning. The main idea is to keep a dictionary that classifies each word of the query, and then to construct the final query more or less mechanically from those classifications. How exactly to do this is still an open question.

In [2]:
import os

# WIP!!!

In [None]:
from openai import OpenAI
from difflib import get_close_matches
import json
import re
import os

# Placeholder vocabulary for the linked database: the entity type names and
# relationship (predicate) names that extracted terms are matched against.
ENTITY_TYPES = [
    "Contractor", "ConstructionProject", "Location",
    "Material", "Supplier",
    "LightBulbModel", "ConcreteType", "SteelType", "WoodType",
]
RELATIONSHIP_TYPES = [
    "constructed", "locatedAt", "usesMaterial",
    "suppliedBy", "managedBy", "sourcedFrom", "employs",
    "containsLightBulbModel", "containsConcreteType",
    "containsSteelType", "containsWoodType",
]

# Read the OpenAI credentials from the environment. os.getenv returns None for
# a variable that is not set, and that value is passed to the client as-is.
api_key = os.getenv('OPENAI_API_KEY')
organisation = os.getenv('ORGANISATION')
# Module-level client used by extract_entities_and_relationships for its
# chat-completion request. It is created as soon as this module/cell runs.
client = OpenAI(organization=organisation, api_key=api_key)

def find_closest_matches(query_terms, valid_terms):
    """
    Find the closest match for each query term from a list of valid terms.

    Args:
        query_terms: Iterable of candidate terms. Entries that are not
            strings (for example ``None`` or nested objects coming out of
            parsed JSON) are skipped. ``None`` is treated as "no terms".
        valid_terms: Sequence of strings to match against.

    Returns:
        A dict mapping each string query term that has a sufficiently similar
        valid term (difflib cutoff 0.5) to its single best match. Terms
        without a match are omitted.
    """
    matches = {}
    for term in query_terms or ():
        # difflib.get_close_matches only works on sequences; a None or
        # numeric term would raise TypeError, so ignore anything non-string.
        if not isinstance(term, str):
            continue
        close_match = get_close_matches(term, valid_terms, n=1, cutoff=0.5)
        if close_match:
            matches[term] = close_match[0]
    return matches

def extract_entities_and_relationships(nl_query):
    """
    Use GPT to extract entities, filters and relationships from a
    natural language query.

    Args:
        nl_query: The natural language question, interpolated into the prompt.

    Returns:
        The JSON object found in the model's reply, parsed into a dict. When
        the reply has no text, contains no ``{...}`` span, or that span is not
        valid JSON, the empty structure
        ``{"entities": [], "filters": {}, "relationships": []}`` is returned.
    """
    # NOTE(review): the prompt asks for a SPARQL query *and* for a JSON object,
    # and its quoting is unbalanced. It is kept verbatim here so the model
    # input is unchanged; consider tightening it separately.
    prompt = f"""
    Extract the entities, filters, and relationships from the following construction-related query and return a sparql query that can be used to query the linked database:
    "{nl_query}"
    Provide a JSON object with three keys: `entities`, `filters`, and `relationships`. 
    - `entities` is a list of entity types mentioned.
    - `filters` is an object with keys `attribute`, `value`, and `entity`.
    - `relationships` is a list of objects with keys `from`, `to`, and `relationship`.
    - `Here is a list of entities and relationships in the database, so use those to find the closest related ones`: entitites: "{ENTITY_TYPES}", relationships: "{RELATIONSHIP_TYPES}
    - ``
    
    "
    """

    response = client.chat.completions.create(
        model="o1-mini",
        messages=[
            {"role": "user", "content": prompt}
        ]
    )
    choice_str = response.choices[0].message.content

    # Built fresh on every call so callers can mutate the result safely.
    empty_result = {"entities": [], "filters": {}, "relationships": []}

    # Guard against a reply without text; re.search would raise on None.
    if not choice_str:
        return empty_result

    # Greedy match from the first "{" to the last "}" so that a JSON object
    # wrapped in prose or a code fence is still picked up.
    json_match = re.search(r'\{.*\}', choice_str, re.DOTALL)
    if json_match is None:
        return empty_result

    try:
        # Parse the JSON string into a Python dictionary
        return json.loads(json_match.group(0))
    except json.JSONDecodeError:
        # The matched span was not valid JSON (e.g. braces from a SPARQL
        # snippet in the reply); fall back like the no-match case.
        return empty_result

def _escape_sparql_literal(value):
    """Return *value* as text safe to place inside a double-quoted SPARQL literal."""
    text = str(value)
    # Backslash is handled first so the escapes added afterwards are not doubled.
    for raw, escaped in (("\\", "\\\\"), ('"', '\\"'), ("\n", "\\n"), ("\r", "\\r")):
        text = text.replace(raw, escaped)
    return text


def _is_usable_term(value):
    """Return True when *value* is a non-empty string that can be matched."""
    return isinstance(value, str) and bool(value)


def _closest_or_self(term, valid_terms):
    """Return the closest valid term for *term*, or *term* itself if none matches."""
    return find_closest_matches([term], valid_terms).get(term, term)


def construct_sparql_query(nl_query):
    """
    Construct a SPARQL query from a natural language query.

    The query is sent to ``extract_entities_and_relationships``; the returned
    entities, relationships and filters are mapped onto the known
    ``ENTITY_TYPES`` / ``RELATIONSHIP_TYPES`` (falling back to the raw term
    when nothing is close enough) and emitted as triple patterns.

    Args:
        nl_query: The natural language question.

    Returns:
        The SPARQL query text. Entities, relationships and filters that are
        incomplete (missing or non-string fields, ``None`` filter value) are
        skipped rather than raising or emitting ``None`` placeholders.
    """
    # Extract entities and relationships using GPT
    extracted_data = extract_entities_and_relationships(nl_query)
    print(extracted_data)

    # "or" also covers explicit JSON nulls, which .get() would pass through.
    entities = extracted_data.get('entities') or []
    filters = extracted_data.get('filters') or []
    relationships = extracted_data.get('relationships') or []
    # The prompt asks for a single filter object, but accept a list of them too.
    if isinstance(filters, dict):
        filters = [filters]

    lines = ["SELECT ?subject ?predicate ?object WHERE {"]

    for entity in entities:
        if not _is_usable_term(entity):
            continue
        entity_type = _closest_or_self(entity, ENTITY_TYPES)
        lines.append(f"  ?subject rdf:type :{entity_type} .")

    for relationship in relationships:
        if not isinstance(relationship, dict):
            continue
        source = relationship.get('from')
        target = relationship.get('to')
        predicate = relationship.get('relationship')
        # A triple pattern needs all three parts; skip partial objects.
        if not (_is_usable_term(source) and _is_usable_term(target) and _is_usable_term(predicate)):
            continue
        from_entity = _closest_or_self(source, ENTITY_TYPES)
        to_entity = _closest_or_self(target, ENTITY_TYPES)
        relationship_type = _closest_or_self(predicate, RELATIONSHIP_TYPES)
        lines.append(f"  ?{from_entity} :{relationship_type} ?{to_entity} .")

    for filter_spec in filters:
        if not isinstance(filter_spec, dict):
            continue
        attribute = filter_spec.get('attribute')
        value = filter_spec.get('value')
        entity = filter_spec.get('entity')
        # "value is None" rather than truthiness so 0 / False stay usable.
        if value is None or not (_is_usable_term(attribute) and _is_usable_term(entity)):
            continue
        entity_type = _closest_or_self(entity, ENTITY_TYPES)
        # The value originates from model/user text, so escape it before
        # embedding it in the quoted literal.
        lines.append(f'  ?{entity_type} :{attribute} "{_escape_sparql_literal(value)}" .')

    lines.append("}")
    return "\n".join(lines)

# Demo entry point: runs when this code is executed directly, not on import.
if __name__ == "__main__":
    # Sample natural-language question used to exercise the whole pipeline.
    nl_query = "buildings constructed in new york after 2009"
    # NOTE: construct_sparql_query calls extract_entities_and_relationships,
    # which sends a request to the OpenAI chat API.
    sparql_query = construct_sparql_query(nl_query)
    print("Generated SPARQL Query:\n", sparql_query)
