# Setup and imports

In [1]:
import re
from pprint import pprint
import json
from langchain_ollama import OllamaLLM
# Ollama-served model handle; the generation and validation cells below call llama32.invoke(...).
llama32 = OllamaLLM(model="llama3.2:3b")



In [2]:
from langchain_openai import AzureChatOpenAI
import os
from dotenv import load_dotenv
# Load environment variables via python-dotenv so credentials are not hard-coded in the notebook.
load_dotenv()
# Azure OpenAI connection settings; os.getenv returns None for any variable that is not set.
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")

# Deployment names of the two Azure-hosted models.
# NOTE(review): AZURE_DEPLOYMENT_GPT41 is read but not used in the cells visible here.
AZURE_DEPLOYMENT_GPT41 = os.getenv("AZURE_DEPLOYMENT_GPT41")
AZURE_DEPLOYMENT_GPT41_NANO = os.getenv("AZURE_DEPLOYMENT_GPT41_NANO")

# Chat client bound to the "nano" deployment.
# NOTE(review): the visible cells below only call llama32; gpt41_nano is not invoked there.
gpt41_nano = AzureChatOpenAI(
    deployment_name=AZURE_DEPLOYMENT_GPT41_NANO,
    api_version=AZURE_OPENAI_API_VERSION,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    openai_api_key=AZURE_OPENAI_API_KEY,
)

# Data Structures

## Database Schema

In [3]:
def _iter_unquoted(text, start=0):
    """Yield ``(index, char)`` for each character of ``text[start:]`` outside quotes.

    Single-quoted and double-quoted strings and backtick-quoted identifiers are
    skipped, so parentheses or commas inside them are never reported.
    """
    quote = None
    i = start
    end = len(text)
    while i < end:
        ch = text[i]
        if quote is not None:
            if ch == '\\' and quote != '`':
                i += 1  # backslash escapes the next character inside string literals
            elif ch == quote:
                quote = None
        elif ch in '\'"`':
            quote = ch
        else:
            yield i, ch
        i += 1


def _extract_table_bodies(ddl):
    """Yield ``(table_name, body)`` for each ``CREATE TABLE name ( body )`` in ``ddl``.

    The body is found by matching the opening parenthesis with its balanced
    closing one, so the result does not depend on what follows the table
    (``ENGINE=...``, other options, or nothing at all).
    """
    header = re.compile(
        r'CREATE\s+TABLE\s+(?:IF\s+NOT\s+EXISTS\s+)?`?(\w+)`?\s*\(',
        re.IGNORECASE,
    )
    pos = 0
    while True:
        match = header.search(ddl, pos)
        if match is None:
            return
        body_start = match.end()
        depth = 1
        body_end = None
        for i, ch in _iter_unquoted(ddl, body_start):
            if ch == '(':
                depth += 1
            elif ch == ')':
                depth -= 1
                if depth == 0:
                    body_end = i
                    break
        if body_end is None:
            return  # unbalanced parentheses: nothing reliable left to parse
        yield match.group(1), ddl[body_start:body_end]
        pos = body_end + 1


def _split_top_level_commas(body):
    """Split a table body into definitions on commas outside parentheses and quotes.

    Each returned definition has its whitespace collapsed to single spaces;
    empty fragments are dropped.
    """
    parts = []
    depth = 0
    last = 0
    for i, ch in _iter_unquoted(body):
        if ch == '(':
            depth += 1
        elif ch == ')':
            depth -= 1
        elif ch == ',' and depth == 0:
            parts.append(body[last:i])
            last = i + 1
    parts.append(body[last:])
    return [' '.join(part.split()) for part in parts if part.strip()]


def parse_mysql_ddl_file(filepath):
    """Parse a MySQL DDL file into a per-table summary of columns and keys.

    Args:
        filepath: Path to a UTF-8 encoded SQL file containing CREATE TABLE
            statements (for example a ``mysqldump`` schema dump).

    Returns:
        dict mapping each table name to a dict with:
            'columns': list of column names in declaration order,
            'primary_keys': list of column names from PRIMARY KEY constraints,
            'foreign_keys': list of dicts with 'columns', 'ref_table' and
                'ref_columns'.

    Raises:
        OSError: if the file cannot be opened.

    Notes:
        - Tables are recognised whether or not an ENGINE clause follows them.
        - Foreign keys are picked up with or without a leading CONSTRAINT.
        - Index, UNIQUE and CHECK definitions are ignored.
        - Column-level ``PRIMARY KEY`` / ``REFERENCES`` attributes are not parsed.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        ddl = f.read()

    # Strip block comments (this also removes /*! ... */ version directives)
    # and "--" line comments.
    ddl = re.sub(r'/\*.*?\*/', '', ddl, flags=re.DOTALL)
    ddl = re.sub(r'--.*?$', '', ddl, flags=re.MULTILINE)

    # Keywords that open a table-level constraint or index definition. The
    # trailing \b keeps unquoted columns such as `unique_code` or `keyword`
    # from being mistaken for constraints.
    constraint_start = re.compile(
        r'^(?:CONSTRAINT|PRIMARY\s+KEY|FOREIGN\s+KEY|UNIQUE|KEY|INDEX|FULLTEXT|SPATIAL|CHECK)\b',
        re.IGNORECASE,
    )
    column_name = re.compile(r'^(?:`([^`]+)`|(\w+))\s')
    # [^(]* tolerates an optional index name/type between the keyword and the column list.
    primary_key = re.compile(r'PRIMARY\s+KEY\b[^(]*\((.*?)\)', re.IGNORECASE)
    foreign_key = re.compile(
        r'FOREIGN\s+KEY\b[^(]*\((.*?)\)\s*REFERENCES\s*`?(\w+)`?\s*\((.*?)\)',
        re.IGNORECASE,
    )

    result = {}

    for table_name, table_body in _extract_table_bodies(ddl):
        columns = []
        primary_keys = []
        foreign_keys = []

        for definition in _split_top_level_commas(table_body):
            if constraint_start.match(definition):
                fk_match = foreign_key.search(definition)
                if fk_match:
                    foreign_keys.append({
                        'columns': [col.strip(' `') for col in fk_match.group(1).split(',')],
                        'ref_table': fk_match.group(2),
                        'ref_columns': [col.strip(' `') for col in fk_match.group(3).split(',')]
                    })
                    continue
                pk_match = primary_key.search(definition)
                if pk_match:
                    primary_keys.extend(col.strip(' `') for col in pk_match.group(1).split(','))
                # Any other constraint/index definition carries no column or key info we keep.
                continue

            col_match = column_name.match(definition)
            if col_match:
                columns.append(col_match.group(1) or col_match.group(2))

        result[table_name] = {
            'columns': columns,
            'primary_keys': primary_keys,
            'foreign_keys': foreign_keys
        }

    return result

In [4]:
# Parse the project's DDL file (path is relative to the notebook's working directory)
# and pretty-print the resulting {table: {columns, primary_keys, foreign_keys}} mapping.
schema = parse_mysql_ddl_file('../../database/schema/usable_schema.sql')
pprint(schema)

{'granter_activity': {'columns': ['id',
                                  'created_at',
                                  'updated_at',
                                  'type',
                                  'title',
                                  'description',
                                  'application_id',
                                  'company_id',
                                  'file_id',
                                  'profile_id',
                                  'created_by_expert',
                                  'activity_date',
                                  'data',
                                  'data_id',
                                  'opportunity_id',
                                  'data_type'],
                      'foreign_keys': [{'columns': ['application_id'],
                                        'ref_columns': ['id'],
                                        'ref_table': 'granter_application'},
                                 

## Descriptions of Database tables

In [None]:
# Read the database README (free-text table descriptions) into a single string.
# NOTE(review): readme_database_content is only printed here; no later visible cell uses it.
with open('../../database/README.md', 'r', encoding='utf-8') as f:
    readme_database_content = f.read()

print(readme_database_content)

# Granter Database Schema
This document explains the relationships between the main tables used in **Granter**, the AI-powered grant application platform.
---
## Tables and Relationships
### **granter_company**
- Represents a company that applies to funding opportunities.
- **Relations:**
  - Can create **granter_applications** for opportunities.
  - Has associated **granter_companyfiles** for company-related context.
  - Has associated **granter_companymemories** (vectorized text snippets for RAG).
---
### **granter_application**
- An application generated for a specific **company** and a specific **opportunity**.
- **Relations:**
  - Belongs to a **granter_company**.
  - Belongs to a **granter_opportunity**.
  - Has associated **granter_applicationfiles** (files relevant to this application).
---
### **granter_applicationfile**
- Files tied to a specific **application**.
- Usually provide company or project-specific context for that application.
- **Relations:**
  - Belongs to a **gr

## External Ontologies

In [4]:
# Read the DINGO ontology file as raw text.
# NOTE(review): external_ontology is only printed here; no later visible cell uses it.
with open("external_ontologies/DINGO-Manchester.omn", "r", encoding="utf-8") as f:
    external_ontology = f.read()
print(external_ontology)

Prefix: owl: <http://www.w3.org/2002/07/owl#>
Prefix: rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
Prefix: rdfs: <http://www.w3.org/2000/01/rdf-schema#>
Prefix: xml: <http://www.w3.org/XML/1998/namespace>
Prefix: xsd: <http://www.w3.org/2001/XMLSchema#>



Ontology: <https://w3id.org/dingo#>


Annotations: 
    <http://purl.org/dc/elements/1.1/contributor> _:0,
    <http://purl.org/dc/elements/1.1/contributor> _:1,
    <http://purl.org/dc/elements/1.1/contributor> _:2,
    <http://purl.org/dc/elements/1.1/contributor> _:4,
    <http://purl.org/dc/elements/1.1/contributor> _:5,
    <http://purl.org/dc/elements/1.1/issued> "2018-10-26T00:00:00Z",
    <http://purl.org/dc/elements/1.1/title> "DINGO Ontology",
    <http://purl.org/dc/terms/creator> _:3,
    <http://purl.org/dc/terms/modified> "2020-01-10T10:20:10Z",
    rdfs:comment "The DINGO ontology (Data Integration for Grant Ontology) defines the terms of the DINGO vocabulary and provides a machine readable extensible framework t

## Lexical Views

### Lex(S)

In [None]:
def extract_schema_lexical_view(schema_dict) -> list:
    """Flatten a parsed schema into an ordered list of unique identifier strings.

    Args:
        schema_dict: Mapping of table name to a dict with 'columns',
            'primary_keys' and 'foreign_keys' entries; each foreign key is a
            dict with 'ref_table' and 'ref_columns'.

    Returns:
        list: For each table in dictionary order: the table name, its columns,
        its primary-key columns, then for each foreign key the referenced table
        followed by the referenced columns. Only the first occurrence of each
        term is kept.
    """
    def _terms():
        # Emit every term in traversal order; duplicates are removed afterwards.
        for table_name, table_data in schema_dict.items():
            yield table_name
            yield from table_data['columns']
            yield from table_data['primary_keys']
            for fk in table_data['foreign_keys']:
                yield fk['ref_table']
                yield from fk['ref_columns']

    # dict keys keep insertion order, which gives an order-preserving de-duplication.
    return list(dict.fromkeys(_terms()))

In [30]:
# Lex(S): the de-duplicated identifier list derived from the parsed schema.
extract_schema_lexical_view(schema)

['granter_activity',
 'id',
 'created_at',
 'updated_at',
 'type',
 'title',
 'description',
 'application_id',
 'company_id',
 'file_id',
 'profile_id',
 'created_by_expert',
 'activity_date',
 'data',
 'data_id',
 'opportunity_id',
 'data_type',
 'granter_application',
 'granter_company',
 'granter_companyfile',
 'granter_opportunity',
 'granter_profile',
 'state',
 'codename',
 'other_data',
 'profile_creator_id',
 'pricing_option',
 'ai_review_state',
 'consortium_id',
 'sale_confirmed',
 'approved_grant_amount',
 'success_fee',
 'success_payment_amount',
 'upfront_payment_amount',
 'initial_summary',
 'writer_mode',
 'granter_consortium',
 'granter_applicationfile',
 'name',
 'uploaded_file',
 'vector_indexed',
 'document_type_id',
 'granter_doctype',
 'legal_name',
 'address',
 'post_code',
 'city',
 'country',
 'url',
 'legal_form',
 'tax_id',
 'social_security_id',
 'is_sme',
 'employees_n',
 'annual_revenue',
 'cover_image',
 'website_data',
 'owner_id',
 'stripe_customer_id',

### Lex(O)

In [None]:
def extract_ontology_lexical_view(ontology_path: str) -> list:
    """Collect rdfs:label and rdfs:comment literals from an ontology file.

    Args:
        ontology_path: Path to a UTF-8 text file in which annotations appear
            as ``rdfs:label "..."`` / ``rdfs:comment "..."``.

    Returns:
        list: The literal texts in the order they occur in the file. Labels
        have all spaces removed; comments are returned unchanged. Escaped
        quotes and backslashes inside a literal (``\\"`` and ``\\\\``) are
        decoded, so a literal is no longer cut off at its first escaped quote.
        Empty literals are skipped.

    Raises:
        OSError: if the file cannot be opened.
    """
    with open(ontology_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # One pass over both annotation kinds yields matches already in file order.
    # The literal body accepts either an ordinary character or a backslash
    # escape pair, so an escaped quote does not terminate the string.
    annotation_pattern = re.compile(
        r'rdfs:(label|comment)\s+"((?:[^"\\]|\\.)+)"',
        re.DOTALL,
    )

    lexical_view = []
    for match in annotation_pattern.finditer(content):
        kind, raw_text = match.groups()
        # Decode only the two escapes Manchester Syntax defines: \" and \\.
        text = re.sub(r'\\(["\\])', r'\1', raw_text)
        if kind == 'label':
            lexical_view.append(text.replace(" ", ""))
        else:
            lexical_view.append(text)

    return lexical_view

In [None]:
# Lex(O): label and comment literals extracted from the DINGO ontology file.
display(extract_ontology_lexical_view('external_ontologies/DINGO-Manchester.omn'))

['The DINGO ontology (Data Integration for Grant Ontology) defines the terms of the DINGO vocabulary and provides a machine readable extensible framework to model data relative to projects, funding, project and funding actors, and, notably, funding policies. It is designed to yield high modeling power and elasticity to cope with the huge variety in funding and project practices, which makes it applicable to many areas where funding is an important aspect: first of all research, but also the arts, cultural conservation, and many others.',
 'Qualification awarded to the person taking the role by a higher education organisation, normally at a college or university.',
 'academic_degree',
 'An organization that this person is affiliated with. For example, a school/university, a club, or a team.',
 'affiliation',
 'Identifier (for example of Project, Grant, GrantPayment, PersonRole instance, OrganizationRole instance, FundingScheme, Person, Organisation) in the database/archive of the fundin

# RAG of Relevant Knowledge

## Chunking

## Embedding and Indexing

## Retrieval Functions

In [None]:
def docTable(relation: str):
    """Placeholder retrieval function for a relation (table).

    Not implemented yet: the argument is ignored and None is returned.
    TODO: implement; presumably this should return documentation for
    ``relation`` (confirm the intended contract).
    """
    return None

def docAttr(relation: str, attribute: str):
    """Placeholder retrieval function for an attribute of a relation.

    Not implemented yet: both arguments are ignored and None is returned.
    TODO: implement; presumably this should return documentation for
    ``attribute`` of ``relation`` (confirm the intended contract).
    """
    return None



# LLM Prompt Construction and Ontology Generation

In [None]:
# Table to translate into ontology elements. The name is interpolated into the
# prompt so the prompt text always matches the schema fragment being sent.
table_name = 'granter_activity'
table = schema[table_name]
# No core ontology exists yet, so the ontology context in the prompt is empty.
core_ontology = ""

# Generation prompt. The format example is written in Manchester Syntax
# (frame keywords "Annotations:", "Domain:", "Range:") because the prompt
# requires Manchester Syntax output.
prompt = f"""
    Generate ontology elements with provenance annotations for database table {table_name} based on:

    [CONTEXT]
    - Database Schema of the table {json.dumps(table)}
    - Take semantics from the Existing Ontology Knowledge {core_ontology}

    [INSTRUCTIONS]
    1. Include these elements:
        Classes (subclass of Thing)
        Data properties with domain/range
        Object properties with domain/range
    2. Use only one rdfs:domain and one rdfs:range per property. If multiple options exist, select the most general or create a shared superclass.
    3. Do not create a property named "is". Use rdf:type for instance membership, rdfs:subClassOf for class hierarchies, and owl:sameAs for instance equality.
    4. Use this format example:

    Class: {table_name}
        Annotations:
            prov:wasDerivedFrom <http://example.org/provenance/{table_name}>
        SubClassOf: owl:Thing

    DataProperty: has_column_name
        Annotations:
            prov:wasDerivedFrom <http://example.org/provenance/{table_name}/column_name>
        Domain: {table_name}
        Range: xsd:string

    ObjectProperty: relates_to_table
        Annotations:
            prov:wasDerivedFrom <http://example.org/provenance/{table_name}/fk_column>
        Domain: {table_name}
        Range: RelatedTable

    Only output Manchester Syntax and nothing else.

    [OUTPUT]
"""

In [None]:
# Send the generation prompt to the llama32 model and show the raw reply.
delta_ontology = llama32.invoke(prompt)
print(delta_ontology)

# Validation and Refinement 

In [None]:
# Validation prompt: asks the model to check delta_ontology against the table
# schema and, on failure, to return a corrected Manchester Syntax fragment.
# Interpolates delta_ontology (raw model output) and table (formatted with
# Python's default str(), not JSON).
# NOTE(review): the core-ontology section is hard-coded as empty text rather
# than interpolated from a variable.
evaluator_prompt = f"""
You are an expert in OWL 2 DL ontology modeling and validation.

Your task is to review the following delta ontology fragment generated from a relational database table, along with its schema and relevant context.

[DELTA-ONTOLOGY]
{delta_ontology}

[DATABASE SCHEMA]
{table}

[CORE ONTOLOGY CONTEXT]
(empty as of now)

[VALIDATION CRITERIA]
1. **Coherence with Core Ontology**  
   - Do NOT redefine an existing class, property, or concept already present in the core ontology with the same meaning.
   - Reuse existing ontology elements where possible instead of creating duplicates.

2. **Alignment with Input Table Schema**  
   - Every significant column and foreign key in the table must be represented as an appropriate ontology element (class, data property, or object property).
   - Naming should reflect the database semantics clearly and consistently.

3. **Syntactic Validity**  
   - The ontology must conform to the OWL 2 DL profile and valid Manchester Syntax.
   - Only one `rdfs:domain` and one `rdfs:range` per property.

4. **Logical Consistency**  
   - No contradictory class axioms or property constraints.
   - No circular subclass relationships.
   - Correct choice between object properties and data properties.

5. **Clarity and Naming Quality**  
   - Use self-explanatory, domain-relevant names.
   - Avoid generic or meaningless labels (e.g., "Entity1", "PropertyA").
   - All properties should follow consistent naming patterns (e.g., `has_`, `is_...Of`).

[YOUR TASK]
- Check the delta ontology fragment against all criteria above.
- If issues are found, provide a corrected version of the ontology in valid Manchester Syntax.
- Make minimal necessary changes to preserve the author's intent while ensuring correctness and OWL 2 DL compliance.
- Ensure all elements keep their provenance annotations.

[OUTPUT FORMAT]
Respond ONLY with:
1. "Status: PASS" if the ontology fragment meets all criteria, or "Status: FAIL" if it does not.
2. If FAIL, provide:
   a. A short bullet list of the issues found.
   b. A corrected Manchester Syntax version of the ontology fragment.

Do NOT include any other commentary outside this format.
"""

In [None]:
# Run the validation prompt through the same llama32 model that produced
# delta_ontology, and show its verdict / corrected fragment.
revision = llama32.invoke(evaluator_prompt)
print(revision)

# Iterative Integration and Completion