In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import os
import yaml

In [3]:
# Excel workbook with the annotated mnemonic labels; the relative path is
# resolved against the kernel's current working directory.
ANNOTATED_DATA_PATH = "../data_store/test_data/new_labels/new_labels.xlsx"

In [4]:
# Load the three annotated sheets in a single pass over the workbook.
# `sheet_name` takes zero-based positions, so the workbook's first sheet
# (position 0) is not read. Passing a list returns a dict keyed by position.
_annotated_sheets = pd.read_excel(ANNOTATED_DATA_PATH, sheet_name=[1, 2, 3])
first_mnemonics = _annotated_sheets[1]
second_mnemonics = _annotated_sheets[2]
third_mnemonics = _annotated_sheets[3]

In [5]:
# Stack the three sheets into one frame with a fresh index, then keep only
# the first row seen for each mnemonic (the surviving rows keep their
# post-concat index values).
mnemonics = (
    pd.concat(
        [first_mnemonics, second_mnemonics, third_mnemonics],
        ignore_index=True,
    )
    .drop_duplicates(subset=["Mnemonic"])
)
mnemonics.head()

Unnamed: 0,Mnemonic,Description in data log,PrototypeData_class (ddhub:),Correct ddhub term,In KB?,true label,Unit,Unit_class (ddhub:),Correct ddhub term.1,In KB?.1,true label.1,Quantity_class (ddhub:),Correct ddhub term.2,In KB?.2,true label.2,DataType,dataSource
0,SPM2,Pump 2 Stroke Rate,PumpRate,PumpRate,,PumpRate,1/min,Spm,Spm,,Spm,FrequencyQuantity,FrequencyQuantity,,FrequencyQuantity,Drilling,double
1,GS_TV05,MudPit Volume Average 5,ActiveVolume,,No,out-of-set,m3,CubicMetre,CubicMetre,,CubicMetre,VolumeQuantity,VolumeQuantity,,VolumeQuantity,MudLog,double
2,TOFB,zzz:undefined,,,?,uncertain,s,Second,Second,,Second,TimeQuantity,TimeQuantity,,TimeQuantity,General,double
3,BVEL,Block Velocity,HookVelocity,HookVelocity,,HookVelocity,m/s,MetrePerSecond,MetrePerSecond,,MetrePerSecond,VelocityQuantity,VelocityQuantity,,VelocityQuantity,Drilling,double
4,SPP/SPM2,zzz:undefined,,,No,rejected,unitless,Dimensionless,Dimensionless,,rejected,DimensionlessQuantity,DimensionlessQuantity|None,,rejected,General,double


In [6]:
# Input data are (mnemonic, description, unit) triples, one record per row
# of `mnemonics`. Selecting and renaming the columns avoids a Python-level
# row loop and yields the same list of dicts.
input_data = (
    mnemonics[["Mnemonic", "Description in data log", "Unit"]]
    .rename(
        columns={
            "Mnemonic": "mnemonic",
            "Description in data log": "description",
            "Unit": "unit",
        }
    )
    .to_dict(orient="records")
)

# Save input data to a YAML file.
output_file = "../data_store/test_data/Annotated data/input_data.yaml"
# Create the target directory if it is missing so open() cannot fail on it.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
# allow_unicode=True makes PyYAML emit non-ASCII characters verbatim, so the
# file must be opened as UTF-8 rather than with the platform default encoding.
with open(output_file, "w", encoding="utf-8") as file:
    yaml.dump(input_data, file, allow_unicode=True)

In [7]:
# Label data: {mnemonic: {"PrototypeData": ..., "Unit": ..., "Quantity": ...}}.
# For each kind the "Correct ddhub term*" column is used when it is filled in;
# otherwise the matching "true label*" column selects a fallback class name.
label_data = {}
mapping = {
    "uncertain" : "Uncertain",
    "out-of-set" : "OutOfSet",
    "rejected" : "OutOfSet"
}

# kind -> (column with the corrected term, column with the annotator's verdict)
LABEL_COLUMNS = {
    "PrototypeData": ("Correct ddhub term", "true label"),
    "Unit": ("Correct ddhub term.1", "true label.1"),
    "Quantity": ("Correct ddhub term.2", "true label.2"),
}


def _resolve_label(term, true_label, kind):
    """Return the label to store for one kind of annotation.

    Args:
        term: Value of the "Correct ddhub term*" cell (may be NaN).
        true_label: Value of the "true label*" cell (may be NaN).
        kind: Suffix appended to the fallback, e.g. "Unit".

    Returns:
        ``str(term)`` when a term is present. Otherwise the fallback prefix
        looked up in ``mapping`` ("Uncertain" when the verdict is missing or
        unknown) concatenated with ``kind``.
    """
    if pd.notna(term):
        return str(term)
    # Normalise via str()/strip() so a non-string or padded cell cannot crash
    # the lookup or silently miss its mapping entry.
    verdict = str(true_label).strip().lower() if pd.notna(true_label) else "uncertain"
    return mapping.get(verdict, "Uncertain") + kind


for _, row in mnemonics.iterrows():
    label_data[row["Mnemonic"]] = {
        kind: _resolve_label(row[term_column], row[verdict_column], kind)
        for kind, (term_column, verdict_column) in LABEL_COLUMNS.items()
    }

# Save label data to a YAML file.
label_output_file = "../data_store/test_data/Annotated data/label_data.yaml"
# Create the target directory if it is missing so open() cannot fail on it.
os.makedirs(os.path.dirname(label_output_file), exist_ok=True)
# allow_unicode=True writes non-ASCII characters verbatim, so force UTF-8.
with open(label_output_file, "w", encoding="utf-8") as file:
    yaml.dump(label_data, file, allow_unicode=True)

In [1]:
from openai import OpenAI

client = OpenAI()

# System prompt for the web-search based mnemonic lookup.
# Non-ASCII characters are written as \N{...} escapes so the prompt survives
# any re-encoding of the notebook file. The JSON example at the end must be
# valid JSON (no trailing comma), because the model's answer is later fed to
# json.loads and the model tends to imitate the example.
SYSTEM_PROMPT = (
    "# \N{HAMMER AND WRENCH}\N{VARIATION SELECTOR-16} System Prompt (Drilling Mnemonics Dictionary, JSON Output)\n"
    "\n"
    "You are a **Drilling Mnemonics Knowledge Extractor**.\n"
    "Search authoritative drilling sources and provide concise **mnemonic \N{RIGHTWARDS ARROW} structured definition** pairs. \n"
    "---\n"
    "\n"
    "## Rules\n"
    "\n"
    "### 1. Search-Only\n"
    "- Output only from **authoritative sources**.\n"
    "- **No invented definitions.**\n"
    "- If not found: `\"Definition\": \"undefined\"`, `\"Source\": \"undefined\"`.\n"
    "\n"
    "### 2. Output Format\n"
    "- Output a **single JSON object** where:\n"
    "  - Each mnemonic is a key; value is an object with:\n"
    "    - `\"Definition\"` (short, authoritative),\n"
    "    - `\"Unit\"` (from source or `\"undefined\"`),\n"
    "    - `\"Clarifications\"` (array or `[]`),\n"
    "    - `\"Source\"` (source name),\n"
    "\n"
    "### 3. Search Guidance\n"
    "- Use targeted queries for mnemonics or components at:\n"
    "  - site:slb.com, site:iadc.org, site:halliburton.com, site:spe.org\n"
    "- Prefer Schlumberger, IADC, Halliburton, SPE sources only.\n"
    "\n"
    "### 4. Units\n"
    "- Use **input unit**. Only add/convert units if source specifies.\n"
    "- Note unit clarifications in `\"Clarifications\"`.\n"
    "\n"
    "### 5. Example\n"
    "{\n"
    "  \"RPM\": {\n"
    "    \"Definition\": \"Revolutions Per Minute, usually Surface RPM unless specified.\",\n"
    "    \"Unit\": \"rpm\",\n"
    "    \"Clarifications\": [\"Not Downhole RPM (DRPM).\"],\n"
    "    \"Source\": \"Schlumberger Drilling Mnemonic Catalog\"\n"
    "  }\n"
    "}"
)

# Single hard-coded example query (mnemonic, description, unit).
USER_PROMPT = (
    "Input:  { \"Mnemonic\": \"SPPA\", \"Description\": \"Average Standpipe Pressure\", \"Unit\": \"kPa\"}\n"
    "Output: (json object)"
)

# Ask the model to look the mnemonic up with the web-search tool enabled.
# This performs a network request to the OpenAI API.
response = client.responses.create(
    model="gpt-4o-mini",
    input=[
        {
            "role": "system",
            "content": [{"type": "input_text", "text": SYSTEM_PROMPT}],
        },
        {
            "role": "user",
            "content": [{"type": "input_text", "text": USER_PROMPT}],
        },
    ],
    tools=[
        {
            "type": "web_search_preview",
            "search_context_size": "medium",
        }
    ],
    temperature=0.1,
    max_output_tokens=2048,
    top_p=1,
    store=True,
)

In [2]:
import json

# Pick the output items by type instead of by position: the web-search call
# is only present when the model actually searched, so indexing output[0] for
# `.action` fails as soon as the model answers directly.
_message_item = next(
    item for item in reversed(response.output)
    if getattr(item, "type", None) == "message"
)
_search_item = next(
    (item for item in response.output
     if getattr(item, "type", None) == "web_search_call"),
    None,
)

raw_response = _message_item.content[0].text
# The model may wrap its JSON in a Markdown code fence; drop the fence markers.
raw_response = raw_response.replace("```json", "").replace("```", "")
# None when no web search was performed for this request.
search_response = _search_item.action if _search_item is not None else None
parsed_response = json.loads(raw_response)

In [3]:
# Show the parsed JSON answer alongside the recorded web-search action.
parsed_response, search_response


({'SPPA': {'Definition': 'Average Standpipe Pressure, measured in kilopascals (kPa).',
   'Unit': 'kPa',
   'Clarifications': ['Measured at the standpipe manifold, just before the flex hose connecting to the swivel.'],
   'Source': 'undefined'}},
 {'type': 'search',
  'query': 'SPPA Average Standpipe Pressure site:slb.com OR site:iadc.org OR site:halliburton.com OR site:spe.org'})

In [1]:
import rdflib
import json
from openai import OpenAI

client = OpenAI()

# Turtle files that make up the mnemonic knowledge bases.
# NOTE(review): earlier cells address data_store via "../data_store/..."; these
# paths have no "../" prefix - confirm the working directory this cell expects.
FILES = [
    "data_store/DDHub_model/slb_scrape.ttl",
    "data_store/DDHub_model/petrospec_scrape.ttl",
]


def _load_graph(ttl_path):
    """Parse one Turtle file and return whatever ``Graph.parse`` returns."""
    return rdflib.Graph().parse(ttl_path, format="ttl")


# One parsed graph per entry in FILES, in the same order.
KBs = list(map(_load_graph, FILES))

In [8]:
# Prompt that asks the model to break a mnemonic/unit pair into the candidate
# components to look up. The example input is hard-coded at the end.
SPLIT_PROMPT = """Task Objective: Output List of Mnemonic Components
    Do it step by step strictly:
    1. For Both Mnemonic and Unit:
        a. Split the mnemonic by special symbol like '_', '#'.
        b. Consider to split the mnemonic if there is a number suffix. It often means the index of the series data channel, such as 'SPM2' means 'the 2nd SPM data'. But the number can also be part of a name, such as 'C3' can mean 'Propane'.
        c. Split the mnemonic if it includes a combination of a number and a time unit as a suffix. This often represents a data filter, e.g., RPM30s means "the average RPM over 30 seconds".
        d. Split the mnemonic if it includes strings as prefix.
        e. Consider to split the mnemonic if some parts of the mnemonic can be considered as prefix or suffix suggesting collect operation. The most common are: T - Total, A - Average, M - Minimum, X - Maximum, C - Cumulative
            - For example THKD, it can be splitted into 'T' and 'HKD', as 'HKD' can be a drilling term 'Hookload' and 'T' stands for 'Total'.
    2. Output List of Mnemonic Components to be queried for. No Explanations. ('[ "THKD" , "T", "HKD", "kg"]')

    Input: { "Mnemonic": "SPPA", "Description": "Average Standpipe Pressure", "Unit": "kPa"}
    Output: (list of mnemonic components)
"""

# The whole task is sent as a single user turn; no tools are enabled here.
_split_message = {
    "role": "user",
    "content": [
        {"type": "input_text", "text": SPLIT_PROMPT},
    ],
}

response = client.responses.create(
    model="gpt-4o-mini",
    input=[_split_message],
    temperature=0.1,
    max_output_tokens=2048,
    top_p=1,
    store=True,
)

In [9]:
# Parse the model's answer into a Python list of mnemonic components.
# Strip Markdown code-fence markers first so a fenced reply still parses.
_raw_components = response.output[0].content[0].text
_raw_components = _raw_components.replace("```json", "").replace("```", "").strip()
output_list = json.loads(_raw_components)
output_list

['SPPA', 'SPP', 'A', 'kPa']

In [10]:
# SPARQL templates for the mnemonic knowledge bases. Each one contains the
# placeholder <mnemonic> inside a quoted literal; callers substitute it
# before running the query.

# Header shared by every template (leading newline included).
_SPARQL_PREFIXES = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX ddhub: <http://ddhub.no/>
"""

# Units carrying the given mnemonic, with their description.
UNIT_QUERY_TEMPLATE = _SPARQL_PREFIXES + """SELECT DISTINCT ?unit ?description
WHERE {
  ?unit a ddhub:Unit ;
        ddhub:HasMnemonic "<mnemonic>" ;
        ddhub:HasDescription ?description .
}
"""

# Channels carrying the given mnemonic, with their description.
CHANNEL_QUERY_TEMPLATE = _SPARQL_PREFIXES + """SELECT DISTINCT ?channel ?description
WHERE {
  ?channel a ddhub:Channel ;
           ddhub:HasMnemonic "<mnemonic>" ;
           ddhub:HasDescription ?description .
}
"""

# Channels linked from a unit with the given mnemonic.
UNIT_TO_CHANNEL = _SPARQL_PREFIXES + """SELECT DISTINCT ?unit ?channel
WHERE {
  ?unit a ddhub:Unit ;
        ddhub:HasMnemonic "<mnemonic>" ;
        ddhub:IsUnitForChannel ?channel .
}
"""

# Units linked from a channel with the given mnemonic.
CHANNEL_TO_UNIT = _SPARQL_PREFIXES + """SELECT DISTINCT ?channel ?unit
WHERE {
  ?channel a ddhub:Channel ;
           ddhub:HasMnemonic "<mnemonic>" ;
           ddhub:ChannelHasUnit ?unit .
}
"""


In [11]:
DDHUB = rdflib.Namespace("http://ddhub.no/")
# Work on a copy so later changes to test_list never mutate output_list.
test_list = list(output_list)


def _escape_sparql_literal(value):
    """Escape backslashes, double quotes and line breaks in ``value``.

    The result can be placed between double quotes in a SPARQL query without
    ending the string literal early.
    """
    return (
        str(value)
        .replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )


# (query template, message format) pairs, run in this order for every
# component. {0} and {1} are the two selected columns; {kb} is the graph
# identifier.
_QUERY_REPORTS = [
    (UNIT_QUERY_TEMPLATE, "Found unit: {0}, description: {1} in {kb}"),
    (CHANNEL_QUERY_TEMPLATE, "Found channel: {0}, description: {1} in {kb}"),
    (UNIT_TO_CHANNEL, "Found unit: {0} is for channel: {1} in {kb}"),
    (CHANNEL_TO_UNIT, "Found channel: {0} has unit: {1} in {kb}"),
]

for kb in KBs:
    for output in test_list:
        mnemonic_literal = _escape_sparql_literal(output)
        for template, report in _QUERY_REPORTS:
            kb_query = template.replace("<mnemonic>", mnemonic_literal)
            # Execute the query against the knowledge base
            results = kb.query(kb_query)
            for row in results:
                print(report.format(row[0], row[1], kb=kb.identifier))

Found channel: http://ddhub.no/Channel/SPPA, description: Standpipe Pressure in Nf4278ed25793454daf8dd93cbd939bd4
Found channel: http://ddhub.no/Channel/SPP, description: Measure of SP Plus Sqrt(Sigma**2+Tau**2) in Nf4278ed25793454daf8dd93cbd939bd4
Found channel: http://ddhub.no/Channel/SPPA, description: 1.21 Stand Pipe Pressure - Avg; Measured stand pipe pressure, averaged over the interval. in Nfc5bc837e4194ea98a53073a2cd0c9ac
Found channel: http://ddhub.no/Channel/SPPA has unit: http://ddhub.no/Unit/KPA in Nfc5bc837e4194ea98a53073a2cd0c9ac
Found channel: http://ddhub.no/Channel/SPPA has unit: http://ddhub.no/Unit/PSI in Nfc5bc837e4194ea98a53073a2cd0c9ac


KeyboardInterrupt: 

In [12]:
# Display the most recently built SPARQL query string (debugging aid).
# NOTE(review): the output recorded for this cell uses `hasMnemonic` with an
# unquoted value, unlike the current templates - presumably stale; re-run.
kb_query

'\nPREFIX ddhub: <http://ddhub.no/>\nPREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>\n\nSELECT DISTINCT ?unit ?description\nWHERE {\n  ?unit rdf:type ddhub:Unit ;\n        ddhub:hasMnemonic SPPA ;\n        ddhub:hasDescription ?description .\n}\n'