In [1]:
import json
import os
import csv
import rdflib
from rdflib import Graph, URIRef, Literal, Namespace, BNode
from rdflib.namespace import SKOS, DCTERMS, DCMITYPE, RDF, RDFS, XSD, PROV, SDO, TIME, split_uri

from openai import OpenAI
from google import genai
from google.genai import types
import anthropic

import re
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import networkx as nx
from collections import defaultdict

import PIL.Image
import itertools

In [8]:
# Load the API keys from the JSON file named 'config'. Expected structure
# (one entry per key read below):
# {"openai_api_key": "......", "gemini_api_key": "......", ...}

# A context manager closes the file handle as soon as the JSON is parsed
# (the bare open() call used previously left the handle open).
with open('config', 'r', encoding='utf-8') as config_file:
    config = json.load(config_file)

# Environment variable name -> key name inside the config file.
_ENV_TO_CONFIG_KEY = {
    'OPENAI_API_KEY': 'openai_api_key',
    'GEMINI_API_KEY': 'gemini_api_key',
    'XAI_API_KEY': 'xai_api_key',
    'NVIDIA_API_KEY': 'nvidia_api_key',
    'DEEPSEEK_API_KEY': 'deepseek_api_key',
    'ANTHROPIC_API_KEY': 'claude_api_key',
    'DASHSCOPE_API_KEY': 'dashscope_api_key',
}

# Export every key through the process environment so later cells can read it.
# A missing entry raises KeyError, exactly as a direct config[...] lookup does.
for _env_name, _config_key in _ENV_TO_CONFIG_KEY.items():
    os.environ[_env_name] = config[_config_key]

In [43]:
def convert_ttl_to_json(input_file_path, output_file_path):
    """
    Turn a line-oriented TTL-like text file into a JSON dataset file.

    Every non-blank line of the input becomes one record whose "rdf_star"
    field holds the whitespace-stripped line; two further text fields are
    created empty. The records are written as {"dataset": [...]} with an
    indent of 2.

    Failures are reported with print() instead of being raised.

    Args:
        input_file_path (str): The path to the input .ttl file.
        output_file_path (str): The path where the output .json file will be saved.
    """
    try:
        with open(input_file_path, 'r', encoding='utf-8') as source:
            # Strip surrounding whitespace and drop lines that end up empty
            statements = [text for text in (raw.strip() for raw in source) if text]

        # One record per statement, with empty placeholders for the generated texts
        dataset = [
            {
                "rdf_star": statement,
                "zero_text_qwen3:235b": "",
                "one_text_qwen3:235b": "",
            }
            for statement in statements
        ]

        with open(output_file_path, 'w', encoding='utf-8') as sink:
            json.dump({"dataset": dataset}, sink, indent=2)

        print(f"Successfully converted {input_file_path} to {output_file_path}")

    except FileNotFoundError:
        print(f"Error: The file '{input_file_path}' was not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

# --- How to use the script ---

# 1. Make sure this Python script is in the same directory as your input file.
# 2. Change 'out_A_B_filtered.ttl' to the actual name of your input file if it's different.
# 3. The output will be saved as 'rdf_verbal_dataset.json' in the same directory.

if __name__ == '__main__':
    # Source TTL file and the JSON dataset file generated from it
    input_filename = 'out_A_B_filtered.ttl'
    output_filename = 'rdf_verbal_dataset.json'

    # Convert, naming each path explicitly at the call site
    convert_ttl_to_json(
        input_file_path=input_filename,
        output_file_path=output_filename,
    )


Successfully converted out_A_B_filtered.ttl to rdf_verbal_dataset.json


In [44]:
# Read the dataset file back from disk and parse it
with open('rdf_verbal_dataset.json', mode='r', encoding='utf-8') as file:
    data = json.loads(file.read())

# Keep a direct reference to the list of records stored under "dataset"
dataset = data['dataset']

### Convert RDF triples to text using LLMs: zero-shot and one-shot prompting

In [45]:
# System prompt for zero-shot verbalization: it states the task, the input
# format and five rules, and contains no worked example.
system_prompt_0t = """# Prompt for Knowledge Graph Verbalization

**Objective:** Your task is to convert a set of RDF triples, representing a small knowledge graph, into a coherent, natural-language paragraph.

**Input Format:**
You will be given a list of RDF triples. Each triple is structured as `(Subject, Predicate, Object)`. All subjects, predicates, and objects are provided as human-readable string labels.

**Core Instructions:**

1.  **Synthesize, Don't Just List:** Do not simply state each triple as a standalone fact. Instead, analyze the connections between the triples. When the same entity appears as a subject or object in multiple triples, use this connection to form more complex and natural sentences.

2.  **Accurate Verbalization:** Translate the `(Subject, Predicate, Object)` structure into grammatically correct sentences. The predicate represents the relationship between the subject and the object.

3.  **Qualitative Probability:** When a "probability" predicate is present, you **must** convert the numerical percentage into a natural, qualitative, textual description. Do not use the raw percentage in the final text.

4.  **Strict Data Adherence:** This is the most important rule. **You must not add any information that is not explicitly present in the provided set of triples.** Do not make assumptions, infer missing relationships, or use any external knowledge. The output text must be a direct and faithful verbalization of *only* the input data.

5.  **Coherent Narrative:** The final output should be a single, well-formed paragraph that reads smoothly."""

In [46]:
# System prompt for one-shot verbalization: the task statement and rules are
# followed by a single worked example showing one correct output and one
# incorrect output that breaks rule #3.
system_prompt_1t = """# Prompt for Knowledge Graph Verbalization

**Objective:** Your task is to convert a set of RDF triples, representing a small knowledge graph, into a coherent, natural-language paragraph.

**Input Format:**
You will be given a list of RDF triples. Each triple is structured as `(Subject, Predicate, Object)`. All subjects, predicates, and objects are provided as human-readable string labels.

**Core Instructions:**

1.  **Synthesize, Don't Just List:** Do not simply state each triple as a standalone fact. Instead, analyze the connections between the triples. When the same entity appears as a subject or object in multiple triples, use this connection to form more complex and natural sentences.

2.  **Accurate Verbalization:** Translate the `(Subject, Predicate, Object)` structure into grammatically correct sentences. The predicate represents the relationship between the subject and the object.

3.  **Qualitative Probability:** When a "probability" predicate is present, you **must** convert the numerical percentage into a natural, qualitative, textual description. Do not use the raw percentage in the final text.

4.  **Strict Data Adherence:** This is the most important rule. **You must not add any information that is not explicitly present in the provided set of triples.** Do not make assumptions, infer missing relationships, or use any external knowledge. The output text must be a direct and faithful verbalization of *only* the input data.

5.  **Coherent Narrative:** The final output should be a single, well-formed paragraph that reads smoothly.

---

### Example

**Input RDF Triples in RDF-star:**
<< ex:Fourier-multiplier-operator-with-symbol-that-satisfies-Hytonen-anisotropic-Mihlin-type-condition ex:is_same_as ex:Fourier-multiplier-operator-that-is-bounded-on-lp >> ex:qualifier "=100%" .
**Correct Output Text:**
"It is certain that a Fourier multiplier operator with a symbol that satisfies the Hytonen anisotropic Mihlin-type condition is the same as a Fourier multiplier operator that is bounded on Lp."

**Incorrect Output (Violates Rule #3 by using the numerical probability):**
"There is a 100% probability that a Fourier multiplier operator with a symbol that satisfies the Hytonen anisotropic Mihlin-type condition is the same as a Fourier multiplier operator that is bounded on Lp."
"""

In [49]:
from tqdm import tqdm

# --- Configuration ---
# NOTE: this cell requires the DASHSCOPE_API_KEY environment variable.
# From a terminal it can be provided with:
# export DASHSCOPE_API_KEY='your_actual_api_key'

# Read the key from the environment; stop right away when it is absent or empty
api_key = os.environ.get("DASHSCOPE_API_KEY")
if not api_key:
    raise ValueError("API key not found. Please set the DASHSCOPE_API_KEY environment variable.")

# Endpoint of the Dashscope service
BASE_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1"

# Model queried for every record
MODEL_NAME = "qwen3-4b"
# System prompt sent unchanged with every call
SYSTEM_PROMPT = system_prompt_1t

# User prompt template; the "{}" placeholder receives the RDF-star content
USER_PROMPT_TEMPLATE = """Now, please verbalize the following set of RDF triples into a single, concise paragraph, following all the rules above.

**Output Format:** Your response must contain **only the final verbalized paragraph** in plaintext and nothing else. Do not add any introductory phrases, comments, or explanations.

**Input RDF Triples in RDF-star:**
{}"""


# --- File Paths ---
# JSON file the records are read from, and JSON file the updated records are written to
INPUT_JSON_PATH = 'rdf_verbal_qwen3_235b_full.json'
OUTPUT_JSON_PATH = 'rdf_verbal_qwen3_4b_full.json'


def get_llm_response(client, user_prompt):
    """
    Send one system + user chat request to the LLM and collect the streamed answer.

    The request uses the module-level MODEL_NAME and SYSTEM_PROMPT, asks for a
    streaming response and passes {"enable_thinking": True} as extra body.
    Only the `content` pieces of the streamed deltas are accumulated.

    Args:
        client (OpenAI): The initialized OpenAI client.
        user_prompt (str): The full user prompt to be sent to the LLM.

    Returns:
        str: The concatenated answer text with surrounding whitespace removed.
        If any exception occurs, the error message is printed and that
        message is returned instead of an answer.
    """
    try:
        stream = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_prompt},
            ],
            extra_body={"enable_thinking": True},
            stream=True,
        )

        pieces = []
        for chunk in stream:
            # Chunks without choices (e.g. usage statistics) carry no text
            if chunk.choices:
                piece = getattr(chunk.choices[0].delta, "content", None)
                if piece:
                    pieces.append(piece)

        return "".join(pieces).strip()

    except Exception as e:
        error_message = f"Error: An exception occurred during the API call: {e}"
        print(error_message)
        return error_message


def process_records(input_path, output_path, output_key="one_text_qwen3:4b"):
    """
    Loads records from the input JSON, processes each one with the LLM,
    and saves the updated records to the output JSON.

    Each record with a non-empty "rdf_star" value is formatted into
    USER_PROMPT_TEMPLATE, sent through get_llm_response(), and the returned
    text is stored on the record under `output_key`. Records without
    "rdf_star" text are left untouched. The whole document is written once,
    after every record has been processed.

    Args:
        input_path (str): Path of the JSON file holding {"dataset": [...]}.
        output_path (str): Path of the JSON file to write the updated data to.
        output_key (str): Record field that receives the LLM text. The default
            keeps the previously hard-coded name; pass a different name when
            SYSTEM_PROMPT or MODEL_NAME is changed so the field label matches
            the run that produced it.

    Returns:
        None. Problems with the input file are reported with print() and end
        the function early without writing any output.
    """
    # 1. Load the input JSON file
    try:
        with open(input_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"Error: Input file not found at '{input_path}'")
        return
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from '{input_path}'")
        return

    # A JSON root that is not an object has no "dataset" entry at all;
    # treat it like a missing/empty dataset instead of crashing on .get().
    dataset = data.get("dataset", []) if isinstance(data, dict) else []
    if not dataset:
        print("Input file does not contain a 'dataset' list or the list is empty.")
        return

    # Initialize the LLM client once and reuse it for every record
    client = OpenAI(
        api_key=api_key,
        base_url=BASE_URL,
    )

    # 2. Process each record sequentially with a progress bar
    print(f"Processing {len(dataset)} records...")
    for record in tqdm(dataset):
        rdf_star_text = record.get("rdf_star", "")
        if not rdf_star_text:
            # Nothing to verbalize for this record
            continue

        # Format the full user prompt using the template
        user_prompt = USER_PROMPT_TEMPLATE.format(rdf_star_text)

        # Store the LLM response under the requested field name
        record[output_key] = get_llm_response(client, user_prompt)

    # 3. Save the updated data to the output JSON file
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    print(f"\nProcessing complete. Updated data saved to '{output_path}'")


# --- Main execution block ---
if __name__ == '__main__':
    # Verbalize every record of the configured input file and write the configured output file
    process_records(input_path=INPUT_JSON_PATH, output_path=OUTPUT_JSON_PATH)

Processing 68 records...


100%|██████████| 68/68 [19:25<00:00, 17.14s/it]


Processing complete. Updated data saved to 'rdf_verbal_qwen3_4b_full.json'





In [52]:
import json

def add_field_to_json(source_path, target_path, key_to_add='zero_text_qwen3:4b', id_key='rdf_star'):
    """
    Copy one field from the records of a source JSON file into the matching
    records of a target JSON file.

    Both files are expected to hold a list of records under a top-level
    'dataset' key. Records are matched on the value stored under `id_key`;
    for every target record with a match, `key_to_add` is set to the source
    value. The target file is then rewritten in place with an indent of 4.
    All problems are reported with print() instead of being raised.

    Args:
        source_path (str): The path to the JSON file to read the new data from.
        target_path (str): The path to the JSON file that will be modified.
        key_to_add (str): The name of the key to add to the target file.
        id_key (str): The key to use as a unique identifier to match objects.
    """
    try:
        # Step 1: build {identifier: value} from the source records that carry
        # both the identifier and the field to copy.
        with open(source_path, 'r', encoding='utf-8') as source_file:
            source_data = json.load(source_file)

        if not ('dataset' in source_data and isinstance(source_data['dataset'], list)):
            print(f"Warning: 'dataset' key not found or is not a list in the source file '{source_path}'.")
            return  # Source data is not shaped as expected

        lookup_map = {
            entry[id_key]: entry[key_to_add]
            for entry in source_data['dataset']
            if id_key in entry and key_to_add in entry
        }

        # Step 2: load the file that is going to be modified.
        with open(target_path, 'r', encoding='utf-8') as target_file:
            target_data = json.load(target_file)

        if not ('dataset' in target_data and isinstance(target_data['dataset'], list)):
            print(f"Warning: 'dataset' key not found or not a list in the target file '{target_path}'.")
            return

        # Step 3: copy the value into every target record that has a match.
        items_modified = 0
        for record in target_data['dataset']:
            if id_key not in record:
                continue
            identifier = record[id_key]
            if identifier in lookup_map:
                record[key_to_add] = lookup_map[identifier]
                items_modified += 1

        # Step 4: overwrite the target file with the merged data.
        with open(target_path, 'w', encoding='utf-8') as target_file:
            json.dump(target_data, target_file, indent=4, ensure_ascii=False)

        print("✅ Processing complete!")
        print(f"Updated {items_modified} records in the file: {target_path}")

    except FileNotFoundError as e:
        print(f"❌ Error: The file '{e.filename}' was not found.")
        print("Please make sure the file exists and the path is correct.")
    except json.JSONDecodeError:
        print("❌ Error: Could not decode JSON from one of the files.")
        print("Please ensure both files contain valid JSON.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

# --- How to use this script ---

# 1. Make sure this python script is in the same directory as your JSON files.
# 2. Define the paths for your source and target files.
#    Replace the placeholder names with your actual file names.

# Source: the file that already holds the 'zero_text_qwen3:4b' values.
source_file_path = '../rdf_verbal_qwen3-4b_full.json'

# Target: the file that receives the 'zero_text_qwen3:4b' values.
# NOTE: this file is read and then overwritten in place with the merged result.
target_file_path = 'rdf_verbal_qwen3_full.json'

# 3. Merge the field from the source file into the target file
#    (default field name and default identifier key).
add_field_to_json(source_file_path, target_file_path)

✅ Processing complete!
Updated 68 records in the file: rdf_verbal_qwen3_full.json


### Evaluate different strategies

In [62]:
import pandas as pd
from openai import OpenAI, OpenAIError
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import os

# --- Configuration ---

# --- OpenAI Client Setup ---
# This script uses the OpenAI Python library version 1.0.0 or later.
# The client automatically looks for the "OPENAI_API_KEY" environment variable.
# Make sure it is set in your environment before running the script.
#
# The client is created only inside the try block: constructing it beforehand
# as well meant a configuration error surfaced before this handler could
# report it.
try:
    client = OpenAI()
    # A quick test to see if the client is configured with a valid key
    client.models.list()
    print("OpenAI client initialized successfully.")
except OpenAIError as e:
    print(f"Error initializing OpenAI client: {e}")
    print("Please ensure your OPENAI_API_KEY environment variable is set correctly.")
    # Re-raise rather than calling exit(): execution still stops here and the
    # original error stays visible, without asking the interpreter/notebook
    # session itself to terminate.
    raise


# Excel workbook to read, and the workbook written with the match columns added
INPUT_FILE_PATH = 'beichen_final.xlsx'
OUTPUT_FILE_PATH = 'beichen_final_with_matches.xlsx'

# Reference phrases every word is compared against.
# They are all lowercase so the comparison is case-insensitive.
BINS = [
    "certain",
    "almost certain",
    "highly likely",
    "very good chance",
    "we believe",
    "likely",
    "probable",
    "probably",
    "better than even",
    "about even",
    "probably not",
    "we doubt",
    "unlikely",
    "improbable",
    "chances are slight",
    "little chance",
    "highly unlikely",
    "almost no chance",
    "impossible",
]

# Name of the Excel column holding the words to analyze
COLUMN_NAME = "prob_word"

# --- Functions ---

def get_embeddings(texts, model="text-embedding-3-large"):
    """
    Request embeddings for a list of texts from the OpenAI API.

    Uses the module-level `client` (openai library v1.0.0+ interface).

    Args:
        texts: The texts to embed, passed to the API as `input`.
        model: Name of the embedding model to use.

    Returns:
        A list with one embedding per item of the API response, or None when
        the API call raises an OpenAIError (the error is printed).
    """
    try:
        response = client.embeddings.create(input=texts, model=model)
    except OpenAIError as e:
        print(f"An error occurred with the OpenAI API: {e}")
        return None
    else:
        # The v1.0.0+ response is an object: every entry of response.data
        # exposes its vector through the `embedding` attribute.
        return [entry.embedding for entry in response.data]

def find_best_match(prob_word_embeddings, bin_embeddings, bins_list):
    """
    Finds the best matching bin for each probability word based on cosine similarity.

    Args:
        prob_word_embeddings: One embedding vector per word; a list of
            vectors or a 2-D NumPy array.
        bin_embeddings: One embedding vector per bin, in the same order as
            `bins_list`; a list of vectors or a 2-D NumPy array.
        bins_list: The bin labels.

    Returns:
        A tuple (best_matches, best_scores): for every word, the label of the
        bin with the highest cosine similarity and that similarity value.
        Returns ([], []) when either embedding collection is None or empty.
    """
    # Test emptiness with len(): `not array` raises ValueError for NumPy
    # arrays with more than one element, so the plain truthiness check only
    # worked for lists.
    if (
        prob_word_embeddings is None
        or bin_embeddings is None
        or len(prob_word_embeddings) == 0
        or len(bin_embeddings) == 0
    ):
        return [], []

    # Rows correspond to words, columns to bins
    similarity_matrix = cosine_similarity(prob_word_embeddings, bin_embeddings)

    # Column index of the most similar bin for each word
    best_match_indices = np.argmax(similarity_matrix, axis=1)

    best_matches = [bins_list[i] for i in best_match_indices]
    # Pick each row's winning score in one indexing operation
    row_indices = np.arange(len(best_match_indices))
    best_scores = list(similarity_matrix[row_indices, best_match_indices])

    return best_matches, best_scores

def process_file(input_path, output_path, column_name, bins_list):
    """
    Main function to read the file, process the data, and write the output.

    Reads an Excel file, lowercases the values of `column_name`, embeds them
    and the bins with get_embeddings(), picks the closest bin for every value
    with find_best_match(), and writes the frame with two extra columns
    ('semantic_match', 'similarity_score') to `output_path`.

    Args:
        input_path: Path of the Excel file to read.
        output_path: Path of the Excel file to write.
        column_name: Column holding the words to analyze.
        bins_list: Labels to match every word against.

    Returns:
        None. Every failure (unreadable file, missing column, no embeddings,
        no matches, write error) is reported with print() and ends the
        function early.
    """
    # 1. Read the input Excel file
    try:
        # Use read_excel for .xlsx files
        df = pd.read_excel(input_path)
        print(f"Successfully read {input_path}")
    except FileNotFoundError:
        print(f"Error: The file '{input_path}' was not found.")
        return
    except Exception as e:
        print(f"An error occurred while reading the Excel file: {e}")
        print("Please ensure 'openpyxl' is installed: pip install openpyxl")
        return

    # Report a missing column the same way as every other failure here,
    # instead of letting the lookup below raise an unhandled KeyError.
    if column_name not in df.columns:
        print(f"Error: Column '{column_name}' was not found in '{input_path}'.")
        return

    # 2. Get the list of words from the specified column
    # All items are cast to str and lowercased to avoid case sensitivity issues.
    prob_words = df[column_name].astype(str).str.lower().tolist()
    print(f"Found {len(prob_words)} words in column '{column_name}' and converted them to lowercase.")

    # 3. Get embeddings for both lists of words
    print("Getting embeddings from OpenAI...")
    prob_word_embeddings = get_embeddings(prob_words)
    bin_embeddings = get_embeddings(bins_list)

    # get_embeddings returns None on an API error
    if not prob_word_embeddings or not bin_embeddings:
        print("Could not retrieve embeddings. Exiting.")
        return

    print("Embeddings received successfully.")

    # 4. Find the best match for each word
    print("Calculating similarities and finding best matches...")
    best_matches, best_scores = find_best_match(prob_word_embeddings, bin_embeddings, bins_list)

    if not best_matches:
        print("Could not find best matches. Exiting.")
        return

    # 5. Add the results to the DataFrame
    df['semantic_match'] = best_matches
    df['similarity_score'] = best_scores
    print("Added new columns 'semantic_match' and 'similarity_score' to the data.")

    # 6. Write the updated DataFrame to a new Excel file
    try:
        # Use to_excel for .xlsx files
        df.to_excel(output_path, index=False)
        print(f"Successfully wrote the results to {output_path}")
    except IOError:
        print(f"Error: Could not write to the file '{output_path}'.")
    except Exception as e:
        print(f"An error occurred while writing the Excel file: {e}")


# --- Main Execution ---
if __name__ == "__main__":
    # Run the matching pipeline with the constants configured for this cell
    process_file(
        input_path=INPUT_FILE_PATH,
        output_path=OUTPUT_FILE_PATH,
        column_name=COLUMN_NAME,
        bins_list=BINS,
    )

OpenAI client initialized successfully.
Successfully read beichen_final.xlsx
Found 168 words in column 'prob_word' and converted them to lowercase.
Getting embeddings from OpenAI...
Embeddings received successfully.
Calculating similarities and finding best matches...
Added new columns 'semantic_match' and 'similarity_score' to the data.
Successfully wrote the results to beichen_final_with_matches.xlsx
