In [45]:
import json
from openai import OpenAI
import os
import sys
from time import sleep
from string import Template
import glob

# Load the reference dictionaries: characters/creatures and spells/objects/potions.
# The encoding is pinned to UTF-8 so the read does not depend on the platform's
# locale default (the chapter files are read as UTF-8 as well).
with open('ref/Node_Dictionary.json', 'r', encoding='utf-8') as file:
    node_data = json.load(file)

with open('ref/Special.json', 'r', encoding='utf-8') as file:
    magic_data = json.load(file)


# Map each entry's name to its id for fast lookup.
node_lookup = {item['name']: item['id'] for item in node_data}
magic_lookup = {item['name']: item['id'] for item in magic_data}


# Initialise the OpenAI-compatible client against the DeepSeek endpoint.
# The key is read from the DEEPSEEK_API_KEY environment variable so it never has
# to be pasted into the notebook; it falls back to an empty string when unset.
client = OpenAI(
    api_key=os.environ.get("DEEPSEEK_API_KEY", ""),
    base_url="https://api.deepseek.com",
)

In [56]:
# static_system_prompt = """
# Extract creatures and relationships from the following text to generate a JSON file in order to construct a knowledge graph for Neo4j.

# Instructions:

# 1. Identify Creatures (nodes) and Characters (nodes) as "obj" and "sub". In the case of "obj", it is the character/creature who performs an action. In the case of "sub", it is the character/creature who receives the action directly; do not extract a relationship if either "obj" or "sub" is unclear. All nodes must be found in the following dictionary.
# $node_data

# 2. Identify relationships (edges: may be actions, attitudes, or other interactions) between "obj" and "sub". For the relationships "obj" should not equal to "sub".

# 3. For the relationships field, there are 2 types, the first one is types_1: "obj" and "sub" are in the same time and space, state clearly which type of relationship it is, the second one is types_2: "obj" and "sub" are not in the same time and space, state clearly which type of relationship it is.

# 4. For the "id" field in "obj" and "sub", use the corresponding "id" from the dictionary.

# 5. We need to make sure that only the important relationships are extracted, if the relationship is not clear, do not extract it, for each input return only one json object.

# 6. For any node identified in Step 1, if the node's name is not mentioned explicitly or through pronouns in the text but is mapped to the dictionary by other cues (e.g., context, appearance, clothes, or indirect references), prefix its id with [pred_] in the relationships. This indicates that the character was predicted or inferred rather than directly named; both "obj" and "sub" should be prefixed with [pred_] if they are not explicitly mentioned in the text.

# 7. The output should look like:
# {
#     "obj": [{"id": string, "name": string}],
#     "sub": [{"id": string, "name": string}],
#     "relations": ["obj['id']|{type_[]}HAS_ACTION|sub['id']"]
# }
# Example Output:
# {
#     "obj": [{"id": "Char_1", "name": "Harry Potter"},
#             {"id": "Char_28", "name": "Vernon Dursley"},
#             {"id": "Crea_220", "name": "Barn owl"}],
            
#     "sub": [{"id": "Chea_26", "name": "Hungarian Horntail"},
#             {"id": "Char_1", "name": "Harry Potter"},
#             {"id": "Char_5", "name": "Minerva McGonagall"},
#             {"id": "Char_10", "name": "Neville Longbottom"}],
            
#     "relations": ["Char_1|{type_1}pulled out|Crea_26",
#                 "Char_1|{type_1}receives a Nimbus Two Thousand from|Char_5",
#                 "Char_28|{type_2}notices strangely dressed people talking|Char_1",
#                 "Crea_220|{type_1}brought Neville a small package|Char_10"],
# }
# """

# static_system_prompt = """
# You are an expert text analyzer tasked with extracting references to important characters and creatures from a novel. 

# ## Task:
# 1. You will receive:
#    - A **dictionary** containing a list of important characters and creatures.
#    - The **full text of a chapter** from the novel.
# 2. Your goals:
#    - Identify which characters or creatures from the dictionary appear in the given chapter.
#    - Analyze and summarize **interactions** between each character/creature.
#    - If two entities **do not obviously interact**, do not summarize them.
   
# ## **Instructions:**
# - **Entity Identification**
#   - Compare all names and aliases from `$node_data` with the chapter text.
#   - Recognize shortened forms (e.g., "Harry Potter" → "Harry").
#   - Treat names as case-insensitive.
#   - Ignore unrelated mentions (e.g., "rose" as a flower when "Rose" is a character).

# - **Interaction Analysis**
#   - For each identified character/creature pair, determine if they **directly interact** in this chapter.
#   - An **interaction** includes direct conversations, actions involving both entities, or one entity affecting another.
#   - If no interaction is found, do not summarize.

# **Output Rules**:
# - Ensure the JSON output is **fully enclosed with `{}` and `[]`**.
# - Do **not** truncate or generate incomplete JSON.
# - Avoid escape issues with quotes (`"`).
# - If there is **no interaction** between two entities, do not add interaction between them.
# - Return a properly formatted **JSON object**.

# ### **Expected JSON Output Format**
# ```json
# {
#     "Nodes": [{"id": "Crea_26", "name": "Hungarian Horntail"},
#             {"id": "Char_1", "name": "Harry Potter"},
#             {"id": "Char_5", "name": "Minerva McGonagall"},
#             {"id": "Char_10", "name": "Neville Longbottom"}]
#     "Interactions": [
#         {"subject": "Char_1", "object": "Crea_26", "summary": **summarize content**}
#         {"subject": "Char_1", "object": "Char_5", "summary": **summarize content**}
#     ]
# }

# """
# System prompt template. `$node_data` and `$magic_data` are string.Template
# placeholders that are filled with the name -> id lookups right below.
# The few-shot example must itself be valid, self-consistent JSON, because the
# prompt asks the model to return a properly formatted JSON object with
# |id|-delimited references in every summary.
system_prompt_template = """
You are an expert text analyzer tasked with extracting references to important **characters, creatures, spells, magical objects, and potions** from a novel.

## **Task:**
1. You will receive:
   - A **dictionary** containing a list of important **characters and creatures**.
   - A **dictionary** containing a list of important **spells, magical objects, and potions**.
   - The **full text of a chapter** from the novel.
2. Your goals:
   - Identify which **characters, creatures, spells, magical objects, and potions** appear in the chapter.
   - Summarize **direct interactions** between characters and creatures.
   - If an interaction involves a spell, magical object, or potion, **record them in the `"special"` field**.

## **Instructions:**
- **Entity Identification**
  - Compare all names and aliases from `$node_data` (characters & creatures) and `$magic_data` (spells, objects, potions) with the chapter text.
  - Recognize shortened forms (e.g., "Harry Potter" → "Harry", "Harry Potter" → "Potter").
  - Treat names as case-insensitive.
  - Ignore unrelated mentions (e.g., "rose" as a flower when "Rose" is a character).

- **Interaction Analysis**
  - Identify **direct interactions** between characters and creatures only.
  - **An interaction is valid if:** 
    - **Characters or creatures talk to, assist, or oppose each other**.
    - **One character acts upon another (e.g., casting a spell, using an object, giving a potion, or engaging in a fight)**.
    - **Creatures attack, defend, or react to a character**.
  - **Every item in `"special"` must be explicitly linked to an action in the `"summary"` field.**

---

## **Output Rules**:
- Ensure the JSON output is **fully enclosed with `{}` and `[]`**.
- Do **not** truncate or generate incomplete JSON.
- Avoid escape issues with quotes (`"`).
- **Only include interactions where characters/creatures directly engage with each other**.
- Return a properly formatted **JSON object**.

---

### **Expected JSON Output Format**
```json
{
    "Character": [
        {"id": "Node_26", "name": "Hungarian Horntail", "type": "Creature"},
        {"id": "Node_290", "name": "Harry Potter", "type": "Character"},
        {"id": "Node_292", "name": "Ron Weasley", "type": "Character"},
        {"id": "Node_294", "name": "Minerva McGonagall", "type": "Character"},
        {"id": "Node_298", "name": "Rubeus Hagrid", "type": "Character"},
        {"id": "Node_325", "name": "Madam Pomfrey", "type": "Character"}
    ],
    "Special": [
        {"id": "Spec_412", "name": "Accio", "type": "Spell"},
        {"id": "Spec_127", "name": "Hagrid's Pink Umbrella", "type": "Magical Object"},
        {"id": "Spec_355", "name": "Felix Felicis", "type": "Potion"}
    ],
    "Interactions": [
        {
            "subject": "Node_290",
            "object": "Node_26",
            "summary": "|Node_290| battles |Node_26| in the Triwizard Tournament. |Node_290| uses |Spec_412| to summon his broomstick just in time to evade |Node_26|'s fire, allowing him to maneuver and retrieve the golden egg.",
            "special": ["Spec_412"]
        },
        {
            "subject": "Node_290",
            "object": "Node_294",
            "summary": "|Node_294| informs |Node_290| about the upcoming task and offers encouragement, reminding |Node_290| of the importance of preparation.",
            "special": []
        },
        {
            "subject": "Node_290",
            "object": "Node_292",
            "summary": "|Node_292| apologizes to |Node_290| after the first task, acknowledging |Node_292|'s mistake and expressing admiration for |Node_290|'s performance against |Node_26|.",
            "special": []
        },
        {
            "subject": "Node_290",
            "object": "Node_298",
            "summary": "|Node_298| congratulates |Node_290| on his victory against |Node_26|, mentioning how |Node_298| had secretly trained |Node_26| with |Spec_127| before the event.",
            "special": ["Spec_127"]
        },
        {
            "subject": "Node_290",
            "object": "Node_325",
            "summary": "|Node_325| treats |Node_290|'s burns and bruises after the battle, administering |Spec_355| to speed up |Node_290|'s recovery and ease the pain.",
            "special": ["Spec_355"]
        }
    ]
}
```
"""

# Fill the placeholders with the lookups. ensure_ascii=False keeps any non-ASCII
# names as literal characters instead of \uXXXX escapes, so the dictionary text
# matches how the names are written in the chapter.
static_system_prompt = Template(system_prompt_template).substitute(
    node_data=json.dumps(node_lookup, indent=4, ensure_ascii=False),
    magic_data=json.dumps(magic_lookup, indent=4, ensure_ascii=False),
)

def process_gpt(user_prompt, system_msg):
    """Send one system/user message pair to the chat model and return its reply.

    Args:
        user_prompt: Text sent as the "user" message.
        system_msg: Text sent as the "system" message.

    Returns:
        The ``content`` of the first choice's message, as returned by the
        client. The request asks for the ``json_object`` response format.
    """
    conversation = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": user_prompt},
    ]
    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=conversation,
        response_format={'type': 'json_object'},
    )
    return response.choices[0].message.content

def extract_entities_relationships(file_path, prompt_template):
    """Run the extraction prompt over one chapter file and return the parsed reply.

    The file at ``file_path`` is read as UTF-8 JSON and iterated; the
    ``'passage'`` value of every item is joined with newlines to form the
    chapter text. That text (as ``$ctext``), plus JSON dumps of the two lookup
    tables (as ``$node_data`` / ``$magic_data``), is substituted into
    ``prompt_template`` and sent through ``process_gpt`` together with
    ``static_system_prompt``.

    Args:
        file_path: Path of the chapter JSON file.
        prompt_template: ``string.Template`` source for the user message.

    Returns:
        The reply decoded from JSON when it is a string; the reply unchanged
        when it is not a string; ``None`` when decoding fails or any other
        exception occurs (the error is printed in both cases).
    """
    try:
        with open(file_path, 'r', encoding="utf-8") as handle:
            chunks = json.load(handle)

        passages = []
        for chunk in chunks:
            passages.append(chunk['passage'])
        chapter_text = "\n".join(passages)

        template = Template(prompt_template)
        substitutions = {
            "ctext": chapter_text,
            "node_data": json.dumps(node_lookup, indent=4),
            "magic_data": json.dumps(magic_lookup, indent=4),
        }
        user_prompt = template.substitute(**substitutions)

        # Ask the model for its answer.
        response = process_gpt(user_prompt, system_msg=static_system_prompt)

        # Anything that is not text is handed back untouched.
        if not isinstance(response, str):
            return response

        try:
            return json.loads(response)  # Ensure JSON is properly parsed
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")
            print("Raw GPT response:", response)  # Log the exact invalid response
            return None

    except Exception as e:
        print(f"Unexpected error: {e}")
        return None


In [57]:
# Extract entities and interactions from chapter 19, then pretty-print them.
results = extract_entities_relationships(
    file_path='hp4_chap19.json',
    prompt_template=(
        "You are an expert in knowledge graph construction and the Harry Potter series. "
        "Your task here is to extract characters and relationships follow the instruction "
        "of the system message for the following text: $ctext"
    ),
)

print(json.dumps(results, indent=4))

{
    "Character": [
        {
            "id": "Node_290",
            "name": "Harry Potter",
            "type": "Character"
        },
        {
            "id": "Node_291",
            "name": "Hermione Granger",
            "type": "Character"
        },
        {
            "id": "Node_292",
            "name": "Ron Weasley",
            "type": "Character"
        },
        {
            "id": "Node_298",
            "name": "Rubeus Hagrid",
            "type": "Character"
        },
        {
            "id": "Node_302",
            "name": "Sirius Black",
            "type": "Character"
        },
        {
            "id": "Node_360",
            "name": "Charlie Weasley",
            "type": "Character"
        },
        {
            "id": "Node_521",
            "name": "Madame Maxime",
            "type": "Character"
        },
        {
            "id": "Node_522",
            "name": "Igor Karkaroff",
            "type": "Character"
        },
        {
       

In [58]:
# Extract entities and relationships from chapter 1.
results = extract_entities_relationships(
    file_path='hp4_chap1.json',
    prompt_template=(
        "You are an expert in knowledge graph construction and the Harry Potter series. "
        "Your task here is to extract characters and relationships follow the instruction "
        "of the system message for the following text: $ctext"
    ),
)

# Print the final result.
print(json.dumps(results, indent=4))

{
    "Character": [
        {
            "id": "Node_290",
            "name": "Harry Potter",
            "type": "Character"
        },
        {
            "id": "Node_306",
            "name": "Lord Voldemort",
            "type": "Character"
        },
        {
            "id": "Node_457",
            "name": "Peter Pettigrew",
            "type": "Character"
        },
        {
            "id": "Node_471",
            "name": "Bertha Jorkins",
            "type": "Character"
        },
        {
            "id": "Node_469",
            "name": "Frank Bryce",
            "type": "Character"
        },
        {
            "id": "Node_470",
            "name": "Dot",
            "type": "Character"
        },
        {
            "id": "Node_342",
            "name": "Tom",
            "type": "Character"
        },
        {
            "id": "Node_644",
            "name": "Tom Riddle",
            "type": "Character"
        }
    ],
    "Special": [
        {
        

In [None]:
# Example Output:
# {
#     "obj": [{"id": "Char_1", "name": "Harry Potter"}],
#     "sub": [{"id": "Char_3", "name": "Ron Weasley"}],
#     "relations": ["Char_1|{type_1}make friend with|Char_3"],
# },
# Example of relationship:
# {
#     Char_1|{type_1}receives a Nimbus Two Thousand from|Char_5,
#     Char_28|{type_1}walks straight into|[pred_]Char_34,
#     Char_28|{type_2}notices strangely dressed people talking|Char_1,
#     Char_1|{type_1}sends the dragon to|Char_71 with|Char_3 and Char_2,
#     Char_28|{type_1}shoos|[pred_]Char_26,
#     Char_28|{type_1}notices|[pred_]Char_26,
#     Char_28|{type_1}walks straight into|[pred_]Char_34
#     [pred_]Char_3|{type_1}looks at|Char_1
#     [pred_]Char_2|{type_1}looks at|Char_1

# }

# {
#     "obj": [{"id": string, "name": string}],
#     "sub": [{"id": string, "name": string}],
#     "relations": ["obj['id']|{type_[]}HAS_ACTION|sub['id']"]
# }
# Example Output:
# {
#     "obj": [{"id": "Char_1", "name": "Harry Potter"}],
#     "sub": [{"id": "Char_3", "name": "Ron Weasley"}],
#     "relations": ["Char_1|{type_1}make friend with|Char_3"]
# },
# {
#     "obj": [{"id": "Char_1", "name": "Harry Potter"}],
#     "sub": [{"id": "Crea_26", "name": "Hungarian Horntail"}],
#     "relations": ["Char_1|{type_1}pulled out|Crea_26"]
# },
# Example of relationship:

In [None]:
# Persist the most recent extraction result and list the relations it contains.
file_path = "HP1_relation_results_5chunks.json"

# Mode 'w' creates the file when it is missing and truncates it otherwise, so a
# separate "initialise with an empty list" step before dumping is unnecessary.
with open(file_path, 'w', encoding="utf-8") as file:
    json.dump(results, file)

# `results` can take several shapes:
#   * None  - the extraction failed;
#   * dict  - a single JSON object, either in the current
#             "Character"/"Special"/"Interactions" layout or in the older
#             layout that carries a "relations" list;
#   * list  - several records, each with its own 'relations' list.
# Indexing blindly with i['relations'] fails for the first two shapes.
if results is None:
    relation_items = []
elif isinstance(results, dict):
    relation_items = results.get("Interactions") or results.get("relations") or []
else:
    relation_items = [rel for record in results for rel in record['relations']]

cnt = 0
for j in relation_items:
    cnt += 1
    print(j)
print(cnt)