# Verkettung von Extraktion und Entitätenerkennung

1. Einrichtung der Modelle.
2. Definition der Datenstruktur (Output).
3. Vorbereitung von Beispielen und Anweisungen für die Modelle.
4. Die Hauptaufgaben sind:
    - Extrahieren von Statements: Aufteilung des historischen Textes in sinnvolle, zeitlich geordnete Aussagen.
    - Durchführen von NER: Analyse jeder Aussage, um wichtige Entitäten wie Namen, Orte und Handlungen zu identifizieren.
5. Kombination beider Aufgaben in einer Kette.
6. Ausführen.

In [1]:
import json
from dotenv import load_dotenv
from langchain_together import ChatTogether
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.runnables.base import RunnableLambda
from pydantic import BaseModel, Field
from pydantic import BaseModel
from typing import Optional
from typing import List

In [None]:
# Load key/value pairs from a local .env file into the process environment.
# NOTE(review): presumably this is where the Together API credentials used by
# the ChatTogether clients come from — confirm against the deployment setup.
load_dotenv()

In [3]:
# Both chat clients share the same sampling configuration (temperature 0),
# so it is declared once and unpacked into each constructor.
_shared_llm_kwargs = {"temperature": 0}

# Smaller model; used by the NER chain.
llm_TogehterAI_Llama3_8B = ChatTogether(
    model="meta-llama/Meta-Llama-3-8B-Instruct-Turbo",
    **_shared_llm_kwargs,
)

# Larger model; used by the statement-extraction chain.
llm_TogehterAI_Llama3_70B_Lite = ChatTogether(
    model="meta-llama/Meta-Llama-3-70B-Instruct-Lite",
    **_shared_llm_kwargs,
)

In [None]:
# Schema for the structured output of the NER step.
# Documented with comments rather than a class docstring on purpose: a
# docstring would be emitted as "description" in the model's JSON schema and
# thereby change the generated format instructions.
class NEROutput(BaseModel):
    # Both fields default to the literal string "unknown" when not provided.
    # NOTE(review): presumably the start/end of the time span a statement
    # refers to — confirm against data/examples_NER.json.
    Start: str = "unknown"
    End: str = "unknown"
    # Disabled explicit field declarations, kept for reference. Because of
    # extra = "allow" below, keys that are not declared on the model are
    # still accepted when an instance is created.
    # Handlung_ausgeführt_von: Optional[str] = None
    # Ort_der_Handlung: Optional[str] = None
    # übertragenes_Objekt: Optional[str] = None
    # empfänger: Optional[str] = None
    # Art_des_Verhältnisses: Optional[str] = None
    # Elternteil: Optional[str] = None

    class Config:
        extra = "allow"  # Allow extra fields



# Parser that turns the model's text reply into a NEROutput instance.
parser_ner = PydanticOutputParser(pydantic_object=NEROutput)

# Schema-derived formatting instructions, printed here for inspection.
format_instructions_NER = parser_ner.get_format_instructions()
print(format_instructions_NER)

In [None]:
# A single extracted sentence, optionally enriched with its NER result.
class Statement(BaseModel):
    text: str = Field(description="An informative sentence extracted from the historical text.")
    # Defaults to None; process_statements_with_NER assigns the NER result
    # to this attribute after extraction.
    ner_output: Optional[NEROutput] = None  # include NER output

# All statements that belong to one date. The date is kept as a free-form
# string; no date parsing or validation is performed by this model.
class StatementGroup(BaseModel):
    date: str = Field(description="The date associated with the group of statements.")
    statements: List[Statement] = Field(description="A list of statements associated with the date.")

# Top-level schema of the statement-extraction step: a list of date-keyed
# statement groups. This is the type the extraction parser produces.
class AssistantOutput(BaseModel):
    statement_groups: List[StatementGroup] = Field(description="A list of statement groups")

# Parser that turns the model's text reply into an AssistantOutput instance.
parser_statement_extraction = PydanticOutputParser(pydantic_object=AssistantOutput)


# Schema-derived formatting instructions, printed here for inspection.
format_instructions_statement_extraction = parser_statement_extraction.get_format_instructions()
print(format_instructions_statement_extraction)

In [6]:

# --- Few-shot examples for statement extraction ----------------------------
# The first two entries of the JSON file are used; each is read for its
# "user_input" and "assistant_output_json" keys.
with open('data/examples.json', 'r', encoding='utf-8') as file:
    examples = json.load(file)

example_input_1 = examples[0]["user_input"]
example_output_1 = examples[0]["assistant_output_json"]
example_input_2 = examples[1]["user_input"]
example_output_2 = examples[1]["assistant_output_json"]

# --- Message templates ------------------------------------------------------
# NOTE(review): the templates embed Llama-3 special tokens by hand; confirm
# that the chat endpoint does not apply its own chat template on top.
system_msg = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a meticulous analyst with a focus on historical data. Your task is to process a provided historical record. For each given text please identify all statements that focus on the relationships and dependencies between entities in the given text, including biographical data. Also, structure the information, meaning:  Divide the text into two time based logical segments, each containing a self-contained piece of information (e.g., JJJJ-MM-TT, before JJJJ-MM-TT)
Return the statements in german as simple sentences that allow for easy interpretation and reconstruction of the historical context. Each sentence must be informative in itself. Please ensure to include names in each sentence as demonstrated:
John Doe goes to the store. John Doe buys apples.<|eot_id|><|start_header_id|>user<|end_header_id|>

"""

example_user_msg = """Please produce statements from the following text:
{example_input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

example_assistant_msg = """{example_output}<|eot_id|><|start_header_id|>user<|end_header_id|>
"""

user_msg = """Please produce statements from the following text:
{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""

# --- Prompt assembly --------------------------------------------------------
# One user/assistant pair per example; the generic placeholders are rewritten
# to numbered ones ({example_input_1}, {example_output_1}, ...).
few_shot_turns = []
for shot in (1, 2):
    few_shot_turns.append(
        ("user", example_user_msg.replace("{example_input}", f"{{example_input_{shot}}}"))
    )
    few_shot_turns.append(
        ("assistant", example_assistant_msg.replace("{example_output}", f"{{example_output_{shot}}}"))
    )

# NOTE(review): none of the templates above contains a {format_instructions}
# placeholder, so this partial binding does not change the rendered prompt —
# confirm whether the instructions were meant to be part of the system message.
prompt_template_statement_extraction = ChatPromptTemplate.from_messages(
    [("system", system_msg), *few_shot_turns, ("user", user_msg)]
).partial(format_instructions=format_instructions_statement_extraction)

# --- Input document ---------------------------------------------------------
regeste = """8652 1564 März 3, Prag.

Erzherzog Ferdinand schenkt der edlen Philippina Welserin sonderlich ihres in Ehren und Tugend wohlverhaltens halben Schloss und Herrschaft Ambras samt allem Bau, Haus und Vorrat, das ihm sein Vater Kaiser Ferdinand I. geschenksweise überlassen habe.

Geschehen und gegeben zu Prag den 3. Tag des Monats März nach Christi unseres Herrn Geburt im 1564.

Eigenhändig unterschriebenes Originalpergament mit abgefallenem Siegel, Urkunden des Familienarchivs."""

# Values for every placeholder of the extraction prompt. The example values
# are captured here, so later reassignment of the example_* names does not
# affect this mapping.
variables_statement_extraction = dict(
    example_input_1=example_input_1,
    example_output_1=example_output_1,
    example_input_2=example_input_2,
    example_output_2=example_output_2,
    question=regeste,
)

# Prompt -> 70B model -> parser producing an AssistantOutput.
statement_extraction_chain = (
    prompt_template_statement_extraction
    | llm_TogehterAI_Llama3_70B_Lite
    | parser_statement_extraction
)


In [None]:
# Show the composed extraction chain (prompt, model, parser) for inspection.
print(statement_extraction_chain)

In [8]:
# --- Few-shot examples for NER ----------------------------------------------
# The first two entries of the JSON file are used; each is read for its
# "user_input" and "assistant_output_json" keys. These assignments rebind the
# module-level `examples` and example_* names.
with open('data/examples_NER.json', 'r', encoding='utf-8') as file:
    examples = json.load(file)

example_input_1 = examples[0]["user_input"]
example_output_1 = examples[0]["assistant_output_json"]
example_input_2 = examples[1]["user_input"]
example_output_2 = examples[1]["assistant_output_json"]

# --- Message templates ------------------------------------------------------
system_msg = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a meticulous analyst with a focus on historical data. Your task is to produce a structured output for each given text.

Your goal is to extract information about interactions, activities, relationships, and biographical details to enrich our prosopographical database.<|eot_id|><|start_header_id|>user<|end_header_id|>"""

example_user_msg = """Please produce a structured output describing the action from the following text:
{example_input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

example_assistant_msg = """{example_output}<|eot_id|><|start_header_id|>user<|end_header_id|>
"""

user_msg = """

Please produce a structured output describing the action from the following text:
{statement}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

# --- Prompt assembly --------------------------------------------------------
# One user/assistant pair per example; the generic placeholders are rewritten
# to numbered ones ({example_input_1}, {example_output_1}, ...).
few_shot_turns = []
for shot in (1, 2):
    few_shot_turns.append(
        ("user", example_user_msg.replace("{example_input}", f"{{example_input_{shot}}}"))
    )
    few_shot_turns.append(
        ("assistant", example_assistant_msg.replace("{example_output}", f"{{example_output_{shot}}}"))
    )

# NOTE(review): none of the templates above contains a {format_instructions}
# placeholder, so this partial binding does not change the rendered prompt —
# confirm whether the instructions were meant to be part of the system message.
prompt_template_NER = ChatPromptTemplate.from_messages(
    [("system", system_msg), *few_shot_turns, ("user", user_msg)]
).partial(format_instructions=format_instructions_NER)

# Prompt -> 8B model -> parser producing a NEROutput.
chain_NER = (
    prompt_template_NER
    | llm_TogehterAI_Llama3_8B
    | parser_ner
)

In [None]:
# Define the RunnableLambda to process statements and attach NER outputs
def process_statements_with_NER(result_dict: "AssistantOutput") -> "AssistantOutput":
    """Run the NER chain on every extracted statement and attach its result.

    Despite its name, ``result_dict`` is not a ``dict``: it is accessed by
    attribute (``statement_groups`` / ``statements`` / ``text``), i.e. it is
    the parsed output model of the statement-extraction chain.

    Args:
        result_dict: Parsed extraction result whose statements are enriched
            in place.

    Returns:
        The same object that was passed in, with ``ner_output`` set on every
        statement. It still has to be converted before JSON serialisation.

    Raises:
        Whatever ``chain_NER.invoke`` raises; an error on one statement is
        not caught here and aborts the whole run.
    """
    # The few-shot examples are identical for every statement, so this part
    # of the prompt variables is built once instead of once per statement.
    # The module-level example_* names are read at call time.
    shared_variables = {
        "example_input_1": example_input_1,
        "example_output_1": example_output_1,
        "example_input_2": example_input_2,
        "example_output_2": example_output_2,
    }

    for statement_group in result_dict.statement_groups:
        for statement in statement_group.statements:
            # A fresh mapping per call keeps invocations independent.
            statement.ner_output = chain_NER.invoke(
                {**shared_variables, "statement": statement.text}
            )
    return result_dict

# Wrap the plain function in a RunnableLambda so it can be composed with
# other chain steps using the | operator.
runnable_process_statements = RunnableLambda(process_statements_with_NER)

# Combine the chains into a final chain: the parsed extraction result is fed
# directly into the NER post-processing step.
final_chain = statement_extraction_chain | runnable_process_statements


# Invoke the final chain with the initial variables (few-shot examples plus
# the input text under "question"). This performs the model calls.
result = final_chain.invoke(variables_statement_extraction)

# Print the final output (the enriched result object, not yet JSON).
print(result)


In [None]:
# Convert the nested Pydantic result into plain dicts/lists for JSON output.
# Pydantic v2 deprecates .dict() in favour of .model_dump(); prefer the new
# method when it exists and fall back to .dict() so Pydantic v1 still works.
if hasattr(result, "model_dump"):
    result_dict = result.model_dump()
else:
    result_dict = result.dict()

# ensure_ascii=False keeps non-ASCII characters (e.g. German umlauts)
# readable instead of emitting \uXXXX escapes.
json_output = json.dumps(result_dict, indent=4, ensure_ascii=False)

print(json_output)