# Verkettung von Textextraktion, NER und Namensauflösung

1. Einrichtung der Modelle.
2. Definition der Datenstruktur (Output).
3. Vorbereitung von Beispielen und Anweisungen für die Modelle.
4. Die Hauptaufgaben sind:
    - Extrahieren von Aussagen: Aufteilung des historischen Textes in sinnvolle, zeitlich geordnete Aussagen.
    - Durchführen von NER: Analyse jeder Aussage, um wichtige Entitäten wie Namen, Orte und Handlungen zu identifizieren.
    - Identifikation von Personen: Mithilfe von Embeddings und Sprachmodellen die beste Übereinstimmung für jede identifizierte Entität aus einer Datenbank finden:
        1. Alle Namen (mit IDs) in einer Vektordatenbank speichern.
        2. Für jede Person die drei Top-Matches aus der Vektordatenbank extrahieren.
        3. Die drei Top-Matches zusammen mit dem Personennamen und dem Kontext (Statement) an ein Sprachmodell schicken, um eine Entscheidung zu treffen.
5. Kombination aller Aufgaben zu einer durchgehenden Prozesskette.
6. Ausführen.

In [1]:
import json
from dotenv import load_dotenv
from langchain_together import ChatTogether
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.runnables.base import RunnableLambda
from pydantic import BaseModel, Field
from pydantic import BaseModel
from typing import Optional, List, Union

In [None]:
# Load variables from a .env file into the process environment.
# NOTE(review): presumably this supplies the Together API key needed by the
# ChatTogether clients created below — confirm.
load_dotenv()

In [3]:
# Smaller Llama 3 model (8B); used by the NER chain (chain_NER).
# temperature=0 keeps the outputs as reproducible as possible.
# NOTE: the identifier contains the typo "Togehter"; it is kept because other
# cells reference this exact name.
llm_TogehterAI_Llama3_8B = ChatTogether(
    model="meta-llama/Meta-Llama-3-8B-Instruct-Turbo",
    temperature=0,
)

# Larger Llama 3 model (70B); used by the statement-extraction chain and by
# the best-match decision chain.
llm_TogehterAI_Llama3_70B_Lite = ChatTogether(
    model="meta-llama/Meta-Llama-3-70B-Instruct-Lite",
    temperature=0,
)

In [4]:
# Schema for the structured NER result of one statement.
# Only Start and End are declared; both default to the string "unknown".
# Any other key is kept as an extra field (see Config below).
# NOTE: documented with comments rather than a docstring on purpose — a class
# docstring would become part of the schema used for the format instructions.
class NEROutput(BaseModel):
    Start: str = "unknown"
    End: str = "unknown"
    # The commented-out fields below are not declared. They are kept as a
    # reference for keys that can arrive as extras; rag_on_ner_output looks up
    # 'Handlung_ausgeführt_von', 'empfänger', 'Elternteil', 'Kind' and 'Person'
    # by name via getattr.
    # Handlung_ausgeführt_von: Optional[str] = None
    # Ort_der_Handlung: Optional[str] = None
    # übertragenes_Objekt: Optional[str] = None
    # empfänger: Optional[str] = None
    # Art_des_Verhältnisses: Optional[str] = None
    # Elternteil: Optional[str] = None
    # Kind: Optional[str] = None

    class Config:
        # Undeclared keys are accepted and stored on the instance.
        extra = "allow"  # Allow extra fields

# Output parser that turns the model reply into a NEROutput instance.
parser_ner = PydanticOutputParser(pydantic_object=NEROutput)
# Format instructions generated by the parser for the NEROutput schema.
format_instructions_NER = parser_ner.get_format_instructions()
# Debug helper, intentionally disabled:
#print(format_instructions_NER)



In [None]:
# Result schema of the name-disambiguation step: the one candidate name the
# LLM selected. Documented with comments (not a docstring) so the generated
# schema stays unchanged.
class BestMatchOutput(BaseModel):
    # Required (no default value), but the value itself may be null.
    best_match_name: Optional[str] = Field(
        default=...,
        description="The name of the best match, or 'None' if none are correct.",
    )

# Output parser that turns the model reply into a BestMatchOutput instance.
parser_best_match = PydanticOutputParser(pydantic_object=BestMatchOutput)

# Format instructions generated by the parser; printed for inspection.
format_instructions_best_match = parser_best_match.get_format_instructions()
print(format_instructions_best_match)

In [6]:
# One extracted sentence plus its (optional) NER result.
# ner_output starts as None and is filled in later by ner_on_statement.
class Statement(BaseModel):
    text: str = Field(description="An informative sentence extracted from the historical text.")
    ner_output: Optional[NEROutput] = None

# All statements that share one date; the date is kept as a free-form string.
class StatementGroup(BaseModel):
    date: str = Field(description="The date associated with the group of statements.")
    statements: List[Statement] = Field(description="A list of statements associated with the date.")

# Top-level result of the statement-extraction step: a list of dated groups.
class AssistantOutput(BaseModel):
    statement_groups: List[StatementGroup] = Field(description="A list of statement groups, each associated with a date.")

# Output parser that turns the model reply into an AssistantOutput instance.
parser_statement_extraction = PydanticOutputParser(pydantic_object=AssistantOutput)
# Format instructions generated by the parser for the AssistantOutput schema.
format_instructions_statement_extraction = parser_statement_extraction.get_format_instructions()
# Debug helper, intentionally disabled:
#print(format_instructions_statement_extraction)

In [7]:
# Few-shot examples for statement extraction.
# NOTE: `examples` (and several names below) are rebound by later cells, so
# the order in which the cells run matters.
with open('data/examples.json', 'r', encoding='utf-8') as file:
    examples = json.load(file)

# The message strings carry explicit Llama 3 header / end-of-turn tokens.
# NOTE(review): they are also sent through role-tagged chat messages; confirm
# that the serving side does not apply its own chat template on top.
system_msg = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a meticulous analyst with a focus on historical data. Your task is to process a provided historical record. For each given text please identify all statements that focus on the relationships and dependencies between entities in the given text, including biographical data. Also, structure the information, meaning:  Divide the text into two time based logical segments, each containing a self-contained piece of information (e.g., JJJJ-MM-TT, before JJJJ-MM-TT)
Return the statements in german as simple sentences that allow for easy interpretation and reconstruction of the historical context. Each sentence must be informative in itself. Please ensure to include names in each sentence as demonstrated:
John Doe goes to the store. John Doe buys apples.<|eot_id|><|start_header_id|>user<|end_header_id|>

"""

# Generic few-shot user turn; {example_input} is renamed per example below.
example_user_msg = """Please produce statements from the following text:
{example_input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

# Generic few-shot assistant turn; {example_output} is renamed per example below.
example_assistant_msg = """{example_output}<|eot_id|><|start_header_id|>user<|end_header_id|>
"""

# Final user turn carrying the actual text to process ({question}).
user_msg = """Please produce statements from the following text:
{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""

# System message, two few-shot user/assistant pairs, then the real request.
# str.replace turns the generic placeholders into numbered template variables.
prompt_template_statement_extraction = ChatPromptTemplate.from_messages([
    ("system", system_msg),
    
    ("user", example_user_msg.replace("{example_input}", "{example_input_1}")),
    ("assistant", example_assistant_msg.replace("{example_output}", "{example_output_1}")),
    
    ("user", example_user_msg.replace("{example_input}", "{example_input_2}")),
    ("assistant", example_assistant_msg.replace("{example_output}", "{example_output_2}")),
    
    ("user", user_msg),
])

# NOTE(review): none of the message strings above contains a
# {format_instructions} placeholder, so this value presumably never appears in
# the rendered prompt — confirm whether it was meant to be included.
prompt_template_statement_extraction = prompt_template_statement_extraction.partial(format_instructions=format_instructions_statement_extraction)

# Each example entry is read through its "user_input" and
# "assistant_output_json" keys.
example_input_1 = examples[0]["user_input"]

example_output_1 = examples[0]["assistant_output_json"]

example_input_2 = examples[1]["user_input"]

example_output_2 = examples[1]["assistant_output_json"]


# Sample historical record used as the input ("question") of the demo run.
regeste = """8652 1564 März 3, Prag.

Erzherzog Ferdinand schenkt der edlen Philippina Welserin sonderlich ihres in Ehren und Tugend wohlverhaltens halben Schloss und Herrschaft Ambras samt allem Bau, Haus und Vorrat, das ihm sein Vater Kaiser Ferdinand I. geschenksweise überlassen habe.

Geschehen und gegeben zu Prag den 3. Tag des Monats März nach Christi unseres Herrn Geburt im 1564.

Eigenhändig unterschriebenes Originalpergament mit abgefallenem Siegel, Urkunden des Familienarchivs."""

# The example values are captured in this dict now, i.e. before the NER cell
# rebinds example_input_1 / example_output_1 / ... to the NER examples.
variables_statement_extraction = {
    "example_input_1": example_input_1,
    "example_output_1": example_output_1,
    "example_input_2": example_input_2,
    "example_output_2": example_output_2,
    "question": regeste,
}

# Prompt -> 70B model -> parse the reply into an AssistantOutput.
statement_extraction_chain = prompt_template_statement_extraction | llm_TogehterAI_Llama3_70B_Lite | parser_statement_extraction


In [8]:
# Few-shot examples for the NER prompt.
# NOTE: this rebinds the module-level `examples`, which until here held the
# statement-extraction examples.
with open('data/examples_NER.json', 'r', encoding='utf-8') as file:
    examples = json.load(file)

# System prompt for the NER step (rebinds the module-level `system_msg`).
# <|begin_of_text|> must appear only once, at the very start of the prompt;
# the system turn is closed with <|eot_id|> and directly followed by the user
# header, matching the other system prompts in this file.
system_msg = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a meticulous analyst with a focus on historical data. Your task is to produce a structured output for each given text.

Your goal is to extract information about interactions, activities, relationships, and biographical details to enrich our prosopographical database.<|eot_id|><|start_header_id|>user<|end_header_id|>"""

# Generic few-shot user turn for NER; {example_input} is renamed per example
# below. Rebinds the name used earlier for statement extraction.
example_user_msg = """Please produce a structured output describing the action from the following text:
{example_input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

# Generic few-shot assistant turn; {example_output} is renamed per example below.
example_assistant_msg = """{example_output}<|eot_id|><|start_header_id|>user<|end_header_id|>
"""

# Final user turn carrying the statement to analyse ({statement}).
user_msg = """

Please produce a structured output describing the action from the following text:
{statement}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

# System message, two few-shot user/assistant pairs, then the real request.
# str.replace turns the generic placeholders into numbered template variables.
prompt_template_NER = ChatPromptTemplate.from_messages([
    ("system", system_msg),

    ("user", example_user_msg.replace("{example_input}", "{example_input_1}")),
    ("assistant", example_assistant_msg.replace("{example_output}", "{example_output_1}")),

    ("user", example_user_msg.replace("{example_input}", "{example_input_2}")),
    ("assistant", example_assistant_msg.replace("{example_output}", "{example_output_2}")),

    ("user", user_msg),
])

# NOTE(review): none of the message strings above contains a
# {format_instructions} placeholder, so this value presumably never appears in
# the rendered prompt — confirm whether it was meant to be included.
prompt_template_NER = prompt_template_NER.partial(format_instructions=format_instructions_NER)

# Rebind the example variables to the NER examples. ner_on_statement reads
# these module-level names each time it invokes the chain.
example_input_1 = examples[0]["user_input"]

example_output_1 = examples[0]["assistant_output_json"]

example_input_2 = examples[1]["user_input"]

example_output_2 = examples[1]["assistant_output_json"]

# Prompt -> 8B model -> parse the reply into a NEROutput.
chain_NER = prompt_template_NER | llm_TogehterAI_Llama3_8B | parser_ner

In [None]:
# Load CSV data and create embeddings
import pandas as pd
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Known persons: the CSV is read through its 'name' and 'id' columns.
df = pd.read_csv('data/names_ids.csv')
names = df['name'].tolist()
ids = df['id'].tolist()

# One metadata record per name so a hit can be traced back to its id.
metadatas = [
    {'id': person_id, 'name': person_name}
    for person_id, person_name in zip(ids, names)
]

# Sentence-embedding model used to vectorise the names.
embedding_model = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
)

# FAISS index over the names; each entry carries its metadata record.
vector_store = FAISS.from_texts(
    texts=names,
    embedding=embedding_model,
    metadatas=metadatas,
)

# Retriever (TODO: @chain decorator to the function to create a Runnable)
def find_top_matches_with_context(name, full_statement, top_k=3):
    """Look up the ``top_k`` vector-store entries most similar to ``name``.

    Args:
        name: Name string used as the similarity query.
        full_statement: Statement the name was taken from. Currently unused:
            only the bare name is searched for.
        top_k: Number of candidates requested from the vector store.

    Returns:
        A list of dicts with the keys ``'score'`` (plain Python float),
        ``'name'`` (empty string if the metadata has no name) and ``'id'``,
        in the order returned by the vector store.
    """
    hits = vector_store.similarity_search_with_relevance_scores(name, k=top_k)

    return [
        {
            # float() converts the score (e.g. np.float32) to a Python float.
            'score': float(relevance),
            'name': hit.metadata.get('name', ''),
            'id': hit.metadata['id'],
        }
        for hit, relevance in hits
    ]

In [10]:
# System prompt for the name-disambiguation step (rebinds `system_msg`).
system_msg = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
    
You are an assistant that helps clarify name matches based on contextual information. Based on the context of the statement, return the correct match.<|eot_id|><|start_header_id|>user<|end_header_id|>
"""

# Final user turn: statement, name and exactly three candidate slots.
# NOTE(review): here the statement and the name are wrapped in double quotes,
# while the few-shot example below is not — confirm whether that is intended.
user_msg = """You are given the following statement: "{statement}"
The name mentioned is: "{name}"

Here are three possible matches:
{match_1}
{match_2}
{match_3}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""

# Hard-coded few-shot user turn (contains no template variables).
example_user_msg = """You are given the following statement: Maximilian befiehlt Johannes Ried das 'Riesenbuch' zu schreiben.
The name mentioned is: Maximilian

Here are three possible matches:
Maximilian I., römischer König und Kaiser (1486/1508-1519)
Maximilian Prandstetter, Bürger und Umgelter zu Linz an der Donau
Maximilian Stürtzel, Kleriker, Sohn von Konrad Stürtzel dem Älteren<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

# Few-shot assistant turn; {example_output} is filled in at invoke time.
example_assistant_msg = """{example_output}<|eot_id|><|start_header_id|>user<|end_header_id|>
"""

# Few-shot prompt for name disambiguation: system message, one worked
# example (user + assistant), then the real request.
# Built with from_messages like the other prompt templates in this file;
# passing the list positionally to the constructor is only accepted by newer
# langchain-core releases.
prompt_template_RAG = ChatPromptTemplate.from_messages([
    ("system", system_msg),

    ("user", example_user_msg),
    ("assistant", example_assistant_msg),

    ("user", user_msg),
])

# NOTE(review): none of the disambiguation message strings contains a
# {format_instructions} placeholder, so this value presumably never appears in
# the rendered prompt — confirm whether it was meant to be included.
prompt_template_RAG = prompt_template_RAG.partial(format_instructions=format_instructions_best_match)

# Expected assistant answer for the few-shot example; decide_best_match_via_llm
# passes it as the "example_output" variable on every call.
example_output_best_match = """
{"best_match_name": "Maximilian I., römischer König und Kaiser (1486/1508-1519)"}
"""

# Prompt -> 70B model -> parse the reply into a BestMatchOutput.
decide_best_match_chain = prompt_template_RAG | llm_TogehterAI_Llama3_70B_Lite | parser_best_match

In [11]:
# Function to send top matches to LLM for final decision
def decide_best_match_via_llm(name, statement, top_matches):
    """Ask the LLM which retrieved candidate corresponds to ``name``.

    Args:
        name: The name as it appears in the statement.
        statement: Full statement text, given to the model as context.
        top_matches: Sequence of candidate dicts, each with a ``'name'`` key.
            Only the first three entries are used.

    Returns:
        The ``best_match_name`` of the parsed model answer, or ``None`` when
        ``top_matches`` is empty (the model is not called in that case).
    """
    # Nothing to choose from: skip the LLM round-trip entirely.
    if not top_matches:
        return None

    # The prompt has exactly three candidate slots. Pad missing ones with
    # empty strings so fewer than three hits do not raise an IndexError.
    candidate_names = [match['name'] for match in top_matches[:3]]
    candidate_names += [''] * (3 - len(candidate_names))

    variables_statement_RAG = {
        "statement": statement,
        "name": name,
        "match_1": candidate_names[0],
        "match_2": candidate_names[1],
        "match_3": candidate_names[2],
        "example_output": example_output_best_match,
    }

    result = decide_best_match_chain.invoke(variables_statement_RAG)

    # NOTE(review): the schema description allows the literal string 'None';
    # it is returned unchanged here — confirm how callers should treat it.
    return result.best_match_name

In [12]:
# Run NER on every statement of an extraction result
def ner_on_statement(result_dict):
    """Attach an NER result to each statement contained in ``result_dict``.

    Walks all statement groups and their statements, invokes ``chain_NER``
    once per statement (with the module-level few-shot examples) and stores
    the result in ``statement.ner_output``.

    Args:
        result_dict: Object exposing ``statement_groups``; each group exposes
            ``statements`` whose items have ``text`` and ``ner_output``.

    Returns:
        The same ``result_dict`` object, modified in place.
    """
    every_statement = (
        stmt
        for group in result_dict.statement_groups
        for stmt in group.statements
    )
    for stmt in every_statement:
        stmt.ner_output = chain_NER.invoke({
            "example_input_1": example_input_1,
            "example_output_1": example_output_1,
            "example_input_2": example_input_2,
            "example_output_2": example_output_2,
            "statement": stmt.text,
        })
    return result_dict

In [13]:
def rag_on_ner_output(result_dict):
    """Resolve the person names found by NER against the name database.

    For every statement and every person-like NER field that holds a truthy
    value, the top candidates are retrieved and stored as ``<field>_matches``;
    the LLM's choice among them is stored as ``<field>_best_match``. Both are
    set as attributes on the statement's ``ner_output``.

    Args:
        result_dict: Object exposing ``statement_groups``; each group exposes
            ``statements`` whose items have ``text`` and ``ner_output``.

    Returns:
        The same ``result_dict`` object, modified in place.
    """
    person_fields = ('Handlung_ausgeführt_von', 'empfänger', 'Elternteil', 'Kind', 'Person')

    for group in result_dict.statement_groups:
        for stmt in group.statements:
            entities = stmt.ner_output
            if entities is None:
                # No NER result: none of the fields can be present.
                continue
            for field in person_fields:
                name = getattr(entities, field, None)
                if not name:
                    continue
                candidates = find_top_matches_with_context(name, stmt.text)
                setattr(entities, f'{field}_matches', candidates)
                chosen = decide_best_match_via_llm(name, stmt.text, candidates)
                setattr(entities, f'{field}_best_match', chosen)
    return result_dict

In [14]:
# Wrap the two plain functions so they can be composed with the LLM chain.
runnable_process_statements = RunnableLambda(ner_on_statement)
runnable_best_match = RunnableLambda(rag_on_ner_output)

# End-to-end pipeline: extract statements -> NER per statement -> resolve names.
final_chain = statement_extraction_chain.pipe(
    runnable_process_statements,
    runnable_best_match,
)

# Run the whole pipeline on the prepared sample input.
result = final_chain.invoke(variables_statement_extraction)

#print(result)

In [None]:
# Convert the pipeline result to plain Python data and print it as JSON.
# Pydantic v2 deprecates BaseModel.dict() in favour of model_dump(); prefer
# the new method when it exists and fall back to dict() on Pydantic v1.
result_dict = result.model_dump() if hasattr(result, "model_dump") else result.dict()
# ensure_ascii=False keeps umlauts and other non-ASCII characters readable.
json_output = json.dumps(result_dict, indent=4, ensure_ascii=False)
print(json_output)