In [91]:
from typing import Dict, List, TypedDict
import spacy
from textacy import extract, preprocessing
from typing import Annotated
import operator
from langgraph.graph import StateGraph, START, END
from spacy.tokens import Doc
from langchain.chat_models import init_chat_model
from dotenv import load_dotenv
from pydantic import BaseModel, Field

In [92]:
# Load variables from a local .env file into the process environment
# (presumably where the OpenAI API key lives -- confirm against your setup).
load_dotenv()

# Words that are never counted as actors: a noun chunk whose root lemma is one
# of these is dropped from the actor candidates.
INTERNAL_SYSTEM_KEYWORDS = {
  "system",
  "software",
  "application",
  "app",
  "platform",
}

# Chat model shared by every LLM-backed node defined below.
model = init_chat_model("gpt-5-mini", model_provider="openai")

In [93]:
# Structured verdict requested from the LLM in actors_validate_node.
# NOTE(review): documented with `#` comments on purpose -- a class docstring
# would presumably be exported as the schema description sent to the model;
# confirm before converting these comments into a docstring.
class ValidationResult(BaseModel):
  # Overall pass/fail flag; read by decide_after_actor_validation.
  is_valid: bool = Field(description="True if the data is accurate, False if it is not valid")
  # Free-text explanation produced by the model.
  reason: str = Field(description="The reason why the data is not valid")
  
# One surface form (alias) of an actor plus the sentences in which it occurs.
# Used as the element type of ActorAlias.aliases.
class AliasItem(BaseModel):
  # The alias text as reported by the model.
  alias: str = Field(description="Name of the alias")
  # Sentence numbers for this alias; the field description asks for 1-based indices.
  sentences: List[int] = Field(description="List of sentence indices where THIS specific alias appears (starting from 1)")

# A canonical actor together with every alias entry reported for it.
# (Kept as `#` comments rather than a class docstring so the model schema is unchanged.)
class ActorAlias(BaseModel):
  actor: str = Field(description="The original actor's name")
  aliases: List[AliasItem] = Field(description="List of alternative names/references for this actor")

  def __str__(self) -> str:
    """Render the actor and its aliases as an indented text block ending with a 50-dash rule."""
    if self.aliases:
      bullet_lines = [
        f"- {entry.alias}: sentences {entry.sentences}"
        for entry in self.aliases
      ]
      rendered_aliases = "\n  ".join(bullet_lines)
    else:
      # No aliases recorded: show a literal placeholder instead of an empty section.
      rendered_aliases = "None"

    separator = "-" * 50
    return f"ACTOR: {self.actor}\n  Aliases:\n  {rendered_aliases}\n{separator}"

# Structured output schema used by actors_node: the list of actor names chosen by the LLM.
class ActorList(BaseModel):
  # Actor names returned by the model; actors_node writes this list into state["actors"].
  actors: List[str] = Field(description="A list of actors who perform actions in the requirement.")
  
# Structured output schema used by actors_alias_node: one ActorAlias entry per actor.
class ActorAliasMapping(BaseModel):
  # actors_alias_node stores this list in state["actor_aliases"].
  mappings: List[ActorAlias] = Field(description="List of actor-alias mappings")

# A single extracted use case and the evidence it was derived from.
# (Kept as `#` comments rather than a class docstring so the model schema is unchanged.)
class UseCase(BaseModel):
  usecase: str = Field(description="The name of the use case (e.g., 'Place Order')")
  actor: str = Field(description="The actor who performs this use case")
  verb_phrase: str = Field(description="The extracted verb phrase")
  sentence: str = Field(description="The sentence where this use case was found")

  def __str__(self) -> str:
    """Render the use case as four labelled lines followed by a 30-dash rule."""
    report_lines = [
      f"USECASE: {self.usecase}",
      f"Actor: {self.actor}",
      f"Verb Phrase: {self.verb_phrase}",
      f"Sentence: {self.sentence}",
      "-" * 30,
    ]
    return "\n".join(report_lines)

In [94]:
# State schema shared by all nodes of the graph.
class GraphState(TypedDict):
  input_text: str     # Requirement text supplied by the user
  doc: Doc            # spaCy Doc obtained by running the nlp pipeline on the text
  actors: List[str]   # Final list of actors (seeded with candidate noun chunks, then overwritten by actors_node)
  actor_aliases: List[ActorAlias] # Aliases found for each actor
  usecases: List[UseCase] # List of use cases
  svo_elements: Annotated[List[Dict[str,str]], operator.add]  # Annotated with operator.add, which LangGraph uses as the reducer for this key (updates are concatenated, not overwritten)
  validation_actor_result: ValidationResult  # Verdict written by actors_validate_node; absent from the initial state

In [None]:
# actors node
def actors_node(state: GraphState):
  """Ask the LLM to select the canonical actors among the candidate noun chunks.

  Reads ``state["actors"]`` (the candidate chunks) and ``state["input_text"]``,
  and returns a partial state update whose ``"actors"`` key holds the list
  produced by the model, i.e. the candidates are replaced by the result.

  NOTE(review): because input and output share the ``"actors"`` key, running
  this node a second time would feed it its own previous output instead of the
  original noun chunks.
  """
  candidate_chunks = ", ".join(state["actors"])

  system_prompt = (
    "You are a Senior Systems Analyst and Linguistic Expert. Your task is to perform "
    "Entity Extraction specifically for 'Actors' in a system description or user story. "
    "An 'Actor' is defined as a person, organization, or external system that "
    "performs actions, initiates processes, or interacts with the system described."
  )

  user_prompt = f"""
  I will provide you with a raw text and a list of potential 'Noun Chunks' extracted by a parser.

  ### RULES:
  1. **Filtering**: From the 'Candidate Noun Chunks', select only those that function as an active agent (Actor) in the 'Raw Text'.
  2. **Standardization**: Convert all extracted actors to their **singular form** (e.g., 'customers' -> 'customer').
  3. **Cleaning**: Remove any unnecessary articles (a, an, the) and honorifics.
  4. **Context Check**: Ensure the noun chunk is actually performing an action in the text, not just being mentioned as an object.
  5. **Exclude Self-References**: Do NOT include 'the system', 'the software', or 'the application' as an Actor if it refers to the system being described. These are internal components, not external actors.
  6. **External Systems**: Only include other specific systems if they are external entities that your system interacts with (e.g., 'Payment Gateway', 'External Database').
  7. **Entity Resolution (STRICT)**: 
     - You must group noun chunks that represent the same **Logical Role**. 
     - **CRITERIA**: If 'user', 'customer', 'shopper', and 'buyer' are used interchangeably or perform the same sequence of actions, you MUST group them.
     - **PREFERENCE**: Choose the most domain-specific name (e.g., prefer 'customer' over 'user').
     - NEVER return both 'user' and 'customer' if they refer to the same person performing actions in the text.
  
  ### EXAMPLE:
  - Raw Text: "The user logs into the store. Then the customer searches for a product and the shopper adds it to the cart."
  - Candidate Noun Chunks: user, customer, shopper, product, cart
  - Analysis: 'user', 'customer', and 'shopper' all perform actions in the purchasing flow. They represent one logical role.
  - Final Actors: ["customer"]   
  
  ### INPUT DATA:
  - Raw Text: {state['input_text']}
  - Candidate Noun Chunks: {candidate_chunks}

  ### OUTPUT INSTRUCTIONS:
  Return a deduplicated list of singularized, canonical actor names. No synonyms allowed in the final list.
  """

  # Constrain the reply to the ActorList schema so the result can be read as a list.
  actor_llm = model.with_structured_output(ActorList)
  messages = [
    ("system", system_prompt),
    ("human", user_prompt),
  ]
  response: ActorList = actor_llm.invoke(messages)

  return {"actors": response.actors}

In [None]:
  # actors_alias_node

def actors_alias_node(state: GraphState):
  """Ask the LLM to map each canonical actor to its aliases and sentence numbers.

  Sentences are taken from ``state["doc"].sents``, stripped and numbered from 1;
  the actor names come from ``state["actors"]``. Returns a partial state update
  whose ``"actor_aliases"`` key holds the mappings produced by the model.
  """
  # LLM input: one "<number>. <sentence>" line per sentence of the parsed document.
  numbered_lines = [
    f"{number}. {sentence.text.strip()}"
    for number, sentence in enumerate(state.get("doc").sents, start=1)
  ]
  input_sentences = "\n".join(numbered_lines)

  actors_list = ", ".join(state.get("actors"))

  system_prompt = (
    "You are an expert in Natural Language Processing and Entity Resolution. "
    "Your task is to identify all alternative references (aliases) to specific actors "
    "in a given text and map EACH UNIQUE ALIAS to the specific sentences where it appears."
  )

  user_prompt = f"""
  Analyze the following numbered sentences to perform Alias Resolution and Coreference Mapping for the specified Canonical Actors.

  ### CANONICAL ACTORS (Target Entities):
  {actors_list}

  ### PRE-SEGMENTED SENTENCES:
  {input_sentences}

  ### TASK:
  For each Canonical Actor in the list above, you must find EVERY mention of them in the text, including:
  1. **Exact Matches**: The actor's name itself.
  2. **Semantic Aliases (Synonyms)**: Different nouns used to refer to the same role (e.g., if 'shopper' or 'buyer' is used to refer to the 'customer' in this context).
  3. **Pronouns**: Words like "he", "she", "they", "him", "her", "their" that point back to the actor.
  4. **Specific Roles/Titles**: Variations like "the administrator" for "admin".

  ### CRITICAL RULES:
  - **Mapping**: Every alias found MUST be mapped to one of the provided Canonical Actors.
  - **Differentiation**: Treat "customer", "the customer", and "shopper" as separate alias entries, but group them under the SAME Canonical Actor.
  - **Sentence Tracking**: For each unique alias, provide the exact sentence numbers where it appears.
  - **Contextual Awareness**: Only map a synonym (like 'shopper') to a Canonical Actor (like 'customer') if the text clearly implies they are the same entity.

  ### OUTPUT EXAMPLE STRUCTURE:
  If the Actor is "customer":
  - "customer" appears in sentences [1, 5]
  - "the shopper" appears in sentences [2]
  - "he" appears in sentences [3, 6]

  ### FORMATTING REQUIREMENT:
  Return the mapping as a structured list of AliasItems for each actor. Ensure each alias variation is a distinct entry in the 'aliases' list.
  """

  # Constrain the reply to the ActorAliasMapping schema.
  alias_llm = model.with_structured_output(ActorAliasMapping)
  messages = [
    ("system", system_prompt),
    ("human", user_prompt),
  ]
  response: ActorAliasMapping = alias_llm.invoke(messages)

  return {"actor_aliases": response.mappings}

In [97]:
#revalidate actors list

def actors_validate_node(state: GraphState):
  """Ask the LLM to validate the actor list and alias mappings against the input text.

  Reads ``state["actors"]``, ``state["actor_aliases"]``, ``state["input_text"]``
  and ``state["doc"]`` and returns a partial state update whose
  ``"validation_actor_result"`` key holds the model's ``ValidationResult``.

  Two details matter for the verdict to be meaningful:
  * the prompt only asks for the fields that ``ValidationResult`` really has
    (``is_valid`` and ``reason``);
  * the sentences are passed with the same numbering that actors_alias_node
    used (``doc.sents``, counted from 1), so the sentence indices can be checked
    against the segmentation that produced them rather than one the model guesses.
  """
  structured_llm = model.with_structured_output(ValidationResult)

  # Inputs
  input_actors = state.get("actors")
  # Tolerate a missing/None alias list instead of failing on iteration.
  input_actors_alias = "\n".join(str(alias) for alias in (state.get("actor_aliases") or []))
  input_text = state.get("input_text")

  doc = state.get("doc")
  if doc is not None:
    numbered_sentences = "\n".join(
      f"{number}. {sentence.text.strip()}"
      for number, sentence in enumerate(doc.sents, start=1)
    )
  else:
    numbered_sentences = "(sentence segmentation not available)"

  system_prompt = """You are a data validation expert specializing in entity recognition and alias validation.

Your task is to validate actors (entities) and their aliases against the original raw input to ensure:
1. All actors mentioned in the raw input are captured
2. Aliases correctly refer to their corresponding actors
3. Sentence indices are valid and correctly reference sentences in the raw input
4. Aliases appear in the referenced sentences
5. No duplicate or conflicting aliases across different actors
6. Aliases are meaningful and contextually appropriate
7. All actors have complete information (name, aliases, sentence indices)

Note: Sentence indices start from 1 and reference the numbered sentences provided with the input.


Return a structured validation result with:
- is_valid: boolean indicating if validation passed (true only when no critical issue was found)
- reason: a concise explanation listing every specific issue found; when the data is valid, briefly state that all checks passed"""

  human_prompt = f"""Please validate the following actors and their aliases against the original raw input:

**Raw Input:**
{input_text}

**Numbered Sentences (sentence indices refer to this numbering):**
{numbered_sentences}

**Actors List:**
{input_actors}

**Actor Aliases with Sentence Indices:**
{input_actors_alias}

Check for:
1. All actors from raw input are included in the actors list
2. Each actor has corresponding alias information
3. Sentence indices are valid (within range of sentences in raw input, starting from 1)
4. The actor or its aliases actually appear in the referenced sentences
5. No alias conflicts (same alias for different actors)
6. No duplicate aliases within the same actor
7. Sentence indices are correct and reference the right sentences
8. Format consistency across all ActorAlias entries
9. Completeness: each actor has at least name and sentence indices
10. No hallucinated actors or aliases not in raw input
11. No invalid sentence indices (e.g., index 0, negative numbers, or exceeding total sentence count)

Provide detailed validation results with specific examples of any issues found."""

  response: ValidationResult = structured_llm.invoke([
    ("system", system_prompt),
    ("human", human_prompt)
  ])

  print(response)

  return {
    "validation_actor_result": response
  }
  
def decide_after_actor_validation(state:GraphState):
  """Pick the branch label to follow after actor validation.

  Returns ``"valid"`` when ``state["validation_actor_result"]`` is present and
  its ``is_valid`` flag is truthy, otherwise ``"invalid"``.
  """
  verdict = state.get("validation_actor_result")
  if not verdict or not verdict.is_valid:
      # Missing or failed validation: the data needs reprocessing or fixing.
      return "invalid"
  # Validation passed: continue with the normal flow.
  return "valid"

In [98]:
#define usecase node

def define_usecase_node(state: GraphState):
  """Placeholder for use-case extraction; currently leaves the use cases unchanged.

  A node's return value is treated by LangGraph as a state *update*, and keys
  declared with a reducer are merged rather than overwritten. Returning the
  whole ``state`` would therefore push ``svo_elements`` through its
  ``operator.add`` reducer again and duplicate that list. Only the key this
  node owns is returned.

  Returns:
    A partial state update containing just ``"usecases"``.
  """
  # TODO: derive the use cases from the state; for now pass the current value through.
  return {"usecases": list(state.get("usecases", []))}

In [99]:
# Assemble the graph as a straight line:
#   START -> actors_node -> actors_alias_node -> END
workflow = StateGraph(GraphState)

# Each node is registered under its function name.
for node_fn in (actors_node, actors_alias_node):
  workflow.add_node(node_fn.__name__, node_fn)
# workflow.add_node("actors_validate_node", actors_validate_node)

# Connect every stage to the one that follows it.
pipeline = (START, "actors_node", "actors_alias_node", END)
for source, target in zip(pipeline, pipeline[1:]):
  workflow.add_edge(source, target)

# Validation loop, currently disabled. Enabling it also requires registering
# actors_validate_node above and dropping the direct actors_alias_node -> END edge.
# workflow.add_edge("actors_alias_node", "actors_validate_node")
# workflow.add_conditional_edges(
#   "actors_validate_node",
#   decide_after_actor_validation,
#   {
#     "valid": END,
#     "invalid": "actors_node"
#   }
# )

app = workflow.compile()

In [100]:
# Read the requirement text. The variable is deliberately not called `input`,
# which would shadow the built-in input() for the rest of the kernel session.
with open("./input.txt", "r", encoding="UTF-8") as f:
  raw_text = f.read()

input_text = preprocessing.normalize.whitespace(raw_text)

nlp = spacy.load("en_core_web_lg")
doc = nlp(input_text)

# Candidate actors: noun chunks whose root is neither a pronoun nor one of the
# words that denote the system itself.
candidate_chunks = [
  chunk.text
  for chunk in extract.noun_chunks(doc, min_freq=1)
  if chunk.root.pos_ != 'PRON'
  and chunk.root.lemma_.lower() not in INTERNAL_SYSTEM_KEYWORDS
]
# A chunk that occurs several times only needs to be offered to the LLM once;
# dict.fromkeys drops repeats while keeping first-seen order.
chunks = list(dict.fromkeys(candidate_chunks))

# Initial state for the process
initial_state = {
  "input_text": input_text,
  "doc": doc,
  "actors": chunks,
  "actor_aliases": [],
  "usecases": [],
  "svo_elements": [],
}

result: GraphState = app.invoke(initial_state)

print("\n=== ACTORS ===")
print(result.get("actors"))
print("\n=== ACTOR ALIASES ===")
for item in result.get("actor_aliases", []):
  print(item)


=== ACTORS ===
['customer', 'payment gateway', 'store manager']

=== ACTOR ALIASES ===
ACTOR: customer
  Aliases:
  - A user: sentences [1]
  - The customer: sentences [2]
  - the shopper: sentences [3]
  - they: sentences [3]
  - The buyer: sentences [4]
--------------------------------------------------
ACTOR: payment gateway
  Aliases:
  - the Payment Gateway: sentences [5]
  - It: sentences [6]
--------------------------------------------------
ACTOR: store manager
  Aliases:
  - the Store Manager: sentences [7]
  - he: sentences [7]
--------------------------------------------------
