In [134]:
from typing import Dict, List, TypedDict
import spacy
from textacy import extract, preprocessing
from typing import Annotated
import operator
from langgraph.graph import StateGraph, START, END
from spacy.tokens import Doc
from langchain.chat_models import init_chat_model
from dotenv import load_dotenv
from pydantic import BaseModel, Field

In [135]:
# Load environment variables from a local .env file before the chat model is
# created (presumably this is where the OpenAI API key lives - TODO confirm).
load_dotenv()

# Lemmas that must not be counted as actors: a noun chunk whose root lemma is in
# this set is discarded when the candidate list is built.
INTERNAL_SYSTEM_KEYWORDS = {
  "system",
  "software",
  "application",
  "app",
  "platform",
}

# Chat model used by the graph nodes defined in this notebook.
model = init_chat_model("gpt-5-mini", model_provider="openai")

In [136]:
# One actor together with every alternative reference to it and the sentences
# in which the actor (or one of its aliases) appears.
#
# NOTE: documented with comments rather than a class docstring on purpose:
# pydantic copies a class docstring into the generated JSON schema, which
# would change the schema handed to the LLM for structured output.
class ActorAlias(BaseModel):
  actor: str = Field(description="The original actor's name")
  aliases: List[str] = Field(description="List of alternative names/references for this actor")
  sentences: List[str] = Field(description="Sentences where the actor or its aliases appear")

  def __str__(self) -> str:
    """Render the mapping as a small human-readable report block.

    Returns:
      A multi-line string with the actor name, a comma-separated alias list,
      one bullet per sentence, and a closing 30-dash separator. Empty
      ``aliases`` or ``sentences`` are shown as ``None`` instead of leaving
      an empty bullet behind.
    """
    # Join the aliases with commas.
    alias_str = ", ".join(self.aliases) if self.aliases else "None"

    # Format the sentences as a bulleted list, one bullet per line. With no
    # sentences, print "None" on the header line rather than a dangling "- ".
    if self.sentences:
      sentences_str = "".join(f"\n    - {sentence}" for sentence in self.sentences)
    else:
      sentences_str = " None"

    return (
        f"ACTOR: {self.actor}\n"
        f"Aliases: {alias_str}\n"
        f"Sentences:{sentences_str}\n"
        f"{'-'*30}"
    )
  
# Structured-output schema for the actor extraction step: a flat list of
# actor names.
class ActorList(BaseModel):
  actors: List[str] = Field(
    description="A list of actors who perform actions in the requirement.",
  )
  
# Structured-output schema for the alias resolution step: one ActorAlias entry
# per tracked actor.
class ActorAliasMapping(BaseModel):
  mappings: List[ActorAlias] = Field(
    description="List of actor-alias mappings",
  )

In [137]:
# State declaration for the graph: the keys every node can read and update.
class GraphState(TypedDict):
  input_text: str     # Requirement text supplied by the user
  doc: Doc            # Doc returned by calling the spaCy nlp pipeline on the text
  actors: List[str]   # Final list of actors (seeded with the candidate noun chunks before actors_node runs)
  actor_aliases: List[ActorAlias] # Alias mappings for the actors
  # Annotated with operator.add; presumably LangGraph uses this as the reducer
  # that concatenates list updates from nodes - TODO confirm. No node visible
  # in this notebook writes this key yet.
  svo_elements: Annotated[List[Dict[str,str]], operator.add]

In [138]:
# actors node
def actors_node(state: GraphState):
  """Ask the LLM which candidate noun chunks are real actors.

  On entry ``state["actors"]`` holds the candidate noun chunks; the returned
  update replaces that key with the actor list produced by the model.

  Args:
    state: Graph state; ``input_text`` and ``actors`` are read.

  Returns:
    A partial state update ``{"actors": [...]}`` taken from the structured
    ``ActorList`` response.
  """
  extractor = model.with_structured_output(ActorList)
  candidate_chunks = ", ".join(state["actors"])
  raw_text = state["input_text"]

  system_prompt = (
    "You are a Senior Systems Analyst and Linguistic Expert. Your task is to perform "
    "Entity Extraction specifically for 'Actors' in a system description or user story. "
    "An 'Actor' is defined as a person, organization, or external system that "
    "performs actions, initiates processes, or interacts with the system described."
  )

  # The prompt text (including its indentation) is sent verbatim to the model.
  user_prompt = f"""
  I will provide you with a raw text and a list of potential 'Noun Chunks' extracted by a parser.

  ### RULES:
  1. **Filtering**: From the 'Candidate Noun Chunks', select only those that function as an active agent (Actor) in the 'Raw Text'.
  2. **Standardization**: Convert all extracted actors to their **singular form** (e.g., 'customers' -> 'customer').
  3. **Cleaning**: Remove any unnecessary articles (a, an, the) and honorifics.
  4. **Context Check**: Ensure the noun chunk is actually performing an action in the text, not just being mentioned as an object.
  5. **Exclude Self-References**: Do NOT include 'the system', 'the software', or 'the application' as an Actor if it refers to the system being described. These are internal components, not external actors.
  6. **External Systems**: Only include other specific systems if they are external entities that your system interacts with (e.g., 'Payment Gateway', 'External Database').
  
  ### INPUT DATA:
  - Raw Text: {raw_text}
  - Candidate Noun Chunks: {candidate_chunks}

  ### OUTPUT INSTRUCTIONS:
  Return only the final list of singularized actors.
  """

  messages = [
    ("system", system_prompt),
    ("human", user_prompt),
  ]
  response: ActorList = extractor.invoke(messages)

  return {"actors": response.actors}

In [139]:
# actors_alias_node

def actors_alias_node(state: GraphState):
  """Resolve the aliases of each actor and the sentences that mention them.

  The sentences come from the spaCy ``Doc`` in the state, are numbered from 1,
  and are sent to the LLM together with the actor list.

  Args:
    state: Graph state; ``actors`` and ``doc`` are read.

  Returns:
    A partial state update ``{"actor_aliases": [...]}`` holding the
    ``ActorAlias`` mappings from the structured response, or an empty list
    when there are no actors to track.
  """
  actors = state.get("actors") or []
  if not actors:
    # Nothing to resolve: skip the LLM call instead of sending an empty
    # "ACTORS TO TRACK" section, which would only invite made-up mappings.
    return {"actor_aliases": []}

  structured_llm = model.with_structured_output(ActorAliasMapping)

  # Input for the LLM: one numbered line per sentence of the parsed document.
  numbered_sentences = "\n".join(
    f"{number}. {sentence.text.strip()}"
    for number, sentence in enumerate(state["doc"].sents, start=1)
  )

  actors_list = ", ".join(actors)

  system_prompt = (
    "You are an expert in Natural Language Processing and Entity Resolution. "
    "Your task is to identify all alternative references (aliases) to specific actors "
    "in a given text and determine which sentences contain these references."
  )

  # The prompt text (including its indentation) is sent verbatim to the model.
  user_prompt = f"""
  Analyze the following pre-segmented numbered sentences and identify all references to the given actors.

  ### ACTORS TO TRACK:
  {actors_list}

  ### PRE-SEGMENTED SENTENCES:
  {numbered_sentences}

  ### ADVANTAGES OF PRE-SEGMENTATION:
  - Sentences are already properly split by a trained NLP model
  - Sentence boundaries are accurate (handles abbreviations, numbers, etc.)
  - You can focus on coreference resolution instead of sentence splitting

  ### TASK:
  For each actor:
  1. Find all ways it is referenced (direct mentions, pronouns, role titles, etc.)
  2. List all unique aliases/references  
  3. Reference sentences by their NUMBER (e.g., "1", "3", "5")

  ### COREFERENCE RULES:
  - Track pronoun chains across sentences (e.g., sent 1: "customer" -> sent 2: "he" -> sent 3: "the user")
  - Resolve possessive pronouns (his, her, their) to their antecedents
  - Identify role-based references (the admin, the buyer, etc.)
  - Connect demonstrative references (this person, that user)
  - Consider sentence proximity for ambiguous references

  ### OUTPUT FORMAT:
  In the 'sentences' field, include BOTH the number AND the full text:
  Example: "1. The customer logs in."

  This allows for easy verification and mapping back to the source.
  """

  response: ActorAliasMapping = structured_llm.invoke([
    ("system", system_prompt),
    ("human", user_prompt)
  ])

  return {
    "actor_aliases": response.mappings
  }

In [140]:
def _build_workflow() -> StateGraph:
  """Assemble the linear graph START -> actors_node -> actors_alias_node -> END."""
  graph = StateGraph(GraphState)

  # Register the nodes first, then wire them in execution order.
  graph.add_node("actors_node", actors_node)
  graph.add_node("actors_alias_node", actors_alias_node)

  graph.add_edge(START, "actors_node")
  graph.add_edge("actors_node", "actors_alias_node")
  graph.add_edge("actors_alias_node", END)
  return graph


workflow = _build_workflow()

# Compiled, runnable form of the graph; invoked with an initial state dict.
app = workflow.compile()

In [142]:
# Read the requirement text. The variables are deliberately not named `input`,
# so the builtin input() stays usable for the rest of the kernel session.
with open("./input.txt", "r", encoding="UTF-8") as f:
  raw_text = f.read()

input_text = preprocessing.normalize.whitespace(raw_text)

nlp = spacy.load("en_core_web_lg")
doc = nlp(input_text)

# Candidate actors: noun chunks whose root is not a pronoun and whose root
# lemma is not one of the internal-system keywords.
chunks = [
  chunk.text
  for chunk in extract.noun_chunks(doc, min_freq=1)
  if chunk.root.pos_ != 'PRON' and chunk.root.lemma_.lower() not in INTERNAL_SYSTEM_KEYWORDS
]


# Initial state for the run; "actors" starts out as the candidate chunks and
# is replaced by actors_node.
initial_state = {
  "input_text": input_text,
  "doc": doc,
  "actors": chunks,
  "actor_aliases": [],
  "svo_elements": [],
}

result: GraphState = app.invoke(initial_state)

print(result.get("actors"))
for item in result.get("actor_aliases"):
  print(item)

['librarian', 'student', 'automated kiosk', 'administrator']
ACTOR: librarian
Aliases: The librarians, librarians
Sentences:
    - 1. The librarians manage the digital catalog by adding new book records and updating existing entries.
------------------------------
ACTOR: student
Aliases: a student, the student, their, borrowers, the borrowers, student
Sentences:
    - 2. When a student searches for a book, the system displays the availability status in real-time.
    - 3. If the student decides to borrow a book, the automated kiosk scans their ID card and records the transaction in the database.
    - 4. After the loan period expires, the management system sends email notifications to the borrowers.
------------------------------
ACTOR: automated kiosk
Aliases: the automated kiosk, automated kiosk
Sentences:
    - 3. If the student decides to borrow a book, the automated kiosk scans their ID card and records the transaction in the database.
------------------------------
ACTOR: adminis