In [110]:
from typing import Dict, List, TypedDict
import spacy
from textacy import extract, preprocessing
from typing import Annotated
import operator
from langgraph.graph import StateGraph, START, END
from spacy.tokens import Doc
from langchain.chat_models import init_chat_model
from dotenv import load_dotenv
from pydantic import BaseModel, Field

In [None]:
# Pull settings from a local .env file into the process environment
# (presumably the OpenAI API key needed by the chat model below).
load_dotenv()

# Words that are never counted as Actors: a noun chunk whose root lemma is one
# of these refers to the system being described, not to an external actor.
INTERNAL_SYSTEM_KEYWORDS = {
  "system",
  "software",
  "application",
  "app",
  "platform",
}

# Chat model used by actors_node for the actor-extraction call.
model = init_chat_model("gpt-5-mini", model_provider="openai")

In [112]:
# State declaration for the graph
class GraphState(TypedDict):
  """State dictionary shared by the nodes of the LangGraph workflow."""
  input_text: str     # Input requirement text supplied by the user
  doc: Doc            # Doc returned by calling spaCy's nlp pipeline on the text
  actors: List[str]   # Candidate noun chunks on entry; final list of actors once actors_node has run
  # operator.add is attached as the LangGraph reducer for this key, so list
  # updates returned by nodes are concatenated onto the existing value instead
  # of replacing it. Presumably subject-verb-object records; no node visible
  # in this notebook writes to it yet.
  svo_elements: Annotated[List[Dict[str,str]], operator.add]

In [113]:
# Structured-output schema that actors_node asks the LLM to fill in.
# NOTE: documented with comments rather than a class docstring on purpose:
# pydantic exposes a model's docstring as its JSON-schema description, so
# adding one would change what is sent to the model.
class ActorList(BaseModel):
  # The description is a runtime string forwarded to the LLM as part of the
  # schema. It is Vietnamese for roughly: "List of actors that perform
  # actions in the text".
  actors: List[str] = Field(description="Danh sách các tác nhân (Actors) thực hiện hành động trong văn bản")

In [114]:
# actors node
def actors_node(state: GraphState):
  """Ask the LLM to reduce the candidate noun chunks to the real actors.

  Args:
    state: Graph state. Reads ``input_text`` (the raw requirement text) and
      ``actors`` (candidate noun chunks prepared before the graph is invoked).

  Returns:
    A partial state update ``{"actors": [...]}``: the actor names returned by
    the model through the ``ActorList`` schema, or an empty list when there
    are no candidates to choose from.
  """
  # Repeated noun chunks carry no extra information for the model; drop them
  # (first occurrence wins, order preserved) to keep the prompt short.
  unique_chunks = list(dict.fromkeys(state["actors"]))
  if not unique_chunks:
    # Rule 1 of the prompt only lets the model pick among the candidates, so
    # with none there is nothing to select and the LLM call can be skipped.
    return {"actors": []}

  # Plain comma-separated text reads better in the prompt than a Python list repr.
  candidate_chunks = ", ".join(unique_chunks)

  structured_llm = model.with_structured_output(ActorList)

  system_prompt = (
    "You are a Senior Systems Analyst and Linguistic Expert. Your task is to perform "
    "Entity Extraction specifically for 'Actors' in a system description or user story. "
    "An 'Actor' is defined as a person, organization, or external system that "
    "performs actions, initiates processes, or interacts with the system described."
  )

  user_prompt = f"""
  I will provide you with a raw text and a list of potential 'Noun Chunks' extracted by a parser.

  ### RULES:
  1. **Filtering**: From the 'Candidate Noun Chunks', select only those that function as an active agent (Actor) in the 'Raw Text'.
  2. **Standardization**: Convert all extracted actors to their **singular form** (e.g., 'customers' -> 'customer').
  3. **Cleaning**: Remove any unnecessary articles (a, an, the) and honorifics.
  4. **Context Check**: Ensure the noun chunk is actually performing an action in the text, not just being mentioned as an object.
  5. **Exclude Self-References**: Do NOT include 'the system', 'the software', or 'the application' as an Actor if it refers to the system being described. These are internal components, not external actors.
  6. **External Systems**: Only include other specific systems if they are external entities that your system interacts with (e.g., 'Payment Gateway', 'External Database').
  
  ### INPUT DATA:
  - Raw Text: {state['input_text']}
  - Candidate Noun Chunks: {candidate_chunks}

  ### OUTPUT INSTRUCTIONS:
  Return only the final list of singularized actors.
  """

  # The structured-output wrapper returns an ActorList instance.
  response = structured_llm.invoke([
    ("system", system_prompt),
    ("human", user_prompt)
  ])

  return {
    "actors": response.actors
  }

In [115]:
# Build the graph over the GraphState schema.
workflow = StateGraph(GraphState)

# Register actors_node under the name the edges below refer to.
workflow.add_node("actors_node", actors_node)

# Linear flow: START -> actors_node -> END.
workflow.add_edge(START, "actors_node")
workflow.add_edge("actors_node", END)

# Compile the graph definition into the runnable object invoked further down.
app = workflow.compile()

In [116]:
# Read the requirement text. The variable is deliberately not called `input`:
# that name would shadow the builtin input() for every later cell in the kernel.
with open("./input.txt", "r", encoding="UTF-8") as f:
  raw_text = f.read()

# Normalize whitespace before parsing; kept under a separate name so the
# cell can be re-run without losing the text as it was read from disk.
input_text = preprocessing.normalize.whitespace(raw_text)

nlp = spacy.load("en_core_web_lg")
doc = nlp(input_text)

# Candidate actors: every noun chunk whose root is neither a pronoun nor a
# word that refers to the system being described.
chunks = [
  chunk.text
  for chunk in extract.noun_chunks(doc, min_freq=1)
  if chunk.root.pos_ != 'PRON'
  and chunk.root.lemma_.lower() not in INTERNAL_SYSTEM_KEYWORDS
]

initial_state = {
  "input_text": input_text,
  "doc": doc,
  "actors": chunks,
  "svo_elements": []
}

result = app.invoke(initial_state)

print(result.get("actors"))

['librarian', 'student', 'automated kiosk', 'administrator']
