## Module 1: NER and POS Tagging

In [1]:
import re
import uuid
import json
from typing import List, Dict, Any, Optional
from datetime import datetime

import spacy
from spacy.matcher import Matcher
from pydantic import BaseModel, Field, ValidationError
import dateparser
import google.generativeai as genai


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
GEMINI_API_KEY = "AIzaSyDwNbpu5hvUJ2wZYXMY14zQlsevLdlr2qw"
genai.configure(api_key=GEMINI_API_KEY)

Creation of the Entity Class

In [3]:
class Entity(BaseModel):
    text: str
    label: str
    start: int
    end: int
    confidence: float = 1.0


class EntityCollection(BaseModel):
    conversation_id: str
    timestamp: str
    raw_text: str
    sentences: List[str]
    entities: List[Entity]

    def to_endpoint_json(self) -> Dict[str, Any]:
        return {
            "conversation_id": self.conversation_id,
            "input": [{"role": "user", "content": self.raw_text}],
            "sentences": self.sentences,
            "entities": [e.dict() for e in self.entities],
        }


class FinalStructuredOutput(BaseModel):
    """This is the final JSON we expect after the LLM post-processor."""
    conversation_id: str
    input: List[Dict[str, Any]]
    sentences: List[str]
    entities: List[Entity]
    # Optional, but useful for downstream routing
    contextual_tags: Optional[Dict[str, Any]] = Field(default=None)


In [4]:
class TextPreprocessor:
    """Cleans elderly speech before NLP: casing, fillers, en-SG removal, basic typo-normalisation."""

    _FILLERS = [
        r"\buh+\b", r"\bum+\b", r"\bah+\b", r"\bhmm+\b", r"\byou know\b",
        r"\blike\b", r"\bkind of\b", r"\bsort of\b",
    ]
    _EN_SG = [r"\blah\b", r"\bleh\b", r"\blor\b", r"\bmeh\b", r"\bsia\b", r"\bhor\b"]

    def __init__(self, keep_case: bool = True):
        self.keep_case = keep_case

    def sentence_segment(self, text: str) -> List[str]:
        segs = re.split(r"(?<=[\.\!\?])\s+", text.strip())
        segs = [s.strip() for s in segs if s.strip()]
        return segs if segs else [text.strip()]

    def normalize(self, text: str) -> str:
        t = text.strip()
        if not self.keep_case:
            t = t.lower()
        for pat in self._FILLERS + self._EN_SG:
            t = re.sub(pat, "", t, flags=re.IGNORECASE)
        t = re.sub(r"\bI\'m\b", "I am", t)
        t = re.sub(r"\bcan\'t\b", "cannot", t)
        t = re.sub(r"\bwon\'t\b", "will not", t)
        # Collapse spaces
        t = re.sub(r"\s+", " ", t).strip()
        return t

    def process(self, text: str) -> Dict[str, Any]:
        cleaned = self.normalize(text)
        sentences = self.sentence_segment(cleaned)
        return {"cleaned": cleaned, "sentences": sentences}


In [18]:
class NER_POS_Extractor:
    """
    Extracts both NER (semantic entities like DATE, ORG, EVENT) 
    and POS (grammatical roles like VERB, NOUN, etc.) and embeds the POS tags
    into the sentence for easier processing by downstream models (e.g., Gemini).
    """

    def __init__(self, model: str = "en_core_web_lg"):
        self.nlp = spacy.load(model)

    def embed_pos_in_text(self, text: str) -> str:
        """
        Embeds POS tags directly next to the word (e.g., I <NOUN> love <VERB> playing).
        Also captures verbs as <ACTION>.
        """
        doc = self.nlp(text)
        pos_embedded_text = text

        # Loop over the tokens and insert POS tags after each token
        for token in reversed(doc):
            pos_tag = token.pos_  # Extract the POS tag (e.g., "NOUN", "VERB")
            token_text = token.text

            # If the POS tag is a verb, mark it as an action
            if pos_tag == "VERB":
                pos_tag = "ACTION"  # Custom tag for actions (verbs)

            # Insert POS tag directly after the word in the text
            pos_embedded_text = (
                pos_embedded_text[:token.idx + len(token_text)]  # Text up to the token
                + f" <{pos_tag}>"  # POS tag after the word (or ACTION for verbs)
                + pos_embedded_text[token.idx + len(token_text):]  # Text after the token
            )

        return pos_embedded_text

    def extract(self, cleaned_text: str, sentences: List[str]) -> EntityCollection:
        doc = self.nlp(cleaned_text)
        entities: List[Entity] = []

        # -------------------------------
        # 1) NER: Semantic entity labels
        # -------------------------------
        for ent in doc.ents:
            entities.append(
                Entity(
                    text=ent.text,
                    label=f"NER_{ent.label_}",  # make it clear this came from NER
                    start=ent.start_char,
                    end=ent.end_char,
                    confidence=1.0
                )
            )

        # -------------------------------
        # 2) Embed POS tags into the text for downstream models like Gemini
        # -------------------------------
        pos_embedded_text = self.embed_pos_in_text(cleaned_text)

        # -------------------------------
        # 3) Wrap results in EntityCollection
        # -------------------------------
        return EntityCollection(
            conversation_id=str(uuid.uuid4()),
            timestamp=datetime.utcnow().isoformat(),
            raw_text=pos_embedded_text,  # The processed text with POS tags embedded
            sentences=sentences,
            entities=entities
        )

In [24]:
class GeminiPostProcessor:
    def __init__(self, api_key: str, model_name: str = "gemini-2.5-flash-lite"):
        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel(model_name)

    def parse(self, collection: EntityCollection) -> Dict[str, Any]:
        print(collection.raw_text)
        system_prompt = """You are an assistant that converts elderly conversation into structured JSON.
          The language used will also be singlish based, attempt to understand the singlish nuances within the statements.
          The schema must look like this:

          {
            "conversation_id": "...",
            "input": [{"role": "user", "content": "..."}],
            "sentences": [...],
            "entities": [
              {"text": "...", "label": "ACTIVITY/FOOD/DATE/EVENT/FACILITY/ACTION", "start": ..., "end": ...}
            ]
          }
          """

        user_prompt = f"""
          Conversation text (POS-tagged):
          {collection.raw_text}

          NER hints:
          {json.dumps([e.dict() for e in collection.entities if e.label.startswith("NER_")], indent=2)}

          Action hints:
          {json.dumps([e.dict() for e in collection.entities if e.label == "ACTION"], indent=2)}
          """

        # Get response from the model
        response = self.model.generate_content([system_prompt, user_prompt])
        result_text = response.text.strip()

        # --- Clean output ---
        if result_text.startswith("```json") and result_text.endswith("```"):
            result_text = result_text[7:-3].strip()  # Remove the ```json and closing ```

        # 2. Extract the JSON content between braces {}
        match = re.search(r"\{.*\}", result_text, re.DOTALL)
        if match:
            result_text = match.group(0)  # Extract valid JSON

        # --- Try to parse the cleaned JSON ---
        try:
            return json.loads(result_text)
        except Exception as e:
            return {"error": f"Invalid JSON from Gemini: {e}", "raw": result_text}

    def embed_actions_in_entities(self, collection: EntityCollection) -> EntityCollection:
        """
        Identifies all verbs in the raw text and adds them as entities with the label "ACTION".
        """
        doc = self.model.nlp(collection.raw_text)
        action_entities = []

        # Identify verbs (actions) and create Entity objects for them
        for token in doc:
            if token.pos_ == "VERB":
                action_entities.append(
                    Entity(
                        text=token.text,
                        label="ACTION",
                        start=token.idx,
                        end=token.idx + len(token.text)
                    )
                )

        # Add action entities to the existing collection of entities
        collection.entities.extend(action_entities)
        return collection

In [25]:
class NaturalLanguageToJSONPipeline:
    def __init__(self):
        self.pre = TextPreprocessor()
        self.extractor = NER_POS_Extractor()
        self.llm = GeminiPostProcessor(api_key=GEMINI_API_KEY)

    def run(self, text: str) -> Dict[str, Any]:
        print("Beginnnign Preprocessing of texts")
        prep = self.pre.process(text)
        print("done")

        cleaned, sentences = prep["cleaned"], prep["sentences"]
        print("Starting NER POS Extraction Process")
        collection = self.extractor.extract(cleaned, sentences)
        print("Done")

        print("Starting LLM Parse")
        structured = self.llm.parse(collection)
        print("Done")

        try:
            validated = FinalStructuredOutput(**structured)
            return validated.dict()
        except ValidationError as e:
            return {"error": "Validation failed", "details": e.errors(), "raw": structured}

In [28]:
import json

if __name__ == "__main__":
    # Example elderly conversation input
    text = (
        "wa damn sian sia, forgot to eat my medicine sia"
    )

    # Build pipeline
    pipeline = NaturalLanguageToJSONPipeline()

    # Run pipeline on input text
    output = pipeline.run(text)

    # Pretty-print JSON output
    print(json.dumps(output, indent=2, ensure_ascii=False))

Beginnnign Preprocessing of texts
done
Starting NER POS Extraction Process
Done
Starting LLM Parse
wa <INTJ> damn <INTJ> sian <PROPN> , <PUNCT> forgot <ACTION> to <PART> eat <ACTION> my <PRON> medicine <NOUN>
Done
{
  "conversation_id": "conv_001",
  "input": [
    {
      "role": "user",
      "content": "wa damn sian, forgot to eat my medicine"
    }
  ],
  "sentences": [
    "wa damn sian, forgot to eat my medicine"
  ],
  "entities": [
    {
      "text": "forgot to eat",
      "label": "ACTION",
      "start": 14,
      "end": 27,
      "confidence": 1.0
    },
    {
      "text": "medicine",
      "label": "FOOD",
      "start": 31,
      "end": 39,
      "confidence": 1.0
    }
  ],
  "contextual_tags": null
}


C:\Users\Tay Han\AppData\Local\Temp\ipykernel_32932\1596187666.py:23: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  return validated.dict()


In [1]:
# Environment setup
pip install -U "transformers>=4.44" "datasets>=2.20" "accelerate>=0.33" "peft>=0.11.0" evaluate sacrebleu


SyntaxError: invalid syntax (2410008431.py, line 2)