In [22]:
import spacy
import re
import json
from collections import defaultdict

In [23]:
# Load spaCy English model
nlp = spacy.load("en_core_web_sm")


In [24]:
# Task-related phrases
TASK_KEYWORDS = [
    "has to", "needs to", "should", "must", "is required to", "is expected to",
    "is supposed to", "is scheduled to", "is assigned to"
]

In [25]:
# Deadline patterns
TIME_PATTERNS = [
    r'by \d{1,2}\s*(am|pm)?',  # Matches "by 5 pm" or "by 7"
    r'before \w+',  # Matches "before tomorrow"
    r'tomorrow',  # Matches "tomorrow"
    r'today',  # Matches "today"
    r'in \d+ \w+',  # Matches "in 3 hours"
    r'by end of the day',  # Matches "by end of the day"
    r'within \d+ (hours|days|minutes)'  # Matches "within 3 hours"
]

In [26]:
# Categorization keywords
TASK_CATEGORIES = {
    "Personal": ["buy", "get", "shop", "visit"],
    "Academic": ["submit", "study", "complete", "assignment", "exam", "project"],
    "Work": ["send", "email", "call", "schedule", "meeting", "review"],
    "Household": ["clean", "wash", "cook", "arrange", "fix"],
    "Health": ["exercise", "run", "walk", "meditate"],
    "Finance": ["pay", "invest", "deposit", "withdraw", "budget"]
}

In [27]:
def extract_and_categorize_pipeline(text):

    doc = nlp(text)
    extracted_tasks = []

    for sent in doc.sents:
        sentence = sent.text.strip()
        sentence_lower = sentence.lower()

        if any(keyword in sentence_lower for keyword in TASK_KEYWORDS):
            task = {"who": "Unknown", "task": None, "deadline": None, "category": "Uncategorized"}

            # Extract subject (who is assigned the task)
            subjects = [token.text for token in sent if token.dep_ in {"nsubj", "nsubjpass"} and token.pos_ in {"PROPN", "PRON"}]
            if subjects:
                task["who"] = subjects[0]  # Take the first detected subject

            # Extract task description
            for keyword in TASK_KEYWORDS:
                if keyword in sentence_lower:
                    task_start = sentence_lower.find(keyword) + len(keyword)
                    task_text = sentence[task_start:].strip()
                    task["task"] = re.sub(r'\bby \d{1,2}\s*(am|pm)?\b', '', task_text).strip()
                    break

            # Extract deadline
            for pattern in TIME_PATTERNS:
                match = re.search(pattern, sentence_lower)
                if match:
                    task["deadline"] = match.group().strip()
                    break

            # Categorize task
            for category, keywords in TASK_CATEGORIES.items():
                if any(word in sentence_lower for word in keywords):
                    task["category"] = category
                    break

            if task["task"]:
                extracted_tasks.append(task)

    return extracted_tasks


In [28]:
def format_output(tasks):

    return json.dumps(tasks, indent=4)


In [29]:
# Test Cases
test_texts = [
    "Rahul wakes up early every day. He goes to college in the morning and comes back at 3 pm. At present, Rahul is outside. He has to buy the snacks for all of us. He also needs to submit his assignment by 5 pm.",
    "John should complete the project before Friday. Alice must send the email by 10 am tomorrow. They have to attend the meeting at 2 pm.",
    "David is required to pay the electricity bill today. Sarah needs to visit the dentist by the end of the day. James should invest in stocks next month.",
    "Maya has to clean the kitchen by 6 pm. Adam is supposed to arrange the books in the library. Tom needs to exercise in the morning."
]

# Run NLP pipeline
for idx, text in enumerate(test_texts, 1):
    print(f"\nTest Case {idx}:")
    extracted_tasks = extract_and_categorize_pipeline(text)
    print(format_output(extracted_tasks))



Test Case 1:
[
    {
        "who": "He",
        "task": "buy the snacks for all of us.",
        "deadline": null,
        "category": "Personal"
    },
    {
        "who": "He",
        "task": "submit his assignment .",
        "deadline": "by 5 pm",
        "category": "Academic"
    }
]

Test Case 2:
[
    {
        "who": "John",
        "task": "complete the project before Friday.",
        "deadline": "before friday",
        "category": "Academic"
    },
    {
        "who": "Alice",
        "task": "send the email  tomorrow.",
        "deadline": "by 10 am",
        "category": "Work"
    }
]

Test Case 3:
[
    {
        "who": "David",
        "task": "pay the electricity bill today.",
        "deadline": "today",
        "category": "Finance"
    },
    {
        "who": "Sarah",
        "task": "visit the dentist by the end of the day.",
        "deadline": null,
        "category": "Personal"
    },
    {
        "who": "James",
        "task": "invest in stocks next m