# Importing required lib:

In [5]:
import nltk
import spacy
import re
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords

# Manually download necessary resources

In [3]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [3]:
import subprocess
subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])

CompletedProcess(args=['python', '-m', 'spacy', 'download', 'en_core_web_sm'], returncode=0)

# Preprocessing: Step 1

In [6]:
def preprocess_text(text):
    """
    Cleans the input text by removing special characters, tokenizing, and filtering stopwords.
    """
    # Convert text to lowercase
    text = text.lower()

    # Remove special characters
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)

    # Tokenize into sentences
    sentences = sent_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    cleaned_sentences = []
    for sentence in sentences:
        words = sentence.split()
        filtered_words = [word for word in words if word not in stop_words]
        cleaned_sentences.append(" ".join(filtered_words))

    return cleaned_sentences

# Example Input
text = "Rahul wakes up early. He has to buy snacks for all of us."
print(preprocess_text(text))

['rahul wakes early buy snacks us']


# Task Identification: Step 2

In [8]:
# Load spaCy model
nlp = spacy.load("en_core_web_sm")



In [11]:
def extract_tasks(sentences):
    """
    Identifies sentences containing actionable tasks using refined heuristics.
    Returns a list of extracted tasks.
    """
    task_sentences = []
    obligation_keywords = {"has to", "should", "must", "need to", "required to", "is expected to"}

    for sentence in sentences:
        doc = nlp(sentence)
        verbs = [token.text for token in doc if token.pos_ == "VERB"]

        # Check if the sentence contains an obligation phrase
        contains_obligation = any(phrase in sentence.lower() for phrase in obligation_keywords)

        # Check if the sentence contains a deadline-related word
        contains_deadline = any(token.text.lower() in {"by", "before", "tomorrow", "next week"} for token in doc)

        # A valid task sentence should either contain an obligation phrase or a deadline
        if verbs and (contains_obligation or contains_deadline):
            task_sentences.append(sentence)

    return task_sentences

# Example Sentences
sentences = [
    "Rahul wakes up early.",  # Not a task
    "He has to buy snacks for all of us.",  #  Task
    "We are watching a movie.",  #  Not a task
    "She should complete the assignment by tomorrow.",  #  Task
    "They need to submit the report before Monday.",  #  Task
    "John is going to school.",  #  Not a task
    "Alex must clean the kitchen today."  #  Task
]

# Extract tasks
tasks = extract_tasks(sentences)
print("Extracted Tasks:", tasks)

Extracted Tasks: ['He has to buy snacks for all of us.', 'She should complete the assignment by tomorrow.', 'They need to submit the report before Monday.', 'Alex must clean the kitchen today.']


# Categorization: Step 3

In [12]:
def categorize_tasks(task_list):
    """
    Categorizes tasks into predefined categories based on keywords.
    """
    categories = {
        "Shopping": ["buy", "purchase", "order"],
        "Work/Study": ["complete", "submit", "write", "review", "study"],
        "Household Chores": ["clean", "wash", "organize", "fix", "repair"],
        "Meetings/Appointments": ["schedule", "attend", "meet", "call"]
    }

    categorized_tasks = {}

    for task in task_list:
        assigned_category = "Uncategorized"  # Default category if no match

        for category, keywords in categories.items():
            if any(keyword in task.lower() for keyword in keywords):
                assigned_category = category
                break  # Stop checking once a category is found

        # Add to categorized dictionary
        if assigned_category in categorized_tasks:
            categorized_tasks[assigned_category].append(task)
        else:
            categorized_tasks[assigned_category] = [task]

    return categorized_tasks

# Example Tasks from Step 2
tasks = [
    "He has to buy snacks for all of us.",
    "She should complete the assignment by tomorrow.",
    "They need to submit the report before Monday.",
    "Alex must clean the kitchen today.",
    "John has to attend a meeting at 5 PM."
]

# Categorize Tasks
task_categories = categorize_tasks(tasks)
print("Task Categories:", task_categories)

Task Categories: {'Shopping': ['He has to buy snacks for all of us.'], 'Work/Study': ['She should complete the assignment by tomorrow.', 'They need to submit the report before Monday.'], 'Household Chores': ['Alex must clean the kitchen today.'], 'Meetings/Appointments': ['John has to attend a meeting at 5 PM.']}


# Output: Step 4

In [13]:
import re

def extract_who_and_when(task):
    """
    Extracts the responsible person (who) and deadline (when) from a task description.
    Uses simple heuristics and regex for 'who' and 'when' extraction.
    """
    # Extract 'who' (simple keyword-based approach)
    who = re.search(r"\b(he|she|they|alex|john|rahul|you)\b", task, re.IGNORECASE)
    who = who.group(0) if who else "Unknown"  # Default to 'Unknown' if not found

    # Extract 'when' (looking for time-related keywords or phrases)
    when = None
    deadlines = re.findall(r"\b(by \d{1,2} (AM|PM)|tomorrow|today|at \d{1,2} (AM|PM)|before \w+)\b", task, re.IGNORECASE)
    if deadlines:
        when = deadlines[0][0]  # Get the first matched deadline phrase

    return who, when

def categorize_tasks_with_details(task_list):
    """
    Categorizes tasks into predefined categories and adds 'who' and 'when' details.
    """
    categories = {
        "Shopping": ["buy", "purchase", "order"],
        "Work/Study": ["complete", "submit", "write", "review", "study"],
        "Household Chores": ["clean", "wash", "organize", "fix", "repair"],
        "Meetings/Appointments": ["schedule", "attend", "meet", "call"]
    }

    categorized_tasks = {}

    for task in task_list:
        # First, categorize the task
        assigned_category = "Uncategorized"  # Default category if no match
        for category, keywords in categories.items():
            if any(keyword in task.lower() for keyword in keywords):
                assigned_category = category
                break  # Stop checking once a category is found

        # Now, extract who is responsible and when the task should be done
        who, when = extract_who_and_when(task)

        # Add to categorized dictionary
        task_details = {
            "task": task,
            "who": who,
            "deadline": when,
            "category": assigned_category
        }

        if assigned_category in categorized_tasks:
            categorized_tasks[assigned_category].append(task_details)
        else:
            categorized_tasks[assigned_category] = [task_details]

    return categorized_tasks

# Example Tasks from Step 2
tasks = [
    "Rahul has to buy snacks for all of us.",
    "She should complete the assignment by tomorrow.",
    "They need to submit the report before Monday.",
    "Alex must clean the kitchen today.",
    "John has to attend a meeting at 5 PM."
]

# Categorize Tasks and Extract Who and When
task_categories = categorize_tasks_with_details(tasks)

# Output the Structured Tasks
import pandas as pd
tasks_df = pd.DataFrame([
    {
        "task": task['task'],
        "who": task['who'],
        "deadline": task['deadline'],
        "category": task['category']
    }
    for category in task_categories.values() for task in category
])

# Display the DataFrame
tasks_df.to_csv('tasks_output.csv', index=False)
print(tasks_df)

                                              task    who       deadline  \
0           Rahul has to buy snacks for all of us.  Rahul           None   
1  She should complete the assignment by tomorrow.    She       tomorrow   
2    They need to submit the report before Monday.   They  before Monday   
3               Alex must clean the kitchen today.   Alex          today   
4            John has to attend a meeting at 5 PM.   John        at 5 PM   

                category  
0               Shopping  
1             Work/Study  
2             Work/Study  
3       Household Chores  
4  Meetings/Appointments  
