In [6]:
# Install the required packages first ("!" is an IPython/Jupyter shell escape,
# so this line only works inside a notebook cell)
!pip install -q nltk spacy gensim scikit-learn

import nltk
import spacy
import re
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import warnings
warnings.filterwarnings('ignore')
# Fetch the English-specific perceptron tagger data. Presumably required by
# newer NLTK releases for pos_tag() -- confirm against the installed version.
# Note that this download runs before the "Downloading NLTK resources..."
# message below is printed.
nltk.download('averaged_perceptron_tagger_eng')  # tagger data for pos_tag()


# Download the remaining NLTK data used by this cell: tokenizer models,
# POS tagger, stop-word lists and WordNet (for WordNetLemmatizer).
print("Downloading NLTK resources...")
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')
# 'punkt_tab' is presumably the tokenizer data expected by newer NLTK
# releases for word_tokenize() -- confirm against the installed version.
nltk.download('punkt_tab')  # tokenizer tables for word_tokenize()
print("NLTK resources downloaded successfully!")

# Download the small English spaCy model that TaskExtractor.__init__ loads via
# spacy.load('en_core_web_sm'). The "!" prefix is an IPython/Jupyter shell
# escape, so this line is only valid inside a notebook cell.
print("Downloading spaCy model...")
!python -m spacy download en_core_web_sm
# Printed unconditionally: nothing here checks whether the command succeeded.
print("spaCy model downloaded successfully!")

class TaskExtractor:
    """Rule-based extractor that turns free text into a list of task records.

    Each record produced by :meth:`extract_tasks` is a dict with the keys
    ``'task'``, ``'assignee'``, ``'deadline'`` and ``'category'``.

    Sentence splitting and entity lookup use the spaCy ``en_core_web_sm``
    pipeline; task detection uses NLTK tokenisation, POS tagging and WordNet
    lemmatisation; deadlines and categories use regular expressions.
    """

    # Deadline patterns, tried in order; the first one that matches wins.
    # Every pattern is anchored on word boundaries so that e.g. "nearby" is
    # not read as "by ..." and "3 amazing" is not read as "3 am".
    _DEADLINE_PATTERNS = tuple(
        re.compile(pattern)
        for pattern in (
            r'\bby\s+(.*?)(?=\.|$)',
            r'\bdue\s+(.*?)(?=\.|$)',
            r'\bbefore\s+(.*?)(?=\.|$)',
            r'\b(?:today|tomorrow|tonight)\b',
            r'\b\d{1,2}(?::\d{2})?\s*(?:am|pm)\b',
            r'\b\d{1,2}/\d{1,2}/\d{2,4}\b',
            r'\bnext\s+(?:monday|tuesday|wednesday|thursday|friday|saturday|sunday)\b',
            r'\b(?:this|next)\s+(?:week|month|year)\b',
        )
    )

    # Category -> keyword stems. Categories are checked in this order and the
    # first category with a matching keyword is returned.
    _CATEGORY_KEYWORDS = {
        'PURCHASE': ['buy', 'purchase', 'order', 'shop'],
        'MAINTENANCE': ['clean', 'fix', 'repair', 'maintain'],
        'COMMUNICATION': ['send', 'email', 'call', 'contact', 'inform'],
        'DEVELOPMENT': ['develop', 'code', 'program', 'implement'],
        'PLANNING': ['schedule', 'plan', 'organize', 'arrange'],
        'REVIEW': ['review', 'check', 'analyze', 'evaluate'],
        'DOCUMENTATION': ['document', 'write', 'report', 'draft'],
        'MEETING': ['meet', 'discuss', 'conference', 'sync'],
    }

    # Phrases that signal future tense.
    _FUTURE_MARKERS = ('will', 'going to')

    def __init__(self):
        """Load the spaCy pipeline and the NLTK helpers used by the extractor."""
        self.nlp = spacy.load('en_core_web_sm')
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

        # Action verbs (base form) that typically indicate tasks
        self.action_verbs = {
            'complete', 'finish', 'submit', 'prepare', 'review', 'create',
            'update', 'send', 'schedule', 'organize', 'buy', 'clean', 'write',
            'implement', 'develop', 'design', 'analyze', 'investigate'
        }

        # Modal verbs and obligation words. These are matched as whole words,
        # so 'needs' is listed explicitly next to 'need'.
        self.modal_words = {
            'must', 'should', 'need', 'needs', 'have to', 'has to', 'needs to',
            'required to', 'supposed to'
        }

    @staticmethod
    def _contains_phrase(text, phrases):
        """Return True if any of *phrases* occurs in *text* as whole word(s).

        Matching is done on word boundaries, so 'will' does not match
        'william' and 'must' does not match 'mustard'. An empty *phrases*
        collection never matches.
        """
        # Longest phrases first so 'needs to' is preferred over 'need'; the
        # secondary key only makes the generated pattern deterministic.
        ordered = sorted(phrases, key=lambda phrase: (-len(phrase), phrase))
        alternatives = '|'.join(re.escape(phrase) for phrase in ordered)
        if not alternatives:
            return False
        return re.search(r'\b(?:' + alternatives + r')\b', text) is not None

    def preprocess_text(self, text):
        """Lower-case *text* and replace punctuation with spaces.

        Periods are kept because sentence splitting and the deadline patterns
        rely on them. ':' and '/' are kept as well: stripping them turned
        "5:30 pm" into "5 30 pm" (extracted as "30 pm") and made dates such
        as "12/05/2024" impossible to match in :meth:`extract_deadline`.
        """
        text = text.lower()
        return re.sub(r'[^\w\s.:/]', ' ', text)

    def extract_sentences(self, text):
        """Split *text* into stripped sentence strings using spaCy."""
        doc = self.nlp(text)
        return [sent.text.strip() for sent in doc.sents]

    def is_task_sentence(self, sentence):
        """Return True if *sentence* looks like a task.

        A sentence qualifies when any of these holds: it contains a modal or
        obligation phrase, it contains one of the known action verbs, it
        contains a future marker ('will', 'going to'), or its first token is
        tagged as a verb (imperative mood).
        """
        lowered = sentence.lower()

        # Check for modal verbs and obligation words (whole-word match)
        if self._contains_phrase(lowered, self.modal_words):
            return True

        # Check for future tense (whole-word match)
        if self._contains_phrase(lowered, self._FUTURE_MARKERS):
            return True

        pos_tags = pos_tag(word_tokenize(lowered))
        if not pos_tags:
            return False

        # Check for imperative mood (sentence starting with a verb)
        if pos_tags[0][1].startswith('VB'):
            return True

        # Check for action verbs. pos='v' is required: the lemmatizer defaults
        # to noun POS, which leaves forms like 'completed' unreduced.
        return any(
            self.lemmatizer.lemmatize(word, pos='v') in self.action_verbs
            for word, tag in pos_tags
            if tag.startswith('VB')
        )

    def extract_entity(self, sentence):
        """Return the likely assignee of *sentence*, or None.

        Prefers the first PERSON/ORG named entity; otherwise falls back to
        the first token whose dependency label is 'nsubj'.
        """
        # NOTE(review): extract_tasks passes lower-cased text here, which may
        # reduce named-entity recall -- confirm whether that is intended.
        doc = self.nlp(sentence)

        # Look for named entities
        people = [ent.text for ent in doc.ents if ent.label_ in ('PERSON', 'ORG')]
        if people:
            return people[0]

        # Fall back to the grammatical subject if no named entities were found
        for token in doc:
            if token.dep_ == 'nsubj':
                return token.text

        return None

    def extract_deadline(self, sentence):
        """Return the first deadline-like phrase found in *sentence*, or None.

        The sentence is lower-cased and the patterns in ``_DEADLINE_PATTERNS``
        are tried in order; the full text of the first match is returned
        (including a leading 'by', 'due' or 'before').
        """
        lowered = sentence.lower()
        for pattern in self._DEADLINE_PATTERNS:
            match = pattern.search(lowered)
            if match:
                return match.group()

        return None

    def categorize_task(self, task):
        """Return the category name for *task*, or 'GENERAL' if none matches.

        A keyword matches when it appears at the start of a word, so 'meet'
        still matches 'meeting' but 'shop' no longer matches 'workshop' and
        'order' no longer matches 'border'.
        """
        task_lower = task.lower()
        for category, keywords in self._CATEGORY_KEYWORDS.items():
            if any(re.search(r'\b' + re.escape(keyword), task_lower)
                   for keyword in keywords):
                return category

        return 'GENERAL'

    def extract_tasks(self, text):
        """Extract task records from *text*.

        Returns a list of dicts, one per sentence accepted by
        :meth:`is_task_sentence`, with the keys 'task' (the cleaned sentence),
        'assignee', 'deadline' and 'category'. 'assignee' and 'deadline' may
        be None.
        """
        # Preprocess text
        cleaned_text = self.preprocess_text(text)

        # Extract sentences using spaCy
        sentences = self.extract_sentences(cleaned_text)

        return [
            {
                'task': sentence.strip(),
                'assignee': self.extract_entity(sentence),
                'deadline': self.extract_deadline(sentence),
                'category': self.categorize_task(sentence),
            }
            for sentence in sentences
            if self.is_task_sentence(sentence)
        ]

# Test the implementation
def main():
    """Run TaskExtractor over a built-in sample paragraph and print each task found."""
    # Sample paragraph mixing plain statements with task-like sentences
    sample_paragraph = """
    Rahul wakes up early every day. He goes to college in the morning and comes back at 3 pm.
    At present, Rahul is outside. He has to buy the snacks for all of us.
    Sarah must complete the project report by Friday.
    The team needs to review the code before deployment tomorrow.
    Please clean the meeting room by 5 pm today.
    John will send the presentation to the client next week.
    Schedule a meeting with the development team for Monday.
    """

    # Build the extractor and run it in one go
    found_tasks = TaskExtractor().extract_tasks(sample_paragraph)

    # Report header
    print("\nExtracted Tasks:")
    print("-" * 50)

    # One numbered section per task; missing fields fall back to a placeholder
    for number, entry in enumerate(found_tasks, 1):
        assignee = entry['assignee'] or 'Not specified'
        deadline = entry['deadline'] or 'Not specified'
        print(f"\nTask {number}:")
        print(f"Description: {entry['task']}")
        print(f"Assignee: {assignee}")
        print(f"Deadline: {deadline}")
        print(f"Category: {entry['category']}")

if __name__ == "__main__":
    main()

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Downloading NLTK resources...
NLTK resources downloaded successfully!
Downloading spaCy model...
Collecting en-core-web-sm==3.7.1
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
⚠ Restart to reload dependencies
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
spaCy model downloaded successfully!

Extracted Tasks:
--------------------------------------------------

Task 1:
Description: he has to buy the snacks for all of us.
Assignee: he
Deadline: Not specified
Category: PURCHASE

Task 2:
Description: sarah must complete the project report by friday.
Assignee: sarah
Deadline: by friday
Category: DOCUMENTATION

Task 3: