Part A: Extracting Tasks from Unstructured Text Data

In [23]:
%pip install nltk spacy pandas
!python -m spacy download en_core_web_sm

Note: you may need to restart the kernel to use updated packages.
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     -- ------------------------------------- 0.8/12.8 MB 3.4 MB/s eta 0:00:04
     --- ------------------------------------ 1.0/12.8 MB 3.6 MB/s eta 0:00:04
     ------ --------------------------------- 2.1/12.8 MB 3.1 MB/s eta 0:00:04
     --------- ------------------------------ 3.1/12.8 MB 3.5 MB/s eta 0:00:03
     ----------- ---------------------------- 3.7/12.8 MB 3.5 MB/s eta 0:00:03
     -------------- ------------------------- 4.7/12.8 MB 3.6 MB/s eta 0:00:03
     ------------------ --------------------- 6.0/12.8 MB 4.0 MB/s eta 0:00:02
     ---------------------- ----------------- 7.3/12.8 MB 4.2

In [49]:
import re
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
import gensim
from gensim import corpora, models

# Download NLTK resources (each call is a no-op when the package is already
# present locally).
# 'punkt_tab' is requested alongside 'punkt' because newer NLTK releases load
# the sentence tokenizer used by sent_tokenize from 'punkt_tab'; without it a
# fresh install can fail with a LookupError.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Load SpaCy model
nlp = spacy.load("en_core_web_sm")
# A set gives O(1) membership checks in the stop-word filters below.
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sushi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sushi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [71]:
# Preprocessing: strips punctuation and stop words; surviving words keep their
# original casing so that names remain recognisable downstream.
def preprocess_text(text):
    """Return `text` with punctuation and English stop words removed.

    Every character that is neither a word character nor whitespace is
    deleted, the result is split on whitespace, and tokens whose lowercase
    form is in the module-level `stop_words` set are dropped. The remaining
    tokens are joined with single spaces.
    """
    without_punct = re.sub(r'[^\w\s]', '', text)
    kept_tokens = filter(
        lambda token: token.lower() not in stop_words,
        without_punct.split(),
    )
    return ' '.join(kept_tokens)

# Task Extraction
def extract_tasks(text):
    """Return the sentences of `text` that look like tasks.

    A sentence qualifies when either
      * it contains a modal-tagged token ('MD') and its dependency root is a
        verb, or
      * it contains a "by ... today/tomorrow" deadline phrase.

    Sentences are returned unchanged, in their original order.
    """
    deadline_pattern = r'by\s((?:\d{1,2}\s?(?:am|pm)?\s)?(?:today|tomorrow))'

    def looks_like_task(sentence):
        # Single pass over the parsed tokens to collect both grammatical cues.
        parsed = nlp(sentence)
        modal_seen = False
        root_verb_seen = False
        for tok in parsed:
            if tok.tag_ == 'MD':
                modal_seen = True
            if tok.pos_ == 'VERB' and tok.dep_ == 'ROOT':
                root_verb_seen = True

        if modal_seen and root_verb_seen:
            return True
        # Otherwise an explicit deadline phrase is enough on its own.
        return re.search(deadline_pattern, sentence, re.IGNORECASE) is not None

    return [sentence for sentence in sent_tokenize(text) if looks_like_task(sentence)]

# Entity & Deadline Extraction
def extract_entity_deadline(task_sentence):
    """Return `(entity, deadline)` for one task sentence.

    entity   -- text of the first PERSON named entity; if there is none, the
                text of the first token whose dependency label is 'nsubj' or
                'nsubjpass'; otherwise None.
    deadline -- the phrase captured after "by" (e.g. "5 pm today",
                "tomorrow"), stripped of surrounding whitespace, or None when
                no such phrase is present.
    """
    parsed = nlp(task_sentence)

    # Prefer a recognised person; fall back to the grammatical subject.
    who = next((span.text for span in parsed.ents if span.label_ == 'PERSON'), None)
    if who is None:
        who = next(
            (tok.text for tok in parsed if tok.dep_ in ('nsubj', 'nsubjpass')),
            None,
        )

    found = re.search(
        r'by\s((?:\d{1,2}\s?(?:am|pm)?\s)?(?:today|tomorrow))',
        task_sentence,
        re.IGNORECASE,
    )
    if found:
        deadline = found.group(1).strip()
    else:
        deadline = None

    return who, deadline

# Categorization
def categorize_tasks(task_sentences, num_topics=2, passes=20, random_state=42, top_n=10):
    """Group task sentences into topics with LDA and return each topic's top words.

    Parameters
    ----------
    task_sentences : list of str
        Sentences to model.
    num_topics : int, default 2
        Number of LDA topics (categories) to fit.
    passes : int, default 20
        Number of training passes over the corpus.
    random_state : int or None, default 42
        Seed forwarded to the LDA model so repeated runs give the same
        categories. Pass None to leave the model unseeded.
    top_n : int, default 10
        Number of highest-weighted words to report per topic.

    Returns
    -------
    dict
        Maps "Category_<idx>" to the list of that topic's top words. Returns
        an empty dict when no usable words remain after preprocessing.
    """
    # Preprocess without lowercasing to preserve entity names
    texts = [
        [word for word in preprocess_text(sent).split()
         if word.lower() not in stop_words and len(word) > 2]
        for sent in task_sentences
    ]

    # LDA cannot be trained on an empty vocabulary; return "no categories"
    # instead of letting the model constructor raise.
    if not any(texts):
        return {}

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    lda_model = models.LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=passes,
        random_state=random_state,
    )

    # Read (word, weight) pairs directly rather than parsing the formatted
    # 'weight*"word" + ...' strings produced by print_topics().
    return {
        f"Category_{idx}": [word for word, _weight in lda_model.show_topic(idx, topn=top_n)]
        for idx in range(lda_model.num_topics)
    }


In [73]:
# Example usage: six sample sentences run through the full Part A pipeline.
input_text = """
Rahul should clean the room by 5 pm today.
John needs to submit the report by tomorrow.
Please review the document by 10 am today.
The team must finalize the proposal by 3 pm today.
Sarah has to prepare the presentation slides.
Alex needs to email the client by 10 am tomorrow.
"""

# Step 1: keep only the sentences that look like tasks.
tasks = extract_tasks(input_text)

# Step 2: attach the responsible entity and the deadline to each task.
structured_tasks = []
for task in tasks:
    entity, deadline = extract_entity_deadline(task)
    structured_tasks.append(dict(task=task.strip(), entity=entity, deadline=deadline))

# Step 3: topic-model the task sentences into categories.
categories = categorize_tasks(tasks)

print("Structured Tasks:", structured_tasks)
print("Task Categories:", categories)

Structured Tasks: [{'task': 'Rahul should clean the room by 5 pm today.', 'entity': 'Rahul', 'deadline': '5 pm today'}, {'task': 'John needs to submit the report by tomorrow.', 'entity': 'John', 'deadline': 'tomorrow'}, {'task': 'Please review the document by 10 am today.', 'entity': None, 'deadline': '10 am today'}, {'task': 'The team must finalize the proposal by 3 pm today.', 'entity': 'team', 'deadline': '3 pm today'}, {'task': 'Alex needs to email the client by 10 am tomorrow.', 'entity': 'Alex', 'deadline': '10 am tomorrow'}]
Task Categories: {'Category_0': ['needs', 'tomorrow', 'report', 'John', 'Alex', 'submit', 'client', 'email', 'clean', 'room'], 'Category_1': ['today', 'proposal', 'must', 'finalize', 'team', 'document', 'review', 'Please', 'Rahul', 'room']}


Part B: Customer Review Classification

In [27]:
import os

def load_reviews(directory):
    """Read every regular file in `directory` as UTF-8 text.

    Parameters
    ----------
    directory : str
        Path of the folder holding one review per file.

    Returns
    -------
    list of str
        File contents, ordered by file name. Sorting makes the corpus order
        (and therefore any later seeded split) reproducible, because
        os.listdir returns entries in arbitrary order.

    Entries that are not regular files (e.g. sub-directories) are skipped
    rather than passed to open(), which would raise.
    """
    reviews = []
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        if not os.path.isfile(path):
            continue
        with open(path, 'r', encoding='utf-8') as file:
            reviews.append(file.read())
    return reviews

# Load positive and negative reviews.
# The dataset root must be a raw string: in a normal literal "\U" starts a
# Unicode escape (a SyntaxError for this path), and the "\t" / "\n" in
# "\train", "\test" and "\neg" would turn into tab / newline characters.
# Adjust DATA_DIR to wherever the aclImdb archive was extracted.
DATA_DIR = r"C:\Users\sushi\OneDrive\Desktop\aclImdb"

train_pos = load_reviews(os.path.join(DATA_DIR, "train", "pos"))
train_neg = load_reviews(os.path.join(DATA_DIR, "train", "neg"))
test_pos = load_reviews(os.path.join(DATA_DIR, "test", "pos"))
test_neg = load_reviews(os.path.join(DATA_DIR, "test", "neg"))

In [29]:
from nltk.corpus import stopwords
import re

from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


# NOTE: this redefines the Part A function of the same name. Re-running
# categorize_tasks after this cell will pick up this lowercasing version.
def preprocess_text(text):
    """Normalise one review for vectorisation.

    Removes every character that is not an ASCII letter or whitespace (so
    punctuation and digits go), lowercases the text, drops English stop
    words, and returns the remaining words joined by single spaces.

    The stop-word list is fetched once and kept as a set. The original
    re-evaluated stopwords.words('english') and scanned the resulting list
    for every single word of every review.
    """
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove punctuation and digits
    text = text.lower()  # Convert to lowercase
    stop_set = _english_stopwords()
    words = [word for word in text.split() if word not in stop_set]
    return ' '.join(words)

# Replace each raw training review with its cleaned form (same list names).
train_pos = list(map(preprocess_text, train_pos))
train_neg = list(map(preprocess_text, train_neg))

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams, vocabulary capped at 5000 features.
# Positive reviews come first, then negative ones; the labels follow that order.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_train = vectorizer.fit_transform([*train_pos, *train_neg])
y_train = [1 for _ in train_pos] + [0 for _ in train_neg]  # 1 for positive, 0 for negative

In [41]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Rebuild the full feature matrix and label list from their sources instead of
# splitting X_train / y_train in place. The original overwrote its own inputs,
# so re-running this cell split the already-reduced 80% subset again.
# Transforming with the fitted vectorizer is intended to reproduce the matrix
# fit_transform built earlier, leaving the first-run split unchanged.
features_all = vectorizer.transform(train_pos + train_neg)
labels_all = [1] * len(train_pos) + [0] * len(train_neg)  # 1 for positive, 0 for negative

# NOTE: X_test / y_test are a 20% hold-out of the *training* reviews; the
# test_pos / test_neg sets loaded earlier are not used by the cells shown here.
X_train, X_test, y_train, y_test = train_test_split(
    features_all, labels_all, test_size=0.2, random_state=42
)
model = LogisticRegression()
model.fit(X_train, y_train)

In [43]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Score the fitted model on the hold-out split and report three metrics,
# each rounded to two decimals, one per line.
y_pred = model.predict(X_test)
print("\n".join(
    f"{label}: {scorer(y_test, y_pred):.2f}"
    for label, scorer in (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
    )
))

Accuracy: 0.88
Precision: 0.88
Recall: 0.89


In [45]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Majority-vote ensemble of a logistic regression and a default SVC.
# This rebinds `model`, replacing the classifier fitted in the earlier cell.
# NOTE(review): with two voters under hard voting, disagreements are ties --
# confirm the library's tie-breaking rule is acceptable here.
model = VotingClassifier(
    voting='hard',
    estimators=[
        ('lr', LogisticRegression()),
        ('svc', SVC()),
    ],
)
model.fit(X_train, y_train)

In [47]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Same hold-out evaluation as before, now for the voting ensemble held in `model`.
y_pred = model.predict(X_test)
metric_lines = [
    f"Accuracy: {accuracy_score(y_test, y_pred):.2f}",
    f"Precision: {precision_score(y_test, y_pred):.2f}",
    f"Recall: {recall_score(y_test, y_pred):.2f}",
]
print(*metric_lines, sep="\n")

Accuracy: 0.88
Precision: 0.88
Recall: 0.88
