In [22]:
# --- Cell 1: Install dependencies (run once) ---
# Note: langgraph and crewai are evolving packages; if some versions conflict,
# restart the runtime and re-run.
!pip install -q crewai llama-cpp-python PyMuPDF langgraph gTTS


In [23]:
# --- Cell 2: Imports and Drive mount ---
import os
import json
import textwrap
from google.colab import files, drive
from IPython.display import Audio, display

# Optional libs that may be used later
import time
from typing import List

# Mount drive (you will be asked to authorize)
drive.mount('/content/drive', force_remount=False)

# Memory file path (update if you like)
MEMORY_FILE = "/content/drive/MyDrive/assistant_memory.json"


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [24]:
# Step 2: Download Mistral model directly to Drive (persistent, no need to re-upload)
!wget -O /content/drive/MyDrive/mistral-7b-instruct.Q4_K_M.gguf https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_M.gguf


--2025-08-12 19:58:05--  https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_M.gguf
Resolving huggingface.co (huggingface.co)... 13.35.202.40, 13.35.202.97, 13.35.202.121, ...
Connecting to huggingface.co (huggingface.co)|13.35.202.40|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs-us-1.hf.co/repos/72/62/726219e98582d16c24a66629a4dec1b0761b91c918e15dea2625b4293c134a92/3e0039fd0273fcbebb49228943b17831aadd55cbcbf56f0af00499be2040ccf9?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27mistral-7b-instruct-v0.2.Q4_K_M.gguf%3B+filename%3D%22mistral-7b-instruct-v0.2.Q4_K_M.gguf%22%3B&Expires=1755031621&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc1NTAzMTYyMX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zLzcyLzYyLzcyNjIxOWU5ODU4MmQxNmMyNGE2NjYyOWE0ZGVjMWIwNzYxYjkxYzkxOGUxNWRlYTI2MjViNDI5M2MxMzRhOTIvM2UwMDM5ZmQwMjczZmNiZWJiNDk

In [25]:
# --- Cell 3: Model init / LLM wrappers ---
# Update this to your model path in Drive if you have a .gguf (Mistral / Llama) file.
MODEL_PATH = "/content/drive/MyDrive/mistral-7b-instruct.Q4_K_M.gguf"

# We'll try to initialize llama-cpp-python if the model exists.
# If not, we fall back to a simple DummyLLM for pipeline testing.
llm_instance = None
use_dummy = False

try:
    from llama_cpp import Llama
    if os.path.exists(MODEL_PATH):
        print("✅ Model file found. Initializing local Llama model (llama-cpp-python).")
        llama_model = Llama(model_path=MODEL_PATH, n_ctx=4096, n_threads=8, use_mlock=False)
    else:
        print("⚠️ Model file not found at", MODEL_PATH)
        print("Falling back to DummyLLM. Put your .gguf in the MODEL_PATH to use the local model.")
        llama_model = None
        use_dummy = True
except Exception as e:
    print("⚠️ Couldn't import or init llama-cpp-python:", e)
    llama_model = None
    use_dummy = True

# CrewAI base LLM wrapper
from crewai.llms.base_llm import BaseLLM

class MyLocalLLM(BaseLLM):
    def __init__(self, llama_model, max_tokens=128):
        self._llama = llama_model
        self.max_tokens = 128

    def call(self, messages: list, **kwargs) -> str:
        # messages: list of {"role": "user"/"assistant", "content": "..."}
        prompt = self.convert_to_mistral_prompt(messages)
        # llama-cpp-python API: call model with prompt kwarg
        out = self._llama(prompt=prompt, max_tokens=self.max_tokens)
        # Depending on the model wrapper, keys may differ; handle safely:
        if isinstance(out, dict):
            # typical structure: {'choices': [{'text': '...'}], ...}
            c = out.get("choices")
            if c and isinstance(c, list) and "text" in c[0]:
                return c[0]["text"].strip()
            # other versions:
            return str(out)
        return str(out)

    def convert_to_mistral_prompt(self, messages: list) -> str:
        # basic conversion for instruction-style models
        prompt = ""
        for msg in messages:
            if msg.get("role") == "user":
                prompt += f"[INST] {msg['content']} [/INST]\n"
            elif msg.get("role") == "assistant":
                prompt += msg['content'] + "\n"
        return prompt

    def supports_stop_words(self) -> bool:
        return False

# Dummy LLM used for testing without model file
class DummyLLM(BaseLLM):
    def call(self, messages: list, **kwargs) -> str:
        # Very simple deterministic "mock" responses for testing pipeline.
        joined = " ".join(m["content"] for m in messages if m.get("role") == "user")
        if "Summarize:" in joined or joined.strip().lower().startswith("summarize"):
            # return first 120 chars as a "summary"
            summary = joined.replace("Summarize:", "").strip()
            return (summary[:240] + "...") if len(summary) > 240 else summary
        if "extract actionable tasks" in joined.lower() or "extract actionable" in joined.lower():
            # create fake tasks by splitting on sentences
            sentences = [s.strip() for s in joined.split(".") if s.strip()]
            bullets = []
            for i, s in enumerate(sentences[:6], start=1):
                bullets.append(f"- Task {i}: {s[:120].strip()}")
            return "\n".join(bullets) if bullets else "No tasks found."
        # fallback
        return "ACK: " + (joined[:200] + "...") if len(joined) > 200 else joined

# Choose actual LLM wrapper
if llama_model is not None:
    llm = MyLocalLLM(llama_model)
else:
    llm = DummyLLM()
    use_dummy = True

print("✅ LLM wrapper ready.", "Using DummyLLM?" , use_dummy)


llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from /content/drive/MyDrive/mistral-7b-instruct.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.2
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096


✅ Model file found. Initializing local Llama model (llama-cpp-python).


llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 32
llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 8
llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
llama_model_loader: - kv  11:                          general.file_type u32              = 15
llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32000]   = ["<unk>", "<s>", "</s>", "<0x

✅ LLM wrapper ready. Using DummyLLM? False


CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
Model metadata: {'tokenizer.chat_template': "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}", 'tokenizer.ggml.add_eos_token': 'false', 'tokenizer.ggml.padding_token_id': '0', 'tokenizer.ggml.unknown_token_id': '0', 'tokenizer.ggml.eos_token_id': '2', 'general.architecture': 'llama', 'llama.rope.freq_base': '1000000.000000', 'llama.context_length': '32768', 'general.name': 'mistralai_mistral-7b-instruct-v0.2', 'tokenizer.ggml.add_bos_token': 'true'

In [26]:
# --- Cell 4: PDF upload & extraction helpers ---
import fitz  # PyMuPDF

def upload_pdf_and_get_filename():
    print("Please upload one or more PDF files (choose in the file dialog).")
    uploaded = files.upload()
    filenames = list(uploaded.keys())
    if len(filenames) == 0:
        raise RuntimeError("No file uploaded.")
    # return the first file name for simplicity; you can extend for many files
    return filenames

def extract_pdf_text(pdf_path):
    try:
        doc = fitz.open(pdf_path)
        text = ""
        for page_num, page in enumerate(doc, start=1):
            page_text = page.get_text()
            text += f"\n--- Page {page_num} ---\n{page_text}"
        return text.strip()
    except Exception as e:
        print("❌ Error reading PDF:", e)
        return ""

def chunk_text(text, max_chars=3000):
    # Use textwrap.wrap to avoid splitting words mid-word
    return textwrap.wrap(text, max_chars, break_long_words=False, break_on_hyphens=False)


In [27]:
# --- Cell 5: Setup persistent memory helpers ---
# Load or initialize the assistant memory JSON
if os.path.exists(MEMORY_FILE):
    with open(MEMORY_FILE, 'r') as f:
        assistant_memory = json.load(f)
        print("✅ Loaded existing memory with", len(assistant_memory.get("tasks", [])), "task blocks.")
else:
    assistant_memory = {"tasks": []}
    print("✅ Initialized fresh memory.")

def save_memory():
    with open(MEMORY_FILE, 'w') as f:
        json.dump(assistant_memory, f, indent=2)
    print("✅ Memory saved to", MEMORY_FILE)


✅ Loaded existing memory with 56 task blocks.


In [28]:
# --- Cell 6: LangGraph flow (summarize -> extract tasks) ---
# LangGraph nodes now require a schema for the state.

from langgraph.graph import StateGraph, START, END
from typing import TypedDict

# Define schema for the data passed between nodes
class WorkflowState(TypedDict):
    chunk_text: str
    summary: str
    tasks_raw: str

# --- Node 1: Summarization ---
def node_summarize(state: WorkflowState) -> dict:
    chunk = state.get("chunk_text", "")
    prompt = f"Summarize the following document section in a concise paragraph (1-3 sentences):\n\n{chunk}"
    out = llm.call([{"role": "user", "content": prompt}])
    return {"summary": out}

# --- Node 2: Task Extraction ---
def node_extract_tasks(state: WorkflowState) -> dict:
    summary = state.get("summary", "")
    prompt = (
        "From the summary below, extract actionable tasks, responsibilities, deadlines (explicit or implicit), "
        "and any follow-up actions. Output as a JSON array of objects with fields: "
        "task, responsibility, deadline, follow_up.\n\n"
        f"Summary:\n{summary}\n\n"
        "If no deadline is mentioned, set deadline to null."
    )
    out = llm.call([{"role": "user", "content": prompt}])
    return {"tasks_raw": out}

# --- Build the StateGraph ---
graph = StateGraph(WorkflowState)
graph.add_node("summarize", node_summarize)
graph.add_node("extract_tasks", node_extract_tasks)

# Use START/END to define flow
graph.add_edge(START, "summarize")
graph.add_edge("summarize", "extract_tasks")
graph.add_edge("extract_tasks", END)

print("✅ LangGraph StateGraph built (summarize -> extract_tasks).")



✅ LangGraph StateGraph built (summarize -> extract_tasks).


In [29]:
# --- Cell 7: Process uploaded PDF(s) and add to memory via the graph ---
# Compile the graph first
app = graph.compile()

# Upload PDFs now (you can upload multiple)
filenames = upload_pdf_and_get_filename()
print("Uploaded files:", filenames)

for pdf_filename in filenames:
    print("\n🔎 Processing:", pdf_filename)
    text = extract_pdf_text(pdf_filename)
    if not text:
        print("No text extracted from", pdf_filename)
        continue

    chunks = chunk_text(text, max_chars=3000)
    print(f" → Extracted {len(chunks)} chunks (max 3000 chars each).")

    for i, chunk in enumerate(chunks):
        state = {
            "chunk_text": chunk,
            "summary": "",
            "tasks_raw": ""
        }
        # Run the compiled graph
        final_state = app.invoke(state)

        summary = final_state.get("summary", "").strip()
        tasks_raw = final_state.get("tasks_raw", "").strip()

        # Store to memory
        assistant_memory["tasks"].append({
            "source_file": pdf_filename,
            "chunk_index": i + 1,
            "summary": summary,
            "tasks_raw": tasks_raw,
            "timestamp": time.time()
        })
        print(f"  ▪ Chunk {i+1}: summary length {len(summary)} chars, raw tasks length {len(tasks_raw)} chars")

# Save memory to Drive
save_memory()


Please upload one or more PDF files (choose in the file dialog).


Saving Activity_ Order, Tasks & Milestones.pdf to Activity_ Order, Tasks & Milestones (2).pdf
Uploaded files: ['Activity_ Order, Tasks & Milestones (2).pdf']

🔎 Processing: Activity_ Order, Tasks & Milestones (2).pdf
 → Extracted 1 chunks (max 3000 chars each).


llama_perf_context_print:        load time =   64646.83 ms
llama_perf_context_print: prompt eval time =   64644.22 ms /   229 tokens (  282.29 ms per token,     3.54 tokens per second)
llama_perf_context_print:        eval time =   72702.60 ms /   127 runs   (  572.46 ms per token,     1.75 tokens per second)
llama_perf_context_print:       total time =  137426.29 ms /   356 tokens
llama_perf_context_print:    graphs reused =        122
Llama.generate: 4 prefix-match hit, remaining 198 prompt tokens to eval
llama_perf_context_print:        load time =   64646.83 ms
llama_perf_context_print: prompt eval time =   55490.15 ms /   198 tokens (  280.25 ms per token,     3.57 tokens per second)
llama_perf_context_print:        eval time =   70709.53 ms /   127 runs   (  556.77 ms per token,     1.80 tokens per second)
llama_perf_context_print:       total time =  126274.24 ms /   325 tokens
llama_perf_context_print:    graphs reused =        122


  ▪ Chunk 1: summary length 636 chars, raw tasks length 431 chars
✅ Memory saved to /content/drive/MyDrive/assistant_memory.json


In [30]:
# --- Cell 8: CrewAI Planner to consolidate / deduplicate tasks in memory ---
# This is a simple hybrid: we create a CrewAI "Planner" task that ingests assistant_memory and returns a consolidated list.
from crewai import Agent, Task, Crew

planner_agent = Agent(
    role="Planner",
    goal="Consolidate all extracted tasks and produce a deduplicated, prioritized task list with responsibilities and deadlines",
    backstory="An assistant planner that consumes extracted tasks and produces a clean master TODO list.",
    llm=llm
)

# Prepare the prompt: include last N memory items (or everything if small)
N = 50
items_to_include = assistant_memory.get("tasks", [])[-N:]
# Build a compact prompt with the summaries + raw tasks
prompt_parts = []
for m in items_to_include:
    prompt_parts.append(
        f"Source: {m.get('source_file')} | chunk {m.get('chunk_index')}\nSummary:\n{m.get('summary')}\nRawTasks:\n{m.get('tasks_raw')}\n---\n"
    )
big_prompt = (
    "You will consolidate the following extracted summaries and raw task outputs from a document ingestion pipeline. "
    "Produce:\n1) A deduplicated bullet list of actionable tasks (short sentences).\n2) For each task, guess a responsibility if mentioned.\n3) For each task, note a deadline if present or null otherwise.\n4) Provide a short rationale (1 sentence) for why tasks were merged (if duplicates were merged).\n\n"
    "Data:\n\n" + "\n".join(prompt_parts)
)

consolidation_task = Task(
    description=big_prompt,
    expected_output="A JSON array of objects: [{task:..., responsibility:..., deadline:..., note:...}, ...]",
    agent=planner_agent
)

crew = Crew(agents=[planner_agent], tasks=[consolidation_task])
print("⏳ Running Planner (may take time depending on model)...")
planner_output = crew.kickoff()

# Display / save the planner output
print("\n--- Planner Output ---\n")
print(planner_output)

# Save as a dedicated JSON file as well
CONSOLIDATED_FILE = "/content/drive/MyDrive/consolidated_tasks_output.json"
try:
    # If planner_output is a list-like or string, attempt to store
    with open(CONSOLIDATED_FILE, "w") as f:
        if isinstance(planner_output, list) or isinstance(planner_output, dict):
            json.dump(planner_output, f, indent=2)
        else:
            f.write(str(planner_output))
    print("✅ Consolidated output saved to", CONSOLIDATED_FILE)
except Exception as e:
    print("⚠️ Could not save consolidated output:", e)


⏳ Running Planner (may take time depending on model)...


Llama.generate: 4 prefix-match hit, remaining 1513 prompt tokens to eval
llama_perf_context_print:        load time =   64646.83 ms
llama_perf_context_print: prompt eval time =  454248.82 ms /  1513 tokens (  300.23 ms per token,     3.33 tokens per second)
llama_perf_context_print:        eval time =   78501.15 ms /   127 runs   (  618.12 ms per token,     1.62 tokens per second)
llama_perf_context_print:       total time =  532828.15 ms /  1640 tokens
llama_perf_context_print:    graphs reused =        122



--- Planner Output ---

Based on the provided data, there are no actionable tasks or tasks with responsibilities, deadlines, or rationale for merging duplicates present in the first 13 chunks. The last chunk, however, contains tasks related to implementing tablets at The Green Bottle Restaurant's South, West, and Downtown locations.

Here is the final answer:

[
  {
    "task": "Identify a vendor for tablets",
    "responsibility": "IT department",
    "deadline": "Explicit, not mentioned in summary",
    "
✅ Consolidated output saved to /content/drive/MyDrive/consolidated_tasks_output.json


In [31]:
# --- Cell 9: Optional - small parser to turn tasks_raw into a bullet list + TTS speak ---
# This is a light attempt at parsing simple bullet lists from tasks_raw entries.
def parse_tasks_raw_to_bullets(tasks_raw_text: str) -> List[str]:
    # naive parsing: split on newline, keep lines that look like bullets or sentences
    lines = [l.strip() for l in tasks_raw_text.splitlines() if l.strip()]
    bullets = []
    for l in lines:
        if l.startswith("-") or l.startswith("*") or l.lower().startswith("task"):
            bullets.append(l.lstrip("-* ").strip())
        else:
            # keep short sentences up to 200 chars
            if len(l) < 250:
                bullets.append(l)
    return bullets

# Show top 6 bullets from memory
sample_bullets = []
for item in assistant_memory.get("tasks", [])[-6:]:
    parsed = parse_tasks_raw_to_bullets(item.get("tasks_raw", ""))
    if parsed:
        sample_bullets.extend(parsed)

print("Sample parsed bullets from recent memory:")
for i, b in enumerate(sample_bullets[:12], start=1):
    print(f"{i}. {b}")

# Optional speak: combine consolidated planner output (string) into TTS
try:
    from gtts import gTTS
    text_to_speak = str(planner_output)[:4000]  # keep under gTTS limits
    tts = gTTS(text_to_speak)
    tts.save("planner_voice.mp3")
    display(Audio("planner_voice.mp3", autoplay=False))
    print("✅ Played planner summary via gTTS (planner_voice.mp3).")
except Exception as e:
    print("⚠️ Could not run TTS:", e)
    print("You can inspect planner_output above.")


Sample parsed bullets from recent memory:
1. [
2. {
3. "task": "Identify a vendor for tablets",
4. "responsibility": "IT department",
5. "deadline": "Explicit, not mentioned in summary",
6. "follow_up": "Once identified, start contract negotiation"
7. },
8. {
9. "task": "Create contract and statement of work",
10. "responsibility": "Legal department",
11. "deadline": "Explicit, not mentioned in summary",
12. "follow_up": "Review and finalize with IT department"


✅ Played planner summary via gTTS (planner_voice.mp3).


In [32]:
# --- Cell 1: Install dependencies (run once) ---
!pip install -q pandas ipywidgets python-dateutil
# optional (only if you want TTS or heavy LLM later)
!pip install -q gTTS

In [12]:
# --- Cell 2: Imports & Drive mount ---
import os, re, time, json
from datetime import datetime
from dateutil import parser as dateparser
from google.colab import files, drive
from IPython.display import display, Markdown, HTML
import pandas as pd
import ipywidgets as widgets

# Mount Drive
drive.mount('/content/drive', force_remount=False)

# Memory file path (feel free to change)
MEMORY_FILE = "/content/drive/MyDrive/assistant_memory.json"

# Ensure memory exists
if os.path.exists(MEMORY_FILE):
    with open(MEMORY_FILE, "r") as f:
        assistant_memory = json.load(f)
else:
    assistant_memory = {"tasks": []}
    with open(MEMORY_FILE, "w") as f:
        json.dump(assistant_memory, f, indent=2)
print("Memory path:", MEMORY_FILE)
print("Loaded tasks:", len(assistant_memory.get("tasks", [])))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Memory path: /content/drive/MyDrive/assistant_memory.json
Loaded tasks: 2


In [33]:
# --- Cell 3: Lightweight NLP helpers (heuristics + safe LLM fallback) ---
# This cell provides functions to parse and enrich tasks.
# They use heuristics (fast) and only call your LLM if available.

# Heuristic priority detection based on keywords
PRIORITY_KEYWORDS = {
    "high": ["urgent", "asap", "immediately", "today", "by end of day", "priority"],
    "medium": ["soon", "this week", "by friday", "by monday", "next week"],
    "low": ["later", "ongoing", "someday", "no rush"]
}

CATEGORY_KEYWORDS = {
    "Engineering": ["deploy", "release", "bug", "fix", "refactor", "api", "endpoint"],
    "Marketing": ["campaign", "launch", "press", "blog", "seo", "ad", "social"],
    "Finance": ["invoice", "budget", "payment", "pay", "transaction", "cost"],
    "Operations": ["onboard", "process", "sop", "schedule", "logistics"],
    "Legal": ["contract", "nda", "compliance", "terms", "policy"],
    "Personal": ["call", "birthday", "personal", "appointment"]
}

def detect_priority(text: str) -> str:
    t = text.lower()
    for p, kws in PRIORITY_KEYWORDS.items():
        for kw in kws:
            if kw in t:
                return p.capitalize()
    return "Medium"  # default

def detect_category(text: str) -> str:
    t = text.lower()
    counts = {}
    for cat, kws in CATEGORY_KEYWORDS.items():
        for kw in kws:
            if kw in t:
                counts[cat] = counts.get(cat, 0) + 1
    if counts:
        # return highest match
        return max(counts.items(), key=lambda x: x[1])[0]
    return "General"

# Deadline extraction: regex for dates and relative phrases
RELATIVE_KEYWORDS = ["today", "tomorrow", "by friday", "by monday", "next week", "this week", "end of day", "eod", "e.o.d"]

def extract_deadline(text: str):
    text_lower = text.lower()
    # direct date patterns (common formats)
    date_patterns = [
        r"\b(on\s)?(jan(uary)?|feb(ruary)?|mar(ch)?|apr(il)?|may|jun(e)?|jul(y)?|aug(ust)?|sep(t)?(ember)?|oct(ober)?|nov(ember)?|dec(ember)?)[\s\-\.]*\d{1,2}(st|nd|rd|th)?(?:,?\s*\d{4})?",
        r"\b\d{1,2}[ /.-]\d{1,2}[ /.-]\d{2,4}\b",  # 12/09/2023 or 12-09-2023
        r"\b\d{4}-\d{1,2}-\d{1,2}\b"  # 2023-09-12
    ]
    for pat in date_patterns:
        m = re.search(pat, text, flags=re.IGNORECASE)
        if m:
            try:
                dt = dateparser.parse(m.group(0), fuzzy=True, dayfirst=False)
                return dt.strftime("%Y-%m-%d")
            except Exception:
                return m.group(0)  # fallback to raw match

    # relative keywords
    for rk in RELATIVE_KEYWORDS:
        if rk in text_lower:
            # try crude mapping (you can improve)
            today = datetime.now()
            if rk == "today":
                return today.strftime("%Y-%m-%d")
            if rk == "tomorrow":
                return (today + pd.Timedelta(days=1)).strftime("%Y-%m-%d")
            if "next week" in rk or rk == "next week":
                return (today + pd.Timedelta(days=7)).strftime("%Y-%m-%d")
            # handle 'end of day' & weekdays heuristically
            if "end of day" in rk or "eod" in rk:
                return today.strftime("%Y-%m-%d")
            # 'by friday' approximate to next friday
            if "friday" in rk:
                # find next Friday
                offset = (4 - today.weekday()) % 7
                if offset == 0:
                    offset = 7
                return (today + pd.Timedelta(days=offset)).strftime("%Y-%m-%d")
    return None

# Normalize raw task text: remove garbage and short-circuit bullets
def normalize_task_text(raw: str) -> str:
    # remove extra whitespace and common prefixes
    s = raw.strip()
    s = re.sub(r"^\-+\s*", "", s)
    s = re.sub(r"\s+", " ", s)
    # short fallback
    return s[:1000]

# Function to enrich a single task dict
def enrich_task(raw_text: str):
    text = normalize_task_text(raw_text)
    priority = detect_priority(text)
    category = detect_category(text)
    deadline = extract_deadline(text)
    return {
        "task": text,
        "priority": priority,
        "category": category,
        "deadline": deadline,
        "status": "Todo",
        "created_at": time.time()
    }


In [34]:
# --- Cell 4: Ingest tasks from assistant_memory (or ad-hoc input) and enrich them ---
# You can either load tasks from previous memory (assistant_memory) or paste new 'tasks_raw' strings.

# Option A: Take tasks from assistant_memory (last N)
N = 50
loaded = assistant_memory.get("tasks", [])[-N:]

# Flatten tasks_raw -> list of candidate lines
candidates = []
for item in loaded:
    raw = item.get("tasks_raw", "") or item.get("summary", "")
    # split by newlines and bullets, fallback to the summary
    lines = [l.strip(" -•\t") for l in re.split(r"[\n\r]+", raw) if l.strip()]
    candidates.extend(lines if lines else [raw])

# Option B: (Uncomment to add ad-hoc manual input)
# candidates.append("Prepare invoice for client X by 15/09/2025 (urgent).")

print(f"Loaded {len(candidates)} candidate task lines from memory.")

# Enrich and dedupe
enriched = [enrich_task(c) for c in candidates if c.strip()]
# Simple dedupe by task text
seen = set()
unique_enriched = []
for e in enriched:
    key = e["task"].lower()
    if key not in seen:
        seen.add(key)
        unique_enriched.append(e)

df = pd.DataFrame(unique_enriched)
if df.empty:
    print("No tasks found in memory. Try processing PDFs first (Day 9).")
else:
    display(df.head(50))


Loaded 63 candidate task lines from memory.


Unnamed: 0,task,priority,category,deadline,status,created_at
0,[,Medium,General,,Todo,1755030000.0
1,{,Medium,General,,Todo,1755030000.0
2,"""task"": ""Identify a vendor for tablets"",",Medium,General,,Todo,1755030000.0
3,"""responsibility"": ""IT department"",",Medium,General,,Todo,1755030000.0
4,"""deadline"": ""Explicit, not mentioned in summary"",",Medium,Marketing,,Todo,1755030000.0
5,"""follow_up"": ""Once identified, start contract ...",Medium,Legal,,Todo,1755030000.0
6,"},",Medium,General,,Todo,1755030000.0
7,"""task"": ""Create contract and statement of work"",",Medium,Legal,,Todo,1755030000.0
8,"""responsibility"": ""Legal department"",",Medium,General,,Todo,1755030000.0
9,"""follow_up"": ""Review and finalize with IT depa...",Medium,General,,Todo,1755030000.0


In [35]:
# --- Cell 5: Interactive checklist UI (ipwidgets) and persistence ---
# This creates a simple table with checkboxes to mark tasks done and buttons to export.

# Create widgets for each row
rows = []
if not df.empty:
    for idx, row in df.iterrows():
        chk = widgets.Checkbox(value=False, description="", indent=False)
        lbl = widgets.HTML(f"<b>{row['task']}</b><br><small>Priority: {row['priority']} | Category: {row['category']} | Deadline: {row['deadline']}</small>")
        rows.append((idx, chk, lbl))

    # Display rows
    box_children = []
    for idx, chk, lbl in rows:
        box_children.append(widgets.HBox([chk, lbl], layout=widgets.Layout(width="100%")))
    tasks_box = widgets.VBox(box_children, layout=widgets.Layout(max_height="400px", overflow="auto", width="100%"))
    display(tasks_box)
else:
    display(Markdown("**No tasks available to display.**"))

# Buttons
save_btn = widgets.Button(description="Save Progress", button_style="success")
export_csv_btn = widgets.Button(description="Export CSV")
export_json_btn = widgets.Button(description="Export JSON")
export_md_btn = widgets.Button(description="Export Markdown")

out = widgets.Output()
display(widgets.HBox([save_btn, export_csv_btn, export_json_btn, export_md_btn]))
display(out)

def gather_state_from_widgets():
    results = []
    for i, chk, lbl in rows:
        task_row = df.loc[i].to_dict()
        task_row["status"] = "Done" if chk.value else "Todo"
        results.append(task_row)
    return results

def on_save_clicked(b):
    results = gather_state_from_widgets()
    # Append to assistant_memory and persist
    for r in results:
        assistant_memory["tasks"].append(r)
    with open(MEMORY_FILE, "w") as f:
        json.dump(assistant_memory, f, indent=2)
    with out:
        out.clear_output()
        print("Saved progress to", MEMORY_FILE)

def on_export_csv(b):
    results = gather_state_from_widgets()
    df_out = pd.DataFrame(results)
    path = "/content/actionboard_tasks_export.csv"
    df_out.to_csv(path, index=False)
    files.download(path)

def on_export_json(b):
    results = gather_state_from_widgets()
    path = "/content/actionboard_tasks_export.json"
    with open(path, "w") as f:
        json.dump(results, f, indent=2)
    files.download(path)

def on_export_md(b):
    results = gather_state_from_widgets()
    md_lines = ["# ActionBoard Tasks\n"]
    for i, r in enumerate(results, start=1):
        dl = r.get("deadline") or "No deadline"
        md_lines.append(f"## {i}. {r['task']}\n- Priority: {r['priority']}\n- Category: {r['category']}\n- Deadline: {dl}\n- Status: {r['status']}\n")
    path = "/content/actionboard_tasks_export.md"
    with open(path, "w") as f:
        f.write("\n".join(md_lines))
    files.download(path)

save_btn.on_click(on_save_clicked)
export_csv_btn.on_click(on_export_csv)
export_json_btn.on_click(on_export_json)
export_md_btn.on_click(on_export_md)


VBox(children=(HBox(children=(Checkbox(value=False, indent=False), HTML(value='<b>[</b><br><small>Priority: Me…

HBox(children=(Button(button_style='success', description='Save Progress', style=ButtonStyle()), Button(descri…

Output()

In [36]:
# --- Cell 6: Quick analytics & one-click improvements suggestions (product value adds) ---
# Show simple counts and suggestions to increase product value
if not df.empty:
    total = len(df)
    high = (df['priority'] == 'High').sum()
    medium = (df['priority'] == 'Medium').sum()
    low = (df['priority'] == 'Low').sum()
    cats = df['category'].value_counts().to_dict()

    display(Markdown(f"**Tasks:** {total}  |  High: {high}  |  Medium: {medium}  |  Low: {low}"))
    display(Markdown("**Top Categories:**"))
    for k,v in cats.items():
        display(Markdown(f"- {k}: {v}"))

    display(Markdown("### Product Improvement Suggestions (quick wins):"))
    display(Markdown(
        "- Add calendar export (.ics) so users can push tasks to Google Calendar.\n"
        "- Allow one-click export to Trello/Notion (paid connectors) — core monetization.\n"
        "- Add automated follow-up reminders for tasks with no deadline (e.g., suggest deadlines).\n"
        "- Provide a weekly email digest (free tier limited to 3 docs / month).\n"
    ))
else:
    display(Markdown("No data to analyze yet. Process PDFs to extract tasks first."))


**Tasks:** 10  |  High: 0  |  Medium: 10  |  Low: 0

**Top Categories:**

- General: 7

- Legal: 2

- Marketing: 1

### Product Improvement Suggestions (quick wins):

- Add calendar export (.ics) so users can push tasks to Google Calendar.
- Allow one-click export to Trello/Notion (paid connectors) — core monetization.
- Add automated follow-up reminders for tasks with no deadline (e.g., suggest deadlines).
- Provide a weekly email digest (free tier limited to 3 docs / month).


In [37]:
# --- Cell 7: Install additional dependencies (run once) ---
!pip install -q icalendar requests pygments


In [41]:
# ==== Cell 8: Export tasks to ICS, Trello, and Notion ====
!pip install ics

import json
from datetime import datetime, timedelta
from ics import Calendar, Event
import requests

# Example: your extracted tasks list
tasks = [
    {"task": "Review project plan", "due_date": "2025-08-15"},
    {"task": "Send report to client", "due_date": "2025-08-17"}
]

# ===== 1. Export to ICS file =====
cal = Calendar()
for task in tasks:
    e = Event()
    e.name = task["task"]
    if task.get("due_date"):
        e.begin = datetime.strptime(task["due_date"], "%Y-%m-%d")
        e.end = e.begin + timedelta(hours=1)
    cal.events.add(e)

ics_filename = "tasks_export.ics"
with open(ics_filename, "w") as f:
    f.writelines(cal)

print(f"ICS file saved as: {ics_filename}")

# Download link in Colab
from google.colab import files
files.download(ics_filename)

# ===== 2. Export to Trello =====
# Get your API key/token: https://trello.com/app-key
trello_key = "YOUR_TRELLO_API_KEY_HERE"
trello_token = "YOUR_TRELLO_TOKEN_HERE"
trello_list_id = "YOUR_TRELLO_LIST_ID_HERE"  # The list ID where cards will be added

for task in tasks:
    trello_url = f"https://api.trello.com/1/cards"
    trello_params = {
        "key": trello_key,
        "token": trello_token,
        "idList": trello_list_id,
        "name": task["task"],
        "due": task.get("due_date", "")
    }
    r = requests.post(trello_url, params=trello_params)
    if r.status_code == 200:
        print(f"✅ Trello card created: {task['task']}")
    else:
        print(f"❌ Failed Trello card: {task['task']} | {r.text}")

# ===== 3. Export to Notion =====
# Get your integration token: https://www.notion.so/my-integrations
notion_token = "YOUR_NOTION_TOKEN_HERE"
notion_database_id = "YOUR_NOTION_DATABASE_ID_HERE"

notion_headers = {
    "Authorization": f"Bearer {notion_token}",
    "Content-Type": "application/json",
    "Notion-Version": "2022-06-28"
}

for task in tasks:
    notion_payload = {
        "parent": {"database_id": notion_database_id},
        "properties": {
            "Name": {
                "title": [{"text": {"content": task["task"]}}]
            },
            "Due Date": {
                "date": {"start": task.get("due_date", datetime.now().strftime("%Y-%m-%d"))}
            }
        }
    }
    r = requests.post("https://api.notion.com/v1/pages", headers=notion_headers, json=notion_payload)
    if r.status_code == 200:
        print(f"✅ Notion page created: {task['task']}")
    else:
        print(f"❌ Failed Notion page: {task['task']} | {r.text}")


ICS file saved as: tasks_export.ics


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Trello card created: Review project plan
✅ Trello card created: Send report to client
❌ Failed Notion page: Review project plan | {"object":"error","status":404,"code":"object_not_found","message":"Could not find database with ID: f507c74f-4752-4feb-b35c-bb54dccbde34. Make sure the relevant pages and databases are shared with your integration.","request_id":"56bef5fc-2205-459c-b8c7-3a1a0b832b4f"}
❌ Failed Notion page: Send report to client | {"object":"error","status":404,"code":"object_not_found","message":"Could not find database with ID: f507c74f-4752-4feb-b35c-bb54dccbde34. Make sure the relevant pages and databases are shared with your integration.","request_id":"66e7803a-245d-408b-b053-5ee2a0db213d"}


In [42]:
# --- Cell 9: UI Buttons to run exports (ICS, Trello, Notion, Suggest deadlines, Weekly digest) ---
# This cell wires everything to simple interactive buttons.

from IPython.display import display
import ipywidgets as widgets

tasks_now = get_current_tasks_from_ui()

btn_ics = widgets.Button(description="Export .ics (Calendar)", button_style="info")
btn_trello = widgets.Button(description="Export to Trello", button_style="primary")
btn_notion = widgets.Button(description="Export to Notion", button_style="primary")
btn_suggest_dl = widgets.Button(description="Suggest Deadlines (for missing)", button_style="warning")
btn_weekly = widgets.Button(description="Generate Weekly Digest", button_style="success")

out2 = widgets.Output()
display(widgets.HBox([btn_ics, btn_suggest_dl, btn_trello, btn_notion, btn_weekly]))
display(out2)

def on_ics(b):
    with out2:
        out2.clear_output()
        tasks = get_current_tasks_from_ui()
        export_to_ics(tasks)
def on_suggest(b):
    with out2:
        out2.clear_output()
        tasks = get_current_tasks_from_ui()
        updated = suggest_deadlines(tasks, default_days=3)
        # Update UI-backed memory as well
        # Replace assistant_memory last N entries for simplicity
        # (safe persistence requires more careful merging)
        print("Suggested deadlines applied to tasks (3 days ahead where missing).")
        display(pd.DataFrame(updated)[:10])
def on_trello(b):
    with out2:
        out2.clear_output()
        tasks = get_current_tasks_from_ui()
        print("Trello export — you will be prompted for Trello key/token if not set.")
        res = export_to_trello(tasks)
        print(res)
def on_notion(b):
    with out2:
        out2.clear_output()
        tasks = get_current_tasks_from_ui()
        print("Notion export — you will be prompted for Notion token and parent id if not set.")
        res = export_to_notion(tasks)
        print(res)
def on_weekly(b):
    with out2:
        out2.clear_output()
        tasks = get_current_tasks_from_ui()
        text = generate_weekly_digest(tasks)
        print(text[:1000])
        print("\nIf you want to email this digest, call send_email_smtp(...) with your SMTP creds.")

btn_ics.on_click(on_ics)
btn_suggest_dl.on_click(on_suggest)
btn_trello.on_click(on_trello)
btn_notion.on_click(on_notion)
btn_weekly.on_click(on_weekly)


NameError: name 'get_current_tasks_from_ui' is not defined