# 🚀 Direct Upload Ingestion - GraphRecall

Use this notebook if you already have the processed `.zip` file from a previous run and just need to send it to the backend (e.g., after a runtime disconnect during ingestion).

### **Steps:**
1.  **Configure API Keys:** Set your backend URL, User ID, and (optionally) an access token.
2.  **Upload ZIP:** Upload the `processed_book.zip` file directly to this runtime; it is extracted automatically.
3.  **Ingest:** The script sends the extracted markdown and images to your backend.

In [None]:
# @title 1. Configuration & Setup
import os
import time
import requests
import zipfile
from google.colab import files

# --- CONFIGURATION ---
# The trailing `# @param` markers are Colab form-field annotations; keep them on the same line as the value.
# Base URL of the backend; the ingestion cell posts to {BACKEND_URL}/api/v2/ingest.
BACKEND_URL = "https://graphrecall-backend.onrender.com" # @param {type:"string"}
# NOTE(review): USER_ID is not referenced by any cell visible in this notebook — confirm whether the backend still needs it.
USER_ID = "default_user" # @param {type:"string"}
# Optional bearer token; when left empty, no Authorization header is sent with the request.
ACCESS_TOKEN = "" # @param {type:"string"}

# Echo the target so a wrong URL is noticed before anything is uploaded.
print(f"‚úÖ Configured for backend: {BACKEND_URL}")

In [None]:
# @title 2. Upload Processed ZIP File
import shutil

print("Please upload your 'processed_book.zip' file...")
uploaded = files.upload()
# The upload result may be empty (e.g. the dialog was dismissed); fail with a
# clear message instead of a bare StopIteration from next(iter(...)).
if not uploaded:
    raise RuntimeError("No file was uploaded. Re-run this cell and select your processed ZIP file.")
# Only the first uploaded file is used as the archive to ingest.
zip_filename = next(iter(uploaded))
print(f"‚úÖ Uploaded: {zip_filename}")

# Unzip
extract_path = "./content/processed_book"
# Start from an empty directory so markdown/images left over from an earlier
# upload cannot be picked up by the ingestion step.
shutil.rmtree(extract_path, ignore_errors=True)
os.makedirs(extract_path, exist_ok=True)
with zipfile.ZipFile(zip_filename, "r") as zip_ref:
    # Directory entries end with "/" in the archive listing; count real files only.
    members = [name for name in zip_ref.namelist() if not name.endswith("/")]
    zip_ref.extractall(extract_path)

print(f"‚úÖ Extracted {len(members)} files to: {extract_path}")

In [None]:
# @title 3. Send to Backend (Batched Ingestion)

import json
import base64
import glob
from pathlib import Path

extract_root = Path(extract_path)

# Locate the markdown to ingest: prefer any full_text.md (at any depth inside
# the extracted tree); otherwise fall back to every *.md file found.
md_candidates = sorted(extract_root.rglob("full_text.md")) or sorted(extract_root.rglob("*.md"))

if not md_candidates:
    # Nothing usable: list a sample of what was extracted to help diagnose the ZIP layout.
    print("‚ùå Error: No markdown file found in extracted zip!")
    print("Extracted files preview:")
    preview_entries = sorted(extract_root.rglob("*"))[:50]
    for entry in preview_entries:
        if not entry.is_file():
            continue
        print(f" - {entry}")
    raise FileNotFoundError("No markdown file found under extracted path")

# The first candidate in sorted order is the one that gets ingested.
md_path = md_candidates[0]
content = md_path.read_text(encoding="utf-8")
print(f"üìÑ Loaded markdown from {md_path} ({len(content)} chars)")

# Helper to enforce base64 string format
def file_to_base64(path):
    """Return the contents of the file at ``path`` as base64 text.

    The file is read in binary mode and the base64 bytes are decoded to a
    ``str`` so the result can be embedded directly in a JSON payload.
    """
    with open(path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return str(encoded, "utf-8")

# Find all images in the extracted folder (including nested folders)
# Extensions are compared case-insensitively so files such as "FIG1.PNG" are
# not silently skipped on a case-sensitive filesystem. The listing is sorted
# so the processing order (and which duplicate wins) is deterministic.
image_extensions = {".png", ".jpg", ".jpeg"}
image_map = {}
image_paths = sorted(
    candidate
    for candidate in glob.glob(f"{extract_path}/**/*", recursive=True)
    if os.path.isfile(candidate)
    and os.path.splitext(candidate)[1].lower() in image_extensions
)

print(f"üì∑ Found {len(image_paths)} images to process...")

# Convert images to base64 map
# The map is keyed by bare file name, so same-named images in different
# folders collide; report the collision instead of dropping one silently.
for path in image_paths:
    image_name = os.path.basename(path)
    if image_name in image_map:
        print(f"WARNING: duplicate image name '{image_name}'; {path} replaces the earlier entry")
    image_map[image_name] = file_to_base64(path)

print(f"‚úÖ Processed {len(image_map)} images.")

# Prepare payload
# The title is the uploaded archive's file name without its extension.
payload = {
    "content": content,
    "images": image_map,
    "title": os.path.splitext(os.path.basename(zip_filename))[0],
}

# Send request with longer timeout
print(f"üöÄ Sending ingestion request to {BACKEND_URL}/api/v2/ingest...")
start_time = time.time()

# Only attach the Authorization header when a token was configured.
headers = {"Content-Type": "application/json"}
if ACCESS_TOKEN:
    headers["Authorization"] = f"Bearer {ACCESS_TOKEN}"

try:
    response = requests.post(
        f"{BACKEND_URL}/api/v2/ingest",
        json=payload,
        timeout=600,  # 10 minute timeout for large batches
        headers=headers,
    )
    # Turn 4xx/5xx answers into an HTTPError handled below.
    response.raise_for_status()
    result = response.json()
except requests.exceptions.Timeout:
    print("\n‚ö†Ô∏è Request Timed Out (Client Side)")
    print("The backend is likely still processing your request. Check the logs on your backend dashboard.")
except requests.exceptions.RequestException as e:
    print(f"\n‚ùå Request Failed: {e}")
    # A requests.Response is falsy for 4xx/5xx statuses, so a plain truthiness
    # test would hide exactly the error details we want; compare with None.
    failed_response = getattr(e, "response", None)
    if failed_response is not None:
        print(f"Status Code: {failed_response.status_code}")
        print(f"Response: {failed_response.text}")
else:
    elapsed = time.time() - start_time
    thread_id = result.get('thread_id')
    print(f"\n‚úÖ Ingestion Started! (Took {elapsed:.2f}s to submit)")
    print(f"Thread ID: {thread_id}")
    print(f"Note ID: {result.get('note_id')}")
    print(f"Status: {result.get('status')}")
    # `or []` also covers a key that is present but null in the response.
    print(f"Concepts: {len(result.get('concept_ids') or [])}")
    print(f"Flashcards: {len(result.get('flashcard_ids') or [])}")
    if thread_id:
        print(f"Status Check URL: {BACKEND_URL}/api/v2/ingest/{thread_id}/status")