# Notebook Testing Harness

This notebook executes all Jupyter notebooks in the project recursively and reports their completion status.

It serves as a testing harness to verify that all notebooks run successfully without errors.

In [1]:
import os
import glob
import nbformat
from nbconvert.preprocessors import ExecutePreprocessor
from datetime import datetime
import sys
import traceback
from pathlib import Path
import json
from tqdm.notebook import tqdm

## Configuration

In [2]:
KERNEL_NAME = "python3"
KERNEL_OVERRIDES = {"1.open-information-extraction.ipynb": "ch5-spacy"}
EXCLUDE_DIRS = [".ipynb_checkpoints", "__pycache__"]
EXPORT_RESULTS = True
RESULTS_FILE = ""

## Utility Functions

In [3]:
def format_time(seconds):
    if seconds < 60:
        formatted = f"{seconds:.2f}s"
    elif seconds < 3600:
        formatted =  f"{seconds / 60:.2f}m"
    else:
        formatted =  f"{seconds / 3600:.2f}h"
    return formatted

def export_results(results, filename="test_results.json"):
    if results:
        with open(filename, 'w') as f:
            json.dump({"timestamp": datetime.now().isoformat(),
                       "results": results["details"],
                       "summary": results["summary"]}, f, indent=2)
        print(f"📄 Results exported to {filename}")

In [4]:
def get_notebook_files(root_dir=".", exclude_dirs=None, excluded_notebooks=None):
    """
    Recursively get all .ipynb files in the specified directory and its subdirectories,
    excluding specific directories and notebooks.
    
    Args:
        root_dir (str): Root directory to search from
        exclude_dirs (list): List of directory names to exclude
        exclude_notebooks (list): List of notebook filenames to exclude
        
    Returns:
        list: Sorted list of notebook file paths
    """
    if exclude_dirs is None:
        exclude_dirs = []
    
    if excluded_notebooks is None:
        excluded_notebooks = ["bonus.related-terms-from-documents.ipynb"
                              "bonus.phrase-detection.ipynb",
                              "bonus.phrase-detection.ipynb",
                              "bonus.related-terms-from-documents.ipynb",
                              "a.defunct.synthesize-search-sessions.ipynb",
                              "a.synthesize-search-sessions.ipynb",
                              "a.generate-movie-embeddings.ipynb",
                              "welcome.ipynb",
                              "playground_tues.ipynb",
                              "aips-test-suite.ipynb"]
        
    notebook_files = []
    
    for directory in os.walk(root_dir):
        for path in Path(directory[0]).rglob("*.ipynb"):
            # Skip excluded directories
            if any(exclude_dir in str(path) for exclude_dir in exclude_dirs):
                continue
            if path.name in excluded_notebooks:
                print(f"Skipping {path.name}")
                continue
            if str(path) not in notebook_files:
                notebook_files.append(str(path))
    
    notebook_files = sorted(notebook_files)
    return list(map(Path, notebook_files))

In [5]:
def execute_notebook(notebook_path, timeout=600, kernel_name="python3"):
    """
    Execute a notebook and return success status, execution time, and any error message.
    
    Args:
        notebook_path (str): Path to the notebook file
        timeout (int): Execution timeout in seconds
        kernel_name (str): Name of the kernel to use
        
    Returns:
        tuple: (success, execution_time, error)
    """
    start_time = datetime.now()
    
    try:
        # Read the notebook
        with open(notebook_path, 'r', encoding='utf-8') as f:
            nb = nbformat.read(f, as_version=4)
        
        # Create executor
        ep = ExecutePreprocessor(timeout=timeout, kernel_name=kernel_name, )
        
        # Execute the notebook
        ep.preprocess(nb, {'metadata': {'path': os.path.dirname(notebook_path)}})
        
        execution_time = (datetime.now() - start_time).total_seconds()
        return True, execution_time, None
    
    except Exception as e:
        execution_time = (datetime.now() - start_time).total_seconds()
        error_type = type(e).__name__
        error_msg = str(e)
        
        # Get traceback for detailed error information
        tb = traceback.format_exc()
        
        return False, execution_time, {
            'type': error_type,
            'message': error_msg,
            'traceback': tb
        }

## Main Testing Function

In [6]:
def run_test_harness(exclude_dirs=None, exclude_notebooks=None, stop_on_failure=False, chapter_to_run=None):
    """
    Run the testing harness on all notebooks recursively.
    
    Args:
        root_dir (str): Root directory to search from
        timeout (int): Execution timeout in seconds
        kernel_name (str): Name of the kernel to use
        exclude_dirs (list): List of directory names to exclude
        exclude_notebooks (list): List of notebook filenames to exclude
        
    Returns:
        dict: Test results
    """
    root_dir = "."
    timeout = 600 
    print(f"🔍 Notebook Testing Harness - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"📁 Testing from root directory: {os.path.abspath(root_dir)}")
    print("=" * 80)

    notebook_files = get_notebook_files(root_dir, exclude_dirs, exclude_notebooks)

    if chapter_to_run:        
        print(chapter_to_run)
        print(str(notebook_files[0]))
        notebook_files = [f for f in notebook_files if (chapter_to_run in str(f))]

    print(f"📋 Found {len(notebook_files)} notebook(s) to test.")
    for nb_file in notebook_files:
        # Make paths relative to root_dir for cleaner display
        rel_path = os.path.relpath(str(nb_file), root_dir)
        print(f"   • {rel_path}")
    print()
    
    # Results tracking
    results = {
        'details': [],
        'summary': {
            'total': len(notebook_files),
            'successful': 0,
            'failed': 0,
            'total_time': 0
        }
    }
    
    # Execute each notebook with progress bar
    print("🚀 Executing notebooks:")
    for notebook_path in tqdm(notebook_files, desc="Progress", colour="purple"):
        rel_path = os.path.relpath(str(notebook_path), root_dir)
        print(f"\n📔 Testing: {rel_path}")
        if "checkpoint" in str(notebook_path):
            print(f"\n📔 Skipping: {rel_path}")
            continue
        kernel_name = KERNEL_OVERRIDES.get(notebook_path.name, "python3")
        success, execution_time, error = execute_notebook(str(notebook_path), timeout, kernel_name)
        
        results['summary']['total_time'] += execution_time
        
        if success:
            print(f"   ✅ SUCCESS - Completed in {format_time(execution_time)}")
            results['summary']['successful'] += 1
        else:
            print(f"   ❌ FAILED - Error after {format_time(execution_time)}")
            print(f"      Error: {error['type']}: {error['message']}")
            results['summary']['failed'] += 1
        
        results['details'].append({
            'notebook': rel_path,
            'success': success,
            'execution_time': execution_time,
            'error': error
        })
        if not success and stop_on_failure:
            print("Terminating test run due to test failure.")
            break
    
    # Print summary
    print("\n" + "=" * 80)
    print(f"📊 SUMMARY:")
    print(f"   Total notebooks tested: {results['summary']['total']}")
    print(f"   ✅ Successful: {results['summary']['successful']}")
    print(f"   ❌ Failed: {results['summary']['failed']}")
    
    success_rate = (results['summary']['successful'] / results['summary']['total']) * 100
    print(f"   Success rate: {success_rate:.1f}%")
    print(f"   Total execution time: {format_time(results['summary']['total_time'])}")
    
    if results['summary']['failed'] > 0:
        print(f"\n❌ FAILED NOTEBOOKS:")
        for result in results['details']:
            if not result['success']:
                print(f"   • {result['notebook']}: {result['error']['type']}: {result['error']['message']}")
    
    print(f"\n🏁 Testing completed at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    
    if EXPORT_RESULTS and results:
        export_results(results, "AIPS_test_resulsts.json")
    
    return results

## Run the Testing Harness

In [7]:
results = run_test_harness(exclude_dirs=EXCLUDE_DIRS, stop_on_failure=True, chapter_to_run="ch06")

🔍 Notebook Testing Harness - 2025-07-21 16:24:53
📁 Testing from root directory: /home/jovyan
Skipping playground_tues.ipynb
Skipping welcome.ipynb
Skipping a.generate-movie-embeddings.ipynb
Skipping bonus.related-terms-from-documents.ipynb
Skipping bonus.phrase-detection.ipynb
Skipping a.synthesize-search-sessions.ipynb
Skipping a.defunct.synthesize-search-sessions.ipynb
Skipping playground_tues.ipynb
Skipping playground_tues.ipynb
Skipping playground_tues.ipynb
Skipping welcome.ipynb
Skipping a.generate-movie-embeddings.ipynb
Skipping bonus.related-terms-from-documents.ipynb
Skipping bonus.phrase-detection.ipynb
Skipping a.synthesize-search-sessions.ipynb
Skipping a.defunct.synthesize-search-sessions.ipynb
Skipping a.generate-movie-embeddings.ipynb
Skipping bonus.related-terms-from-documents.ipynb
Skipping bonus.phrase-detection.ipynb
Skipping a.synthesize-search-sessions.ipynb
Skipping a.defunct.synthesize-search-sessions.ipynb
ch06
chapters/ch03/1.vectors-and-text-similarity.ipynb
📋

Progress:   0%|          | 0/3 [00:00<?, ?it/s]


📔 Testing: chapters/ch06/1.skg-classification-disambiguation.ipynb
   ✅ SUCCESS - Completed in 11.20s

📔 Testing: chapters/ch06/2.related-keywords-from-signals.ipynb
   ❌ FAILED - Error after 1.55m
      Error: CellExecutionError: An error occurred while executing the following cell:
------------------
query = """
SELECT k1.keyword AS k1, k2.keyword AS k2, SUM(p1) n_users1, sum(p2) n_users2,
SUM(p1 + p2) AS users_cooc, COUNT(1) n_products FROM (
  SELECT keyword, product, COUNT(1) AS p1 FROM keyword_click_product
  GROUP BY keyword, product) AS k1 JOIN (
  SELECT keyword, product, COUNT(1) AS p2 FROM keyword_click_product
  GROUP BY keyword, product) AS k2 ON k1.product = k2.product
WHERE k1.keyword > k2.keyword GROUP BY k1.keyword, k2.keyword"""
spark.sql(query).createOrReplaceTempView("keyword_click_product_cooc")
print_keyword_pair_data()
------------------

[0;31m---------------------------------------------------------------------------[0m
[0;31mPy4JJavaError[0m              

## Export Results (Optional)