Merge pull request #786 from chrispyles/nbconvert-webpdf

Convert PDF via HTML exporter to nbconvert WebPDF exporter
ucbds-infra · Mar 8, 2024 · 8f6d836
2 parents 122a6f6 + ecc23bb
commit 8f6d836
Show file tree

Hide file tree

Showing 14 changed files with 43 additions and 159 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,8 @@
 
 * Removed compatibility patches for nbconvert < 6 per [#777](https://github.com/ucbds-infra/otter-grader/issues/777)
 * Updated Otter Export to throw an error if nbconvert<6.0.0 is found
+* Converted Otter Export's PDF via HTML exporter to use nbconvert's WebPDF exporter per [#781](https://github.com/ucbds-infra/otter-grader/issues/781)
+* Removed pdfkit from dependencies
 
 **v5.5.0 (unreleased):**
 

diff --git a/docs/_static/grading-environment-r.yml b/docs/_static/grading-environment-r.yml
@@ -35,6 +35,5 @@ dependencies:
       - dill
       - numpy
       - gspread
-      - pypdf
       - otter-grader==5.4.1
       - rpy2
diff --git a/docs/index.rst b/docs/index.rst
@@ -69,9 +69,9 @@ Installation
 ------------
 
 Otter is a Python package that is compatible with Python 3.6+. The PDF export internals require 
-either LaTeX and Pandoc or wkhtmltopdf to be installed. Docker is also required to grade assignments 
-locally with containerization. Otter's Python package can be installed using pip. To install the
-current stable version, install with
+either LaTeX and Pandoc or Playwright and Chromium to be installed. Docker is also required to grade
+assignments locally with containerization. Otter's Python package can be installed using pip. To
+install the current stable version, install with
 
 .. code-block:: console
 

diff --git a/docs/pdfs.rst b/docs/pdfs.rst
@@ -8,8 +8,7 @@ filtering for generating PDFs for manual grading. There are two options for expo
 
 
 * **PDF via LaTeX:** this uses nbconvert, pandoc, and LaTeX to generate PDFs from TeX files
-* **PDF via HTML:** this uses wkhtmltopdf and the Python packages pdfkit and pypdf to generate PDFs 
-  from HTML files
+* **PDF via HTML:** this uses nbconvert's WebPDF exporter to generate PDFs from HTML
 
 Otter Export is used by Otter Assign to generate Gradescope PDF templates and solutions, in the 
 Gradescope autograder to generate the PDFs of notebooks, by ``otter.Notebook`` to generate PDFs and 

diff --git a/otter/assign/__init__.py b/otter/assign/__init__.py
@@ -8,7 +8,7 @@
 from .output import write_output_directories
 from .utils import run_tests, write_otter_config_file, run_generate_autograder
 
-from ..export import export_notebook, WkhtmltopdfNotFoundError
+from ..export import export_notebook
 from ..plugins import PluginCollection
 from ..utils import chdir, get_relpath, knit_rmd_file, loggers
 
@@ -90,26 +90,15 @@ def main(
 
             if not assignment.is_rmd:
                 LOGGER.debug(f"Exporting {src} as notebook to {dst}")
-                try:
-                    LOGGER.debug("Attempting PDF via HTML export")
-                    export_notebook(
-                        src,
-                        dest=dst,
-                        filtering=filtering,
-                        pagebreaks=filtering,
-                        exporter_type="html",
-                    )
-                    LOGGER.debug("PDF via HTML export successful")
-
-                except WkhtmltopdfNotFoundError:
-                    LOGGER.debug("PDF via HTML export failed; attempting PDF via LaTeX export")
-                    export_notebook(
-                        src,
-                        dest=dst,
-                        filtering=filtering,
-                        pagebreaks=filtering,
-                    )
-                    LOGGER.debug("PDF via LaTeX export successful")
+                LOGGER.debug("Attempting PDF via HTML export")
+                export_notebook(
+                    src,
+                    dest=dst,
+                    filtering=filtering,
+                    pagebreaks=filtering,
+                    exporter_type="html",
+                )
+                LOGGER.debug("PDF via HTML export successful")
 
             else:
                 LOGGER.debug(f"Knitting {src} to {dst}")

diff --git a/otter/export/__init__.py b/otter/export/__init__.py
@@ -2,8 +2,6 @@
 
 import os
 
-from .utils import WkhtmltopdfNotFoundError
-
 
 def export_notebook(nb_path, dest=None, exporter_type=None, **kwargs):
     """

diff --git a/otter/export/exporters/__init__.py b/otter/export/exporters/__init__.py
@@ -2,10 +2,9 @@
 
 import shutil
 
+from .via_html import PDFViaHTMLExporter
 from .via_latex import PDFViaLatexExporter
 
-from ..utils import WkhtmltopdfNotFoundError
-
 
 def get_exporter(exporter_type=None):
     """
@@ -18,9 +17,6 @@ def get_exporter(exporter_type=None):
 
     Returns:
         ``otter.export.exporters.base_exporter.BaseExporter``: the exporter class
-
-    Raises:
-        ``WkhtmltopdfNotFoundError``: if PDF via HTML is indicated but wkhtmltopdf is not installed.
     """
     # throw an error if the nbconvert version is < 6
     import nbconvert
@@ -31,10 +27,6 @@ def get_exporter(exporter_type=None):
         exporter_type = exporter_type.lower()
 
         if exporter_type == 'html':
-            if shutil.which("wkhtmltopdf") is None:
-                raise WkhtmltopdfNotFoundError("PDF via HTML indicated but wkhtmltopdf not found")
-
-            from .via_html import PDFViaHTMLExporter
             return PDFViaHTMLExporter
 
         elif exporter_type == "latex":

diff --git a/otter/export/exporters/utils.py b/otter/export/exporters/utils.py
@@ -1,10 +1,6 @@
 """Utilities for Otter Export exporters"""
 
 import re
-import copy
-import nbformat
-
-from ...utils import get_source
 
 
 BEGIN_QUESTION_REGEX = r"<!--\s*BEGIN QUESTION\s*-->"
@@ -73,45 +69,3 @@ def sub_end_for_new_page(line):
         ``str``: the line with the end question match substituted for the newpage comment
     """
     return re.sub(END_QUESTION_REGEX, NEW_PAGE_CELL_SOURCE, line)
-
-
-def notebook_pdf_generator(nb):
-    """
-    A generator that takes in a notebook ``nb`` with HTML comments for filtering and splits this
-    notebook up into each filtered block, yielding a complete notebook for each chunk. Used for 
-    implementing pagebreaks in PDFs via HTML.
-
-    Args:
-        nb (``nbformat.NotebookNode``): the notebook to be exported
-
-    Yields:
-        ``nbformat.NotebookNode``: a complete notebook containing a single filtered block
-    """
-    dummy_nb = copy.copy(nb)
-    dummy_nb.cells = []
-
-    all_cells, subnb_cells = [], []
-    for cell in nb.cells:
-        source = get_source(cell)
-
-        if NEW_PAGE_CELL_SOURCE in "\n".join(source):
-            for i, line in enumerate(source):
-                if NEW_PAGE_CELL_SOURCE in line:
-                    break
-
-            c1, c2 = nbformat.v4.new_markdown_cell(), nbformat.v4.new_markdown_cell()
-            c1.source, c2.source = "\n".join(source[:i+1]), "\n".join(source[i+1:])
-
-            subnb_cells.append(c1)
-            all_cells.append(subnb_cells)
-            subnb_cells = []
-            subnb_cells.append(c2)
-
-        else:
-            subnb_cells.append(cell)
-
-    all_cells.append(subnb_cells)
-
-    for subnb_cells in all_cells:
-        dummy_nb.cells = subnb_cells
-        yield dummy_nb
diff --git a/otter/export/exporters/via_html.py b/otter/export/exporters/via_html.py
@@ -2,21 +2,13 @@
 
 import nbconvert
 import os
-import shutil
-
-from io import BytesIO
 
 from .base_exporter import BaseExporter, TEMPLATE_DIR
-from .utils import notebook_pdf_generator
 
 
 class PDFViaHTMLExporter(BaseExporter):
     """
-    Exports notebooks to PDF files using HTML as an intermediary
-
-    Converts IPython notebooks to PDFs by first converting them into temporary HTML files that are then
-    converted to PDFs using wkhtmltopdf and its Python API pdfkit which are then stitched together (if
-    pagebreaks are enabled) using pypdf.
+    An exporter that uses nbconvert's WebPDF exporter to convert notebooks to PDFs via HTML.
 
     Attributes:
         default_options (``dict``): the default options for this exporter
@@ -30,12 +22,6 @@ class PDFViaHTMLExporter(BaseExporter):
 
     @classmethod
     def convert_notebook(cls, nb_path, dest, **kwargs):
-        if shutil.which("wkhtmltopdf") is None:
-            raise RuntimeError("Cannot export via HTML without wkhtmltopdf")
-
-        import pdfkit
-        from pypdf import PdfMerger
-
         options = cls.default_options.copy()
         options.update(kwargs)
 
@@ -45,32 +31,11 @@ def convert_notebook(cls, nb_path, dest, **kwargs):
         orig_template_name = nbconvert.TemplateExporter.template_name
         nbconvert.TemplateExporter.template_name = options["template"]
 
-        exporter = nbconvert.HTMLExporter()
-
-        if options["save_html"]:
-            html, _ = nbconvert.export(exporter, nb)
-            html_path = os.path.splitext(dest)[0] + ".html"
-            with open(html_path, "wb+") as f:
-                f.write(html.encode("utf-8"))
-
-        merger = PdfMerger()
-        for subnb in notebook_pdf_generator(nb):
-            html, _ = nbconvert.export(exporter, subnb)
-
-            pdfkit_options = {
-                'enable-local-file-access': None, 
-                'quiet': '', 
-                'print-media-type': '', 
-                'javascript-delay': 2000
-            }
-            pdf_contents = pdfkit.from_string(html, False, options=pdfkit_options)
-
-            output = BytesIO()
-            output.write(pdf_contents)
-            output.seek(0)
-
-            merger.append(output, import_outline=False)
+        exporter = nbconvert.WebPDFExporter()
 
-        merger.write(dest)
+        pdf, _ = nbconvert.export(exporter, nb)
+        pdf_path = os.path.splitext(dest)[0] + ".pdf"
+        with open(pdf_path, "wb+") as f:
+            f.write(pdf)
 
         nbconvert.TemplateExporter.template_name = orig_template_name
diff --git a/otter/export/exporters/via_latex.py b/otter/export/exporters/via_latex.py
@@ -8,17 +8,10 @@
 
 from .base_exporter import BaseExporter, ExportFailedException, TEMPLATE_DIR
 
-from ...utils import print_full_width
-
 
 class PDFViaLatexExporter(BaseExporter):
     """
-    Exports notebooks to PDF files using LaTeX as an intermediary
-
-    Converts IPython notebooks to PDFs by first converting them into temporary TeX files that are then
-    converted to PDFs using nbconvert and pandoc. Pagebreaks, if enabled, are enforced with a custom
-    LaTeX template that clears the document to the next odd numbered page, resulting in responses that
-    are all two pages long.
+    An exporter that uses nbconvert's PDF exporter to convert notebooks to PDFs via LaTeX.
 
     Attributes:
         default_options (``dict``): the default options for this exporter

diff --git a/otter/export/utils.py b/otter/export/utils.py
diff --git a/otter/generate/__init__.py b/otter/generate/__init__.py
@@ -72,17 +72,17 @@ def to_dict(self):
 
         pip_deps = self.requirements if self.overwrite_requirements else [
             "datascience",
-            "jupyter_client", 
-            "ipykernel", 
-            "matplotlib", 
-            "pandas", 
-            "ipywidgets", 
-            "scipy", 
-            "seaborn", 
-            "scikit-learn", 
-            "jinja2", 
-            "nbconvert", 
-            "nbformat", 
+            "jupyter_client",
+            "ipykernel",
+            "matplotlib",
+            "pandas",
+            "ipywidgets",
+            "scipy",
+            "seaborn",
+            "scikit-learn",
+            "jinja2",
+            "nbconvert",
+            "nbformat",
             "dill",
             "numpy",
             "gspread",
@@ -125,9 +125,9 @@ def to_str(self):
 }
 
 
-def main(*, tests_dir="./tests", output_path="autograder.zip", config=None, no_config=False, 
-         lang=None, requirements=None, no_requirements=False, overwrite_requirements=False, 
-         environment=None, no_environment=False, username=None, password=None, token=None, files=[], 
+def main(*, tests_dir="./tests", output_path="autograder.zip", config=None, no_config=False,
+         lang=None, requirements=None, no_requirements=False, overwrite_requirements=False,
+         environment=None, no_environment=False, username=None, password=None, token=None, files=[],
          assignment=None, plugin_collection=None, python_version=None, channel_priority_strict=True):
     """
     Run Otter Generate.
@@ -143,7 +143,7 @@ def main(*, tests_dir="./tests", output_path="autograder.zip", config=None, no_c
         overwrite_requirements (``bool``): whether to overwrite the default requirements instead of
             adding to them
         environment (``str``): path to a conda environment file for this assignment
-        no_environment (``bool``): whether ``./environment.yml`` should be automatically checked if 
+        no_environment (``bool``): whether ``./environment.yml`` should be automatically checked if
             ``environment`` is unspecified
         username (``str``): a username for Gradescope for generating a token
         password (``str``): a password for Gradescope for generating a token
@@ -236,7 +236,7 @@ def main(*, tests_dir="./tests", output_path="autograder.zip", config=None, no_c
 
     # open requirements if it exists
     extra_requirements, r_requirements = [], None
-    with load_default_file(requirements, lang_config["requirements_filename"], 
+    with load_default_file(requirements, lang_config["requirements_filename"],
                            default_disabled=no_requirements,) as reqs:
         if reqs is not None:
             if ag_config.lang == "python":

diff --git a/otter/run/run_autograder/utils.py b/otter/run/run_autograder/utils.py
@@ -2,7 +2,7 @@
 
 from contextlib import contextmanager
 from io import StringIO
-from typing import Optional
+from typing import Iterable, Optional
 
 
 _OUTPUT: Optional[StringIO] = None
@@ -30,7 +30,7 @@ def write_blank_page_to_stare_at_before_you(path: str):
 
 
 @contextmanager
-def capture_run_output() -> StringIO:
+def capture_run_output() -> Iterable[StringIO]:
     """
     A context manager for capturing anything that Otter Run would normally print to stdout. Yields
     an ``io.StringIO`` object that the output will be written to.

diff --git a/requirements-export.txt b/requirements-export.txt
@@ -1,3 +1,3 @@
 nbconvert>=6.0.0
-pdfkit
+nbconvert[webpdf]
 pypdf