version 0.1

vlunot · Aug 17, 2018 · 0469341 · 0469341
commit 0469341
Show file tree

Hide file tree

Showing 18 changed files with 702 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,15 @@
+# Compiled python modules.
+*.pyc
+
+# Setuptools distribution folder.
+/dist/
+
+# Python egg metadata, regenerated from source files by setuptools.
+/*.egg-info
+/*.egg
+
+# Jupyter lab checkpoints.
+.ipynb_checkpoints
+
+# Pytest cache
+.pytest_cache
diff --git a/LICENSE.txt b/LICENSE.txt
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2018 Vincent Lunot
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1,2 @@
+include README.md
+include LICENSE.txt
diff --git a/README.md b/README.md
@@ -0,0 +1,56 @@
+# nb2hugo
+
+*nb2hugo* is a simple way to convert a Jupyter notebook to a Hugo markdown page.
+
+
+## Motivation
+
+Jupyter Notebook is a great way to create a single document that contains code that can be executed, formatted text to provide detailed explanations, as well as figures. Hugo is a simple yet very powerful static site generator. While a few solutions to convert a Jupyter notebook to a Hugo markdown with front matter already exist, *nb2hugo* put an emphasis on getting a result that looks similar to the original Jupyter notebook.
+
+
+## Installation
+
+Using pip:
+```
+pip install nb2hugo
+```
+
+
+## Usage
+
+In your python notebook, start by using one or more markdown cells that will contain the front matter information. Next, add an html comment as a front matter divider: everything in the notebook before the End Of Front Matter divider `<!--eofm-->` will be the front matter. This approach is similar to the one used for [content summaries](https://gohugo.io/content-management/summaries/).  
+A markdown title before the `<!--eofm-->` divider will automatically become the front matter title. You can also provide other front matter fields by writting pairs of "key: value" on different lines.  
+Below is an example of a notebook markdown cell that will become a front matter:
+
+```text
+# My notebook title
+
+Date: 2018-06-01  
+Author: firstname lastname  
+Categories: category1, category2  
+Tags: tag1, tag2, tag3  
+<!--eofm-->
+```
+
+All content after the `<!--eofm-->` divider will be considered as normal notebook content.
+
+Once you have finished writing your notebook, you can convert it using the following command:
+
+```bash
+nb2hugo notebook_file --site-dir hugo_website_directory --section content_section
+```
+
+
+## Author
+
+**Vincent Lunot** - *Initial work*
+
+
+## License
+
+This project is licensed under the MIT License - see the [LICENSE.txt](LICENSE.txt) file for details.
+
+
+## Acknowledgements
+
+*nb2hugo* is based on [nbconvert](https://github.com/jupyter/nbconvert)
diff --git a/bin/nb2hugo b/bin/nb2hugo
@@ -0,0 +1,27 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import argparse
+import os
+from nb2hugo.writer import HugoWriter
+
+
+def parse_arguments():
+    """Define and parse the script arguments."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument('notebook', help='Jupyter notebook filename')
+    parser.add_argument('--site-dir', required=True, 
+                        help='path to hugo site directory')
+    parser.add_argument('--section', required=True, 
+                        help='content section where to create markdown')
+    args = parser.parse_args()
+    _, ext = os.path.splitext(os.path.basename(args.notebook))
+    if ext != '.ipynb':
+        parser.error('Notebook is expected to have a .ipynb extension.')
+    return args.notebook, args.site_dir, args.section
+
+
+if __name__ == '__main__':
+    notebook, site_dir, section = parse_arguments()
+    writer = HugoWriter()
+    writer.convert(notebook, site_dir, section)
diff --git a/nb2hugo/__init__.py b/nb2hugo/__init__.py
diff --git a/nb2hugo/exporter.py b/nb2hugo/exporter.py
@@ -0,0 +1,18 @@
+from nbconvert.exporters import MarkdownExporter
+from traitlets import List
+from .preprocessors import (FrontMatterPreprocessor, FixLatexPreprocessor,
+                            ImagesPreprocessor, RawPreprocessor)
+
+class HugoExporter(MarkdownExporter):
+    """Export a Jupyter notebook to a pair of markdown and resources
+    compatible with Hugo.
+    """
+
+    preprocessors = List([
+            FrontMatterPreprocessor,
+            FixLatexPreprocessor,
+            RawPreprocessor,
+            ImagesPreprocessor,
+        ],
+        help="""List of preprocessors, by name or namespace, to enable."""
+    ).tag(config=True)
diff --git a/nb2hugo/preprocessors/__init__.py b/nb2hugo/preprocessors/__init__.py
@@ -0,0 +1,4 @@
+from .fixlatex import FixLatexPreprocessor
+from .frontmatter import FrontMatterPreprocessor
+from .images import ImagesPreprocessor
+from .raw import RawPreprocessor
diff --git a/nb2hugo/preprocessors/fixlatex.py b/nb2hugo/preprocessors/fixlatex.py
@@ -0,0 +1,59 @@
+from nbconvert.preprocessors import Preprocessor
+import re
+
+
+class FixLatexPreprocessor(Preprocessor):
+    """Preprocess the notebook markdown cells:
+    - convert $$ ... $$ to \\[ ... \\],
+    - convert $ ... $ to \\( ... \\),
+    - double escape underscores inside latex content.
+    
+    See https://gohugo.io/content-management/formats/#issues-with-markdown
+    for some issues with latex.
+    """
+
+    def preprocess_cell(self, cell, resources, index):
+        """Preprocess a notebook cell."""
+        if cell.cell_type == "markdown":
+            cell.source = self._replace_latex_enclosing_dollars(cell.source)
+            cell.source = self._fix_latex_antislash(cell.source)
+        return cell, resources
+
+    def _replace_latex_enclosing_dollars(self, text):
+        """Convert LaTeX $ ... $ or $$ ... $$ expressions to respectively 
+        \\( ... \\) and \\[ ... \\].
+        """
+        single_dollar_latex = r'(?<![\\\$])\$(?!\$)(.+?)(?<![\\\$])\$(?!\$)'
+        to_parentheses = lambda m: r'\\(' + m.group(1) + r'\\)'
+        no_single_dollar = re.sub(single_dollar_latex, to_parentheses, text,
+                                  flags=re.S)
+        double_dollar_latex = r'\$\$(.+?)\$\$'
+        to_brackets = lambda m: r'\\[' + m.group(1) + r'\\]'
+        no_single_or_double_dollar = re.sub(double_dollar_latex, to_brackets, 
+                                            no_single_dollar, flags=re.S)
+        return no_single_or_double_dollar
+
+    def _fix_latex_escaped_underscores(self, text):
+        """Replace '\_' by '\\_' inside LaTeX expressions delimited by
+        \\[ ... \\] or \\( ... \\)."""
+        inline_math = r'\\\((.+?)\\\)'
+        display_math = r'\\\[(.+?)\\\]'
+        double_escape = lambda m: re.sub(r'(?<!\\)\\_', r'\\\\_', m.group(0))
+        new_text = re.sub(inline_math, double_escape, text, flags=re.S)
+        new_text = re.sub(display_math, double_escape, new_text, flags=re.S)
+        return new_text
+
+    def _fix_latex_antislash(self, text):
+        """Replace '\\' by '\\\\\\' and '\' by '\\' inside LaTeX expressions 
+        delimited by \\[ ... \\] or \\( ... \\)."""
+        inline_math = r'\\\\\((.+?)\\\\\)'
+        display_math = r'\\\\\[(.+?)\\\\\]'
+        multiple_escape = lambda m: r'\\(' + re.sub(r'\\\\', r'\\\\\\\\\\', m.group(1)) + r'\\)'
+        new_text = re.sub(inline_math, multiple_escape, text, flags=re.S)
+        multiple_escape = lambda m: r'\\[' + re.sub(r'\\\\', r'\\\\\\\\\\', m.group(1)) + r'\\]'
+        new_text = re.sub(display_math, multiple_escape, new_text, flags=re.S)
+        double_escape = lambda m: r'\\(' + re.sub(r'(?<!\\)\\(?!\\)', r'\\\\', m.group(1)) + r'\\)'
+        new_text = re.sub(inline_math, double_escape, new_text, flags=re.S)
+        double_escape = lambda m: r'\\[' + re.sub(r'(?<!\\)\\(?!\\)', r'\\\\', m.group(1)) + r'\\]'
+        new_text = re.sub(display_math, double_escape, new_text, flags=re.S)
+        return new_text
diff --git a/nb2hugo/preprocessors/frontmatter.py b/nb2hugo/preprocessors/frontmatter.py
@@ -0,0 +1,80 @@
+from nbconvert.preprocessors import Preprocessor
+from nbformat import notebooknode
+import warnings
+
+
+class FrontMatterPreprocessor(Preprocessor):
+    """Preprocess the notebook front matter:
+    - all the markdown cells before a <!--eofm--> divider
+    will be considered as part of the front matter and transformed 
+    into a unique cell containing a toml front matter
+    - all raw cells or code cells before the <!--eofm--> divider will
+    be removed
+    - all cells after the <!--eofm--> divider will be kept as is.
+    """
+
+    def preprocess(self, nb, resources):
+        """Execute the preprocessing of the notebook."""
+        frontmatter, content_cells = self._split_frontmatter(nb)   
+        nb.cells = []
+        if frontmatter:
+            toml_fm = self._toml_frontmatter(frontmatter)
+            nb.cells.append(self._markdown_cell(toml_fm))
+        nb.cells += content_cells
+        return nb, resources
+
+    def _split_frontmatter(self, nb):
+        """Return a pair whose first element is a string containing
+        the frontmatter and whose second element is a list of all the
+        content cells."""
+        frontmatter = ''
+        for index, cell in enumerate(nb.cells):
+            if cell.cell_type == "markdown":
+                split = cell.source.split('<!--eofm-->', 1)
+                if len(split) > 1: # eofm divider is in the cell
+                    fm_part, content_part = split
+                    frontmatter += fm_part
+                    if content_part.strip():
+                        first_content_cell = [self._markdown_cell(content_part)]
+                    else: 
+                        first_content_cell = []
+                    return frontmatter, first_content_cell+nb.cells[index + 1:]
+                else:
+                    # the entire cell content is part of the front matter
+                    frontmatter += cell.source
+        warnings.warn('Notebook does not have a front matter.')
+        return '', nb.cells
+
+    def _toml_frontmatter(self, nb_fm):
+        """Convert a notebook front matter into a toml front matter.
+        
+        Example:
+        >>> toml_frontmatter('# Notebook title\nDate: 2018-06-10')
+        '+++\ntitle = "Notebook title"\ndate = "2018-06-10"\n+++\n'
+        """
+        toml_fm = '+++\n'
+        for line in nb_fm.split('\n'):
+            stripped = line.strip()
+            if stripped:
+                if stripped.startswith('# '): # The line contains the title
+                    toml_fm += 'title = "' + stripped[2:].strip() + '"\n'
+                else: # The line is expected to contain a field "key: value0, value1, ..."
+                    s = stripped.split(':', 1)
+                    if len(s) < 2:
+                        warnings.warn(f'This content is not formatted correctly and is ignored: {stripped}')
+                        continue
+                    key, values = s
+                    key = key.lower()
+                    values = [value.strip() for value in values.split(',')]
+                    if len(values) > 1: # The field has multiple values (e.g. multiple tags)
+                        toml_fm += key + ' = [' + ', '.join([f'"{value.strip()}"' for value in values]) + ']\n'
+                    else: # The field has a single value (e.g. date)
+                        toml_fm += f'{key} = "{values[0]}"\n'
+        toml_fm += '+++\n'
+        return toml_fm
+
+    def _markdown_cell(self, source):
+        """Create a markdown cell with source content."""
+        return notebooknode.from_dict({"cell_type": "markdown",
+                                       "metadata": {},
+                                       "source": source})
diff --git a/nb2hugo/preprocessors/images.py b/nb2hugo/preprocessors/images.py
@@ -0,0 +1,38 @@
+from nbconvert.preprocessors import Preprocessor
+import re
+import os
+import urllib.request
+
+
+class ImagesPreprocessor(Preprocessor):
+    """Preprocess the notebook markdown cells:
+    - copy images to static directory,
+    - update image link.
+    """
+
+    def preprocess(self, nb, resources):
+        """Preprocess the entire notebook."""
+        if not 'images_path' in resources:
+            resources['images_path'] = {}    
+        for index, cell in enumerate(nb.cells):
+            nb.cells[index], resources = self.preprocess_cell(cell, resources, index)
+        return nb, resources
+
+    def preprocess_cell(self, cell, resources, index):
+        """Preprocess one cell."""
+        if cell.cell_type == "markdown":
+            # Find and process links:
+            process_match = lambda m: self._process_image_link(m.group(1), m.group(2), resources)
+            cell.source = re.sub('!\[([^"]*?)\]\(([^"]+?)\)', process_match, cell.source)
+        return cell, resources
+
+    def _process_image_link(self, alt_text, url, resources):
+        """Copy image and return updated link."""
+        url_as_path = os.path.join(resources['metadata']['path'], url)
+        if os.path.isfile(url_as_path):
+            filename = os.path.basename(url)
+            resources['images_path'][filename] = url_as_path
+            link = '![' + alt_text + '](' + filename + ')'
+        else:
+            link = '![' + alt_text + '](' + url + ')'
+        return link
diff --git a/nb2hugo/preprocessors/raw.py b/nb2hugo/preprocessors/raw.py
@@ -0,0 +1,10 @@
+from nbconvert.preprocessors import Preprocessor
+
+class RawPreprocessor(Preprocessor):
+    """Preprocess the notebook raw cells and convert them into plain text."""
+
+    def preprocess_cell(self, cell, resources, index):
+        """Preprocess a notebook cell."""
+        if cell.cell_type == "raw":
+            cell.source = '```\n' + cell.source + '\n```'
+        return cell, resources
diff --git a/nb2hugo/writer.py b/nb2hugo/writer.py
@@ -0,0 +1,51 @@
+import os
+import shutil
+from .exporter import HugoExporter
+
+
+class HugoWriter:
+    """A configurable writer to create Hugo markdown from a Jupyter notebook."""
+
+    def __init__(self, config=None):
+        self._exporter = HugoExporter(config)
+
+    def convert(self, notebook, site_dir, section):
+        """Convert a Jupyter notebook into a Hugo markdown and write 
+        the result in the content section of the site located in site_dir.
+        """
+        (markdown, resources) = self._exporter.from_filename(notebook)
+        self._write_resources_images(resources, site_dir, section)
+        self._write_markdown(markdown, resources, site_dir, section)
+
+    def _write_resources_images(self, resources, site_dir, section):
+        """Process resources to create output images in static directory."""
+        name = resources['metadata']['name']
+        target_dir = os.path.join(site_dir, 'static', section, name)
+        if 'outputs' in resources:
+            if resources['outputs']:
+                os.makedirs(target_dir, exist_ok=True)
+            for key, value in resources['outputs'].items():
+                target = os.path.join(target_dir, key)     
+                with open(target, 'wb') as f:
+                    f.write(value)
+                    shortname = '/'.join(target.split('/')[-3:])
+                    print(f"Created '{shortname}'")
+        if 'images_path' in resources:
+            if resources['images_path']:
+                os.makedirs(target_dir, exist_ok=True)
+            for key, value in resources['images_path'].items():
+                target = os.path.join(target_dir, key)     
+                shutil.copy2(value, target)
+                shortname = '/'.join(target.split('/')[-3:])
+                print(f"Created '{shortname}'")
+
+    def _write_markdown(self, markdown, resources, site_dir, section):
+        """Save markdown to file."""
+        name = resources['metadata']['name']
+        target_dir = os.path.join(site_dir, 'content', section)
+        os.makedirs(target_dir, exist_ok=True)
+        target = os.path.join(target_dir, f'{name}.md')
+        with open(target, 'w') as f:
+            f.write(markdown)
+            shortname = '/'.join(target.split('/')[-2:])
+            print(f"Created '{shortname}'")