Merge pull request #7 from IEEE-SB-Passau/parser_config

Introduce IMAGE_PROCESS_PARSER config option
pelican-plugins · Jun 12, 2016 · 5f319ce · 5f319ce
2 parents 7b699e2 + 6521d14
commit 5f319ce
Show file tree

Hide file tree

Showing 2 changed files with 16 additions and 1 deletion.
diff --git a/Readme.rst b/Readme.rst
@@ -438,6 +438,20 @@ file.
    IMAGE_PROCESS_FORCE = True
 
 
+Selecting an HTML parser
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+You may select the HTML parser which is used. The default is the built-in
+``html.parser``, but you may also select ``html5lib`` or ``lxml`` by setting
+``IMAGE_PROCESS_PARSER`` in your Pelican configuration file, e.g.:
+
+.. code-block:: python
+
+   IMAGE_PROCESS_PARSER = "html5lib"
+
+For details, refer to the `BeautifulSoup documentation on parsers
+<https://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser>`_.
+
 Credits
 -------
 

diff --git a/image_process.py b/image_process.py
@@ -184,7 +184,8 @@ def harvest_images(path, context):
 
 
 def harvest_images_in_fragment(fragment, settings):
-    soup = BeautifulSoup(fragment, 'html.parser')
+    parser = settings.get("IMAGE_PROCESS_PARSER", "html.parser")
+    soup = BeautifulSoup(fragment, parser)
 
     for img in soup.find_all('img', class_=IMAGE_PROCESS_REGEX):
         for c in img['class']: