Permalink
Browse files

Initial import

- All scripts
- Readme, with license and instructions
  • Loading branch information...
0 parents commit 2f720fe234049038d852f4ac5c9bcc7bdbffba3c @weierophinney committed Jul 19, 2011
@@ -0,0 +1,76 @@
+DocBook5 Migration Tools
+========================
+
+This repository groups together a number of bash and PHP scripts I've used in
+order to convert DocBook 4 source files to DocBook 5. For a comprehensive
+writeup of the motivations and rationale behind the various choices made here,
+please read:
+
+* http://weierophinney.net/matthew/archives/264-Converting-DocBook4-to-DocBook5.html
+
+Requirements
+------------
+
+* bash or compatible shell
+* xsltproc
+* The db4-upgrade.xsl stylesheet; typically, installing libxml and/or xsltproc
+ will provide this
+* PHP >= 5.3.0, with the DOM extension enabled
+
+Usage
+-----
+
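+To convert a single file, pass its path as the only argument. Failure details
+are written to stderr, so redirect it into the pipe to capture them in the log:
+
+    prompt> path/to/docbook5-migration/bin/upgradeDocbook path/to/file.xml 2>&1 | tee -a error.log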
+
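+To convert a tree of XML files in bulk, pass the directory to search (it
+defaults to the current working directory when omitted):
+
+    prompt> path/to/docbook5-migration/bin/upgradeDocbookBulk path/to/xml/dir 2>&1 | tee -a error.log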
+
+You can then grep the error.log for the word "FAIL" to see where failures
+occurred, and what caused them.
+
+Configuration
+-------------
+
+If you want to skip certain files, edit the `bin/upgradeDocbook` file, and
+update the "SKIPFILES" variable in it. Files listed in this string should not
+include any path information.
+
+Within this same file, you may provide alternate locations for such items as the
+`db4-upgrade.xsl` stylesheet and potentially the various PHP scripts invoked for
+transforming the converted files.
+
+Disclaimer
+----------
+
+These scripts *will* overwrite your files; make sure your files are under
+version control or that you have backups before using!
+
+License
+-------
+
+Copyright (c) 2011, Matthew Weier O'Phinney
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+Redistributions of source code must retain the above copyright notice, this list
+of conditions and the following disclaimer.
+
+Redistributions in binary form must reproduce the above copyright notice, this
+list of conditions and the following disclaimer in the documentation and/or
+other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,45 @@
+<?php
+// File: docbook-programlistings.php
+//
+// Rewrites an XML file in place so that the text of every <programlisting>
+// element is held in a single CDATA section.
+//
+// Usage:       php docbook-programlistings.php <xml file>
+// Exit status: 0 on success (including "nothing to change"), 1 on any error.
+
+// DOM notices are normal; report only warnings and above
+ini_set('display_errors', true);
+error_reporting(E_ALL ^ E_NOTICE);
+
+if ($argc < 2) {
+    fwrite(STDERR, "Missing file argument\n");
+    exit(1);
+}
+
+$file = $argv[1];
+// is_file() rather than file_exists(): a directory is not a usable argument
+if (!is_file($file)) {
+    fwrite(STDERR, "Argument passed is not a file\n");
+    exit(1);
+}
+
+$doc = new DOMDocument();
+$doc->xmlVersion = "1.0";
+$doc->encoding = "utf-8";
+// The property is spelled "preserveWhiteSpace"; property names are
+// case-sensitive, so any other spelling only creates an unused dynamic property.
+$doc->preserveWhiteSpace = true;
+$doc->formatOutput = true;
+
+if (!$doc->load($file)) {
+    fwrite(STDERR, "$file: UNABLE TO LOAD FILE!\n");
+    exit(1);
+}
+
+$changed = false;
+foreach ($doc->getElementsByTagName('programlisting') as $node) {
+    // textContent is the concatenated text of the element; decode any entity
+    // references that are still present in that text as literal characters.
+    $content = $node->textContent;
+    $content = html_entity_decode($content, ENT_QUOTES, 'UTF-8');
+
+    // Empty the element, then re-add its text as one CDATA section
+    $node->textContent = '';
+    $node->nodeValue = '';
+    $cdata = $doc->createCDATASection($content);
+    $node->appendChild($cdata);
+    $changed = true;
+}
+
+// No <programlisting> elements: leave the file untouched
+if (!$changed) {
+    exit(0);
+}
+
+// DOMDocument::save() returns false on failure; report it so that the calling
+// script does not treat a failed write as a successful conversion.
+if ($doc->save($file) === false) {
+    fwrite(STDERR, "$file: UNABLE TO SAVE FILE!\n");
+    exit(1);
+}
@@ -0,0 +1,38 @@
+<?php
+// File: docbook-replace-entities.php
+//
+// Prepares an XML file for XSLT processing:
+//   1. moves a DOCTYPE declaration that has an internal subset ("<!DOCTYPE
+//      ... ]>") out of the file and into "<file>.entities";
+//   2. rewrites every named entity reference "&name;" as the token
+//      "[amp]name;".
+//
+// Usage:       php docbook-replace-entities.php <xml file>
+// Exit status: 0 on success (including "nothing to change"), 1 on any error.
+if ($argc < 2) {
+    fwrite(STDERR, "Missing file argument\n");
+    exit(1);
+}
+
+$file = $argv[1];
+// is_file() rather than file_exists(): a directory is not a usable argument
+if (!is_file($file)) {
+    fwrite(STDERR, "Argument passed is not a file\n");
+    exit(1);
+}
+
+$xml = file_get_contents($file);
+if ($xml === false) {
+    fwrite(STDERR, "$file: UNABLE TO READ FILE!\n");
+    exit(1);
+}
+
+// Check if we have a doctype, and, if so, place it in a separate file and
+// strip it from this one
+$transformed = preg_replace_callback(
+    '#(<!(DOCTYPE .*?)(]>))#s',
+    function ($matches) use ($file) {
+        $filename = $file . '.entities';
+        if (file_put_contents($filename, $matches[1]) === false) {
+            fwrite(STDERR, "$filename: UNABLE TO WRITE ENTITIES FILE!\n");
+            exit(1);
+        }
+        return '';
+    },
+    $xml
+);
+
+// The preg_* functions return null on a PCRE error (e.g. backtrack limit).
+// Without this check the null would be written back as an empty file.
+if ($transformed === null) {
+    fwrite(STDERR, "$file: UNABLE TO EXTRACT DOCTYPE!\n");
+    exit(1);
+}
+
+// Replace all entities with tokenized versions
+$transformed = preg_replace('/\&([a-zA-Z][a-zA-Z0-9._-]+;)/', '[amp]$1', $transformed);
+if ($transformed === null) {
+    fwrite(STDERR, "$file: UNABLE TO REPLACE ENTITIES!\n");
+    exit(1);
+}
+
+// If no transformations have been made, exit early
+if ($transformed === $xml) {
+    exit(0);
+}
+
+// Write the changes back to the file
+if (file_put_contents($file, $transformed) === false) {
+    fwrite(STDERR, "$file: UNABLE TO WRITE FILE!\n");
+    exit(1);
+}
@@ -0,0 +1,43 @@
+<?php
+// File: docbook-restore-entities.php
+//
+// Reverses the tokenization of entity references in an XML file:
+//   1. rewrites every "[amp]name;" token back to "&name;";
+//   2. if "<file>.entities" exists, re-inserts its content (directly after
+//      the XML declaration when there is one, otherwise at the top of the
+//      file) and then deletes the ".entities" file.
+//
+// Usage:       php docbook-restore-entities.php <xml file>
+// Exit status: 0 on success (including "nothing to change"), 1 on any error.
+if ($argc < 2) {
+    fwrite(STDERR, "Missing file argument\n");
+    exit(1);
+}
+
+$file = $argv[1];
+// is_file() rather than file_exists(): a directory is not a usable argument
+if (!is_file($file)) {
+    fwrite(STDERR, "Argument passed is not a file\n");
+    exit(1);
+}
+
+$xml = file_get_contents($file);
+if ($xml === false) {
+    fwrite(STDERR, "$file: UNABLE TO READ FILE!\n");
+    exit(1);
+}
+
+// Restore tokens with actual entities
+$transformed = preg_replace('/\[amp\]([a-zA-Z][a-zA-Z0-9._-]+;)/', '&$1', $xml);
+if ($transformed === null) {
+    fwrite(STDERR, "$file: UNABLE TO RESTORE ENTITIES!\n");
+    exit(1);
+}
+
+// Check if we have an entities file
+$entitiesFile = $file . '.entities';
+$hasEntitiesFile = file_exists($entitiesFile);
+if ($hasEntitiesFile) {
+    // If so, insert the entities
+    $entities = file_get_contents($entitiesFile);
+    if ($entities === false) {
+        fwrite(STDERR, "$entitiesFile: UNABLE TO READ ENTITIES FILE!\n");
+        exit(1);
+    }
+
+    $declaration = '#^(<\?xml[^?]*\?>)#';
+    if (preg_match($declaration, $transformed)) {
+        // If the file has an opening XML declaration, put the DOCTYPE/entities
+        // following it. A callback is used so the stored text is inserted
+        // literally: as a plain preg_replace() replacement string, any "$1" or
+        // "\1" inside entity values would be expanded as a backreference.
+        $transformed = preg_replace_callback(
+            $declaration,
+            function ($matches) use ($entities) {
+                return $matches[1] . "\n" . $entities;
+            },
+            $transformed,
+            1
+        );
+        if ($transformed === null) {
+            fwrite(STDERR, "$file: UNABLE TO INSERT DOCTYPE!\n");
+            exit(1);
+        }
+    } else {
+        // Otherwise, just prepend them
+        $transformed = $entities . "\n" . $transformed;
+    }
+}
+
+// Write changes to disk, unless no transformations have been made
+if ($transformed !== $xml) {
+    if (file_put_contents($file, $transformed) === false) {
+        fwrite(STDERR, "$file: UNABLE TO WRITE FILE!\n");
+        exit(1);
+    }
+}
+
+// Remove entities file when done. This happens only after the result has been
+// written, so a failed write cannot lose the saved DOCTYPE.
+if ($hasEntitiesFile) {
+    unlink($entitiesFile);
+}
@@ -0,0 +1,18 @@
+<?php
+// Ensures that an XML file starts with an XML declaration: when the content
+// does not begin with "<?xml", a version 1.0 / utf-8 declaration is prepended
+// and the file is rewritten in place. Files that already begin with "<?xml"
+// are left untouched.
+//
+// Usage:       php <this script> <xml file>
+// Exit status: 0 on success (including "nothing to change"), 1 on any error.
+if ($argc < 2) {
+    fwrite(STDERR, "Missing file argument\n");
+    exit(1);
+}
+
+$file = $argv[1];
+// is_file() rather than file_exists(): a directory is not a usable argument
+if (!is_file($file)) {
+    fwrite(STDERR, "Argument passed is not a file\n");
+    exit(1);
+}
+
+$xml = file_get_contents($file);
+// A failed read returns false; without this check the file would be replaced
+// by a bare XML declaration.
+if ($xml === false) {
+    fwrite(STDERR, "$file: UNABLE TO READ FILE!\n");
+    exit(1);
+}
+
+if (0 !== strpos($xml, '<?xml')) {
+    $xml = '<?xml version="1.0" encoding="utf-8"?>' . "\n" . $xml;
+    if (file_put_contents($file, $xml) === false) {
+        fwrite(STDERR, "$file: UNABLE TO WRITE FILE!\n");
+        exit(1);
+    }
+}
@@ -0,0 +1,107 @@
+#!/bin/bash
+# File: upgradeDocbook
+# vim: ft=sh
+if [ "$#" -ne 1 ];then
+ echo "USAGE: $0 <xml file>"
+ exit 1
+fi
+
+XMLFILE=$1
+
+# Customize the following based on your system
+
+# Location of the db4-upgrade.xsl
+UPGRADE_XSL=/usr/share/xml/docbook/stylesheet/docbook5/db4-upgrade.xsl
+
+# Location of the PHP script for replacing entities
+ENT_REPLACE_SCRIPT=`dirname $0`/docbook-replace-entities.php
+
+# Location of the PHP script for restoring entities
+ENT_RESTORE_SCRIPT=`dirname $0`/docbook-restore-entities.php
+
+# Location of the PHP script for adding XML declarations
+XML_INTRO_SCRIPT=`dirname $0`/docbook-xml-intro.php
+
+# Location of the PHP script for processing programlistings
+XML_PL_SCRIPT=`dirname $0`/docbook-programlistings.php
+
+# Provide a space-separated list of files that should be skipped
+SKIPFILES="language-snippets.xml"
+
+# Begin
+
+echo "Processing $XMLFILE"
+
+# Check if this is a skipfile
+for i in $SKIPFILES;do
+ if [[ `basename "$XMLFILE"` = $i ]];then
+ echo " Skipping; file is in skip list"
+ exit
+ fi
+done
+
+# Copy the file to a temporary location
+WORKFILE="$XMLFILE.transform"
+cp $XMLFILE $WORKFILE
+
+# Replace entities
+printf "%-64s" " Replacing XML entities..."
+php $ENT_REPLACE_SCRIPT $WORKFILE
+if [ "$?" -ne "0" ];then
+ printf " %7s\n" "[FAIL]"
+ echo " FAILED: Replacing XML entities in $XMLFILE" >&2
+ exit 1
+fi
+printf " %7s\n" "[DONE]"
+
+printf "%-64s\n" " Converting from DocBook 4 to 5..."
+xsltproc $UPGRADE_XSL $WORKFILE > $WORKFILE.db5
+if [ `stat --print="%s" $WORKFILE.db5` -lt 200 ];then
+ printf " %7s\n" "[FAIL]"
+ echo " FAILED: Conversion of $XMLFILE" >&2
+ exit 1
+fi
+printf " %7s\n" "[DONE]"
+
+# Overwrite working file with transformed content
+mv $WORKFILE.db5 $WORKFILE
+
+# Restore entities
+printf "%-64s" " Restoring XML entities..."
+php $ENT_RESTORE_SCRIPT $WORKFILE
+if [ "$?" -ne "0" ];then
+ printf " %7s\n" "[FAIL]"
+ echo " FAILED: Restoring XML entities in $XMLFILE" >&2
+ exit 1
+fi
+printf " %7s\n" "[DONE]"
+
+printf "%-64s" " Stripping conversion comment..."
+sed --regexp-extended --in-place 's/<!-- Converted by db4-upgrade version 1.0 -->//' $WORKFILE
+if [ "$?" -ne "0" ];then
+ printf " %7s\n" "[FAIL]"
+ echo " FAILED: Stripping DB4 conversion comments in $XMLFILE" >&2
+ exit 1
+fi
+printf " %7s\n" "[DONE]"
+
+printf "%-64s" " Adding XML declaration..."
+php $XML_INTRO_SCRIPT $WORKFILE
+if [ "$?" -ne "0" ];then
+ printf " %7s\n" "[FAIL]"
+ echo " FAILED: Adding XML declaration in $XMLFILE" >&2
+ exit 1
+fi
+printf " %7s\n" "[DONE]"
+
+printf "%-64s" " Fixing programlisting blocks..."
+php $XML_PL_SCRIPT $WORKFILE 1>&2
+if [ "$?" -ne "0" ];then
+ printf " %7s\n" "[FAIL]"
+ echo " FAILED: Fixing program listings in $XMLFILE" >&2
+ exit 1
+fi
+printf " %7s\n" "[DONE]"
+
+mv $WORKFILE $XMLFILE
+exit 0
@@ -0,0 +1,25 @@
+#!/bin/bash
+# File: upgradeDocbookBulk
+# vim: ft=sh
+XMLDIR=`pwd`
+if [ "$#" -ge 1 ];then
+ XMLDIR=$1
+fi
+
+echo "STARTING DOCBOOK CONVERSION"
+
+SCRIPTDIR=`dirname $0`
+ERRORS=0
+for f in `find $XMLDIR -name '*.xml'`
+do
+ $SCRIPTDIR/upgradeDocbook $f
+ if [ "$?" -ne "0" ];then
+ ERRORS=1
+ fi
+done
+
+echo "[DONE]"
+if [ "$ERRORS" -eq "1" ];then
+ echo "Script completed with errors; check logs for details."
+ exit 1
+fi

0 comments on commit 2f720fe

Please sign in to comment.