Import OBS from Dokuwiki to new Gogs format

unfoldingWord-dev · Sep 24, 2016 · c5a4e74 · c5a4e74
1 parent 330ec13
commit c5a4e74
Show file tree

Hide file tree

Showing 759 changed files with 22,314 additions and 27 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,6 @@
 .idea/
 *.iml
+*.pyc
+*.egg-info/
+build/
+dist/
diff --git a/README.md b/README.md
diff --git a/README.rst b/README.rst
@@ -0,0 +1,75 @@
+master:
+
+.. image:: https://travis-ci.org/unfoldingWord-dev/obs_tools.svg?branch=master
+    :alt: Build Status
+    :target: https://travis-ci.org/unfoldingWord-dev/obs_tools
+
+.. image:: https://coveralls.io/repos/github/unfoldingWord-dev/obs_tools/badge.svg?branch=master
+    :alt: Coverage Status
+    :target: https://coveralls.io/github/unfoldingWord-dev/obs_tools
+
+develop:
+
+.. image:: https://travis-ci.org/unfoldingWord-dev/obs_tools.svg?branch=develop
+    :alt: Build Status
+    :target: https://travis-ci.org/unfoldingWord-dev/obs_tools
+
+.. image:: https://coveralls.io/repos/github/unfoldingWord-dev/obs_tools/badge.svg?branch=develop
+    :alt: Coverage Status
+    :target: https://coveralls.io/github/unfoldingWord-dev/obs_tools
+
+unfoldingWord OBS Tools
+=======================
+
+A collection of Python scripts that have proven useful and have been reused.
+
+All code should be compatible with Python 2.7 and 3.5
+
+**To use this library, install it in your Python environment like this:**
+
+::
+
+    pip install obs-tools
+
+
+**To install a particular version (tag, branch or commit) use this:**
+
+::
+
+    pip install git+https://github.com/unfoldingWord-dev/obs_tools.git@Tag-Branch-or-Commit#egg=obs_tools
+
+
+Submitting to pypi
+******************
+
+**Add the library to pypi if you haven't already.**
+
+1. Run ``python setup.py sdist bdist_wheel --universal``.
+2. Go to https://pypi.python.org/pypi?%3Aaction=submit_form
+3. Click "Choose File" and pick ``obs_tools.egg-info/PKG-INFO``, then click "Add Package Info."
+
+**Install twine**
+
+::
+
+    sudo pip install twine
+
+**Create settings file ``~/.pypirc`` with these contents:**
+
+::
+
+    [distutils]
+    index-servers=pypi
+
+    [pypi]
+    repository = https://upload.pypi.org/legacy/
+    username = <USER-NAME>
+    password = <PASSWORD>
+
+**Generate the packages and upload**
+
+::
+
+    python setup.py sdist bdist_wheel --universal
+    twine upload dist/*
+
diff --git a/cli/import_from_dw_to_git.py b/cli/import_from_dw_to_git.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python2
+# -*- coding: utf8 -*-
+#
+#  Copyright (c) 2016 unfoldingWord
+#  http://creativecommons.org/licenses/MIT/
+#  See LICENSE file for details.
+#
+#  Contributors:
+#  Phil Hopper <phillip_hopper@wycliffeassociates.org>
+#
+
+from __future__ import print_function, unicode_literals
+import argparse
+import sys
+from general_tools.print_utils import print_ok
+from obs.importer.from_dokuwiki import OBSDokuwikiImporter
+
+
+if __name__ == '__main__':
+    print()
+
+    # This module has no docstring, so __doc__ is None; give argparse an explicit description instead.
+    parser = argparse.ArgumentParser(description='Imports OBS from a Dokuwiki github repository '
+                                                 'and writes it as markdown files.',
+                                     formatter_class=argparse.RawDescriptionHelpFormatter)
+
+    # All three options are required, so no default value is given for any of them.
+    parser.add_argument('-l', '--lang', dest='lang',
+                        required=True, help='Language code of resource.')
+    parser.add_argument('-r', '--gitrepo', dest='gitrepo',
+                        required=True, help='Git repository where the source can be found.')
+    parser.add_argument('-o', '--outdir', dest='outdir',
+                        required=True, help='The output directory for markdown files.')
+
+    args = parser.parse_args(sys.argv[1:])
+
+    # do the import (the last argument is `quiet`; False means progress messages are printed)
+    with OBSDokuwikiImporter(args.lang, args.gitrepo, args.outdir, False) as importer:
+        importer.run()
+
+    print_ok('ALL FINISHED: ', 'Please check the output directory.')
diff --git a/obs/importer/__init__.py b/obs/importer/__init__.py
diff --git a/obs/importer/from_dokuwiki.py b/obs/importer/from_dokuwiki.py
@@ -0,0 +1,202 @@
+from __future__ import print_function, unicode_literals
+import json
+import os
+import re
+from general_tools.file_utils import write_file
+from obs.obs_classes import OBS, OBSManifest, OBSManifestEncoder, OBSSourceTranslation
+from general_tools.url_utils import get_languages, join_url_parts, get_url
+
+
+class OBSDokuwikiImporter(object):
+    """
+    Converts OBS content stored as Dokuwiki text in a github repository to markdown files.
+
+    ``run`` downloads the story files plus the front and back matter, writes the converted markdown
+    below ``<out_dir>/content`` and writes ``<out_dir>/manifest.json`` built from the downloaded status page.
+    The class can be used as a context manager; ``__exit__`` currently has nothing to clean up.
+    """
+
+    # regular expressions for replacing Dokuwiki formatting
+    h1_re = re.compile(r'====== (.*?) ======', re.UNICODE)
+    h2_re = re.compile(r'===== (.*?) =====', re.UNICODE)
+    h3_re = re.compile(r'==== (.*?) ====', re.UNICODE)
+    h4_re = re.compile(r'=== (.*?) ===', re.UNICODE)
+    h5_re = re.compile(r'== (.*?) ==', re.UNICODE)
+
+    # The look-behind skips the '//' that follows a URL scheme ('http://') without consuming the
+    # character in front of the opening '//', so that character is not lost by the substitution.
+    italic_re = re.compile(r'(?<!:)//(.*?)//', re.UNICODE)
+    bold_re = re.compile(r'\*\*(.*?)\*\*', re.UNICODE)
+    image_re = re.compile(r'\{\{(.*?)\}\}', re.UNICODE)
+    link_re = re.compile(r'\[\[(https?:[^:]*)\|(.*?)\]\]', re.UNICODE)
+    li_re = re.compile(r'[ ]{1,3}(\*)', re.UNICODE)
+    li_space_re = re.compile(r'^(\*.*\n)\n(?=\*)', re.UNICODE | re.MULTILINE)
+
+    # a run of three or more line breaks, i.e. more than one blank line
+    blank_lines_re = re.compile(r'\n{3,}', re.UNICODE)
+
+    # separator between names in the contributors and checking entity values of the status page
+    name_sep_re = re.compile(r'\s*;\s*|\s*,\s*', re.UNICODE)
+
+    # regular expressions for removing text formatting
+    html_tag_re = re.compile(r'<.*?>', re.UNICODE)
+    link_tag_re = re.compile(r'\[\[.*?\]\]', re.UNICODE)
+
+    def __init__(self, lang_code, git_repo, out_dir, quiet):
+        """
+        Stores the settings and looks up the language data for lang_code.
+
+        :param unicode lang_code: Language code to find in the downloaded language data
+        :param unicode git_repo: URL of the github repository that holds the Dokuwiki source
+        :param unicode out_dir: Directory the converted files are written to
+        :param bool quiet: If True, progress messages are not printed
+        :raises Exception: if git_repo does not contain 'github', or lang_code is not found
+        """
+        self.git_repo = git_repo
+        self.out_dir = out_dir
+        self.quiet = quiet
+
+        if 'github' not in git_repo:
+            raise Exception('Currently only github repositories are supported.')
+
+        # get the language data; 'finished.' is only reported when the download did not raise
+        self.quiet_print('Downloading language data...', end=' ')
+        langs = get_languages()
+        self.quiet_print('finished.')
+
+        self.lang_data = next((lang for lang in langs if lang['lc'] == lang_code), None)
+
+        if not self.lang_data:
+            raise Exception('Information for language "{0}" was not found.'.format(lang_code))
+
+    def __enter__(self):
+        return self
+
+    # noinspection PyUnusedLocal
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        # nothing to clean up; returning None lets any exception propagate
+        pass
+
+    def run(self):
+        """
+        Downloads and converts all OBS files, then writes manifest.json to the output directory.
+
+        :raises KeyError: if the downloaded status page lacks one of the values copied into the manifest
+        """
+        lang_code = self.lang_data['lc']
+
+        # pre-flight checklist: normalize the repository URL by removing any trailing slash
+        self.git_repo = self.git_repo.rstrip('/')
+
+        # the source files are fetched from the matching raw.githubusercontent.com URL
+        base_url = self.git_repo.replace('github.com', 'raw.githubusercontent.com')
+
+        # initialize
+        # NOTE(review): obs_obj is populated here but not used again in this method
+        obs_obj = OBS()
+        obs_obj.direction = self.lang_data['ld']
+        obs_obj.language = lang_code
+
+        # download OBS story files 01.txt through 50.txt
+        content_dir = os.path.join(self.out_dir, 'content')
+        for story_num in range(1, 51):
+            self.download_obs_file(base_url, '{0:02d}.txt'.format(story_num), content_dir)
+
+        # download front and back matter
+        self.download_obs_file(base_url, 'front-matter.txt', os.path.join(content_dir, '_front'))
+        self.download_obs_file(base_url, 'back-matter.txt', os.path.join(content_dir, '_back'))
+
+        # get the status
+        uwadmin_dir = 'https://raw.githubusercontent.com/Door43/d43-en/master/uwadmin'
+        status = self.get_json_dict(join_url_parts(uwadmin_dir, lang_code, 'obs/status.txt'))
+
+        manifest = OBSManifest()
+        manifest.status['pub_date'] = status['publish_date']
+        manifest.status['contributors'] = self.name_sep_re.split(status['contributors'])
+        manifest.status['checking_level'] = status['checking_level']
+        manifest.status['comments'] = status['comments']
+        manifest.status['version'] = status['version']
+        manifest.status['checking_entity'] = self.name_sep_re.split(status['checking_entity'])
+
+        source_translation = OBSSourceTranslation()
+        source_translation.language_slug = status['source_text']
+        source_translation.resource_slug = 'obs'
+        source_translation.version = status['source_text_version']
+
+        manifest.status['source_translations'].append(source_translation)
+
+        manifest.language['slug'] = lang_code
+        manifest.language['name'] = self.lang_data['ang']
+        manifest.language['dir'] = self.lang_data['ld']
+
+        manifest_str = json.dumps(manifest, sort_keys=False, indent=2, cls=OBSManifestEncoder)
+        write_file(os.path.join(self.out_dir, 'manifest.json'), manifest_str)
+
+    def download_obs_file(self, base_url, file_to_download, out_dir):
+        """
+        Downloads one Dokuwiki file, converts it to markdown and saves it with a .md extension.
+
+        :param unicode base_url: Base URL of the repository; 'master/obs' and the file name are appended
+        :param unicode file_to_download: Name of the .txt file to download
+        :param unicode out_dir: Directory the .md file is written to
+        """
+        download_url = join_url_parts(base_url, 'master/obs', file_to_download)
+
+        # 'finished.' is only reported when the download did not raise
+        self.quiet_print('Downloading {0}...'.format(download_url), end=' ')
+        dw_text = get_url(download_url)
+        self.quiet_print('finished.')
+
+        self.quiet_print('Converting {0} to markdown...'.format(file_to_download), end=' ')
+        md_text = self.replace_dokuwiki_text(dw_text)
+        self.quiet_print('finished.')
+
+        save_as = os.path.join(out_dir, file_to_download.replace('.txt', '.md'))
+
+        self.quiet_print('Saving {0}...'.format(save_as), end=' ')
+        write_file(save_as, md_text)
+        self.quiet_print('finished.')
+
+    def replace_dokuwiki_text(self, text):
+        """
+        Converts Dokuwiki markup in text to markdown.
+
+        Carriage returns are removed, multiple blank lines are reduced to one, then headings, italic,
+        bold, images, external links and list items are rewritten. Finally image URLs that point to the
+        old API location are changed to the CDN location.
+
+        :param str text: Dokuwiki text
+        :return: str
+        """
+        text = text.replace('\r', '')
+
+        # a single pass handles a run of any length; chained fixed-length replaces miss some lengths
+        text = self.blank_lines_re.sub('\n\n', text)
+
+        # headings are converted from most to fewest '=' so a shorter pattern cannot match a longer heading first
+        text = self.h1_re.sub(r'# \1', text)
+        text = self.h2_re.sub(r'## \1', text)
+        text = self.h3_re.sub(r'### \1', text)
+        text = self.h4_re.sub(r'#### \1', text)
+        text = self.h5_re.sub(r'##### \1', text)
+
+        text = self.italic_re.sub(r'_\1_', text)
+
+        # bold must be converted before list items, because li_re also matches a '*' that follows spaces
+        text = self.bold_re.sub(r'__\1__', text)
+        text = self.image_re.sub(r'![OBS Image](\1)', text)
+        text = self.link_re.sub(r'[\2](\1)', text)
+        text = self.li_re.sub(r'\1', text)
+        text = self.li_space_re.sub(r'\1', text)
+
+        old_url = 'https://api.unfoldingword.org/obs/jpg/1/en/'
+        cdn_url = 'https://cdn.door43.org/obs/jpg/'
+        return text.replace(old_url, cdn_url)
+
+    def clean_text(self, text):
+        """
+        Removes HTML tags and Dokuwiki [[...]] links from text.
+
+        :param str text:
+        :return: str
+        """
+        text = self.html_tag_re.sub('', text)
+        return self.link_tag_re.sub('', text)
+
+    def get_json_dict(self, download_url):
+        """
+        Downloads a status page and returns its 'Key: value' lines as a dictionary.
+
+        :param unicode download_url: URL of the status page
+        :return: dict
+        """
+        return self._parse_status_text(get_url(download_url))
+
+    def _parse_status_text(self, status_text):
+        """
+        Builds a dictionary from the 'Key: value' lines of status_text.
+
+        Empty lines, lines starting with '#' or '{{', and lines that have no ':' once tags and links
+        have been removed are skipped. Keys are lower-cased and their spaces replaced by underscores.
+
+        :param str status_text:
+        :return: dict
+        """
+        return_val = {}
+
+        for line in status_text.replace('\r', '').split('\n'):
+
+            if not line or line.startswith('#') or line.startswith('{{'):
+                continue
+
+            # clean first: a ':' that only occurs inside a removed tag or link does not separate a key
+            cleaned = self.clean_text(line)
+            if ':' not in cleaned:
+                continue
+
+            k, v = cleaned.split(':', 1)
+            return_val[k.strip().lower().replace(' ', '_')] = v.strip()
+
+        return return_val
+
+    def quiet_print(self, message, end='\n'):
+        """
+        Prints message unless quiet is set.
+
+        :param unicode message:
+        :param unicode end: Passed through to print()
+        """
+        if not self.quiet:
+            print(message, end=end)
diff --git a/setup.py b/setup.py
@@ -1,25 +1,15 @@
-import os
 from setuptools import setup
 
-
-# Utility function to read the README file.
-# Used for the long_description.  It's nice, because now 1) we have a top level
-# README file and 2) it's easier to type in the README file than to put a raw
-# string in below ...
-def read(f_name):
-    return open(os.path.join(os.path.dirname(__file__), f_name)).read()
-
-
 setup(
     name="obs_tools",
     version="0.0.1",
     author="unfoldingWord",
-    author_email="unfoldingword.org",
+    author_email="phillip_hopper@wycliffeassociates.org",
     description="A collection of useful scripts",
     license="MIT",
     keywords="unfoldingWord obs tools",
-    url="https://github.org/unfoldingWord-dev/obs_tools",
-    packages=['obs'],
-    long_description=read('README.md'),
+    url="https://github.com/unfoldingWord-dev/obs_tools",
+    packages=['obs', 'obs.importer'],
+    long_description='A collection of Python scripts that have proven useful and have been reused.',
     classifiers=[]
 )