Skip to content

Commit

Permalink
Import OBS from Dokuwiki to new Gogs format
Browse files Browse the repository at this point in the history
  • Loading branch information
phillip-hopper committed Sep 24, 2016
1 parent 330ec13 commit c5a4e74
Show file tree
Hide file tree
Showing 759 changed files with 22,314 additions and 27 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,6 @@
.idea/
*.iml
*.pyc
*.egg-info/
build/
dist/
15 changes: 0 additions & 15 deletions README.md

This file was deleted.

75 changes: 75 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
master:

.. image:: https://travis-ci.org/unfoldingWord-dev/obs_tools.svg?branch=master
:alt: Build Status
:target: https://travis-ci.org/unfoldingWord-dev/obs_tools

.. image:: https://coveralls.io/repos/github/unfoldingWord-dev/obs_tools/badge.svg?branch=master
:alt: Coverage Status
:target: https://coveralls.io/github/unfoldingWord-dev/obs_tools

develop:

.. image:: https://travis-ci.org/unfoldingWord-dev/obs_tools.svg?branch=develop
:alt: Build Status
:target: https://travis-ci.org/unfoldingWord-dev/obs_tools

.. image:: https://coveralls.io/repos/github/unfoldingWord-dev/obs_tools/badge.svg?branch=develop
:alt: Coverage Status
:target: https://coveralls.io/github/unfoldingWord-dev/obs_tools

unfoldingWord OBS Tools
=======================

A collection of Python scripts that have proven useful and have been reused.

All code should be compatible with Python 2.7 and 3.5

**To use this library, install it in your Python environment like this:**

::

pip install obs-tools


**To install a particular version (tag, branch or commit) use this:**

::

pip install git+https://github.com/unfoldingWord-dev/obs_tools.git@Tag-Branch-or-Commit#egg=obs_tools


Submitting to pypi
******************

**Add the library to pypi if you haven't already.**

1. Run ``python setup.py sdist bdist_wheel --universal``.
2. Go to https://pypi.python.org/pypi?%3Aaction=submit_form
3. Click "Choose File" and pick ``obs_tools.egg-info/PKG-INFO``, then click "Add Package Info."

**Install twine**

::

sudo pip install twine

**Create settings file** ``~/.pypirc`` **with these contents:**

::

[distutils]
index-servers=pypi

[pypi]
repository = https://upload.pypi.org/legacy/
username = <USER-NAME>
password = <PASSWORD>

**Generate the packages and upload**

::

python setup.py sdist bdist_wheel --universal
twine upload dist/*

36 changes: 36 additions & 0 deletions cli/import_from_dw_to_git.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/usr/bin/env python2
# -*- coding: utf8 -*-
#
# Copyright (c) 2016 unfoldingWord
# http://creativecommons.org/licenses/MIT/
# See LICENSE file for details.
#
# Contributors:
# Phil Hopper <phillip_hopper@wycliffeassociates.org>
#

from __future__ import print_function, unicode_literals
import argparse
import sys
from general_tools.print_utils import print_ok
from obs.importer.from_dokuwiki import OBSDokuwikiImporter


if __name__ == '__main__':
    print()

    # (short flag, long flag, help text) for every required command line option;
    # the destination attribute is the long flag without its leading dashes
    option_specs = (
        ('-l', '--lang', 'Language code of resource.'),
        ('-r', '--gitrepo', 'Git repository where the source can be found.'),
        ('-o', '--outdir', 'The output directory for markdown files.'),
    )

    arg_parser = argparse.ArgumentParser(description=__doc__,
                                         formatter_class=argparse.RawDescriptionHelpFormatter)
    for short_flag, long_flag, help_text in option_specs:
        arg_parser.add_argument(short_flag, long_flag, dest=long_flag.lstrip('-'), default=False,
                                required=True, help=help_text)

    options = arg_parser.parse_args(sys.argv[1:])

    # do the import (the last argument is the importer's `quiet` flag, so progress is printed)
    with OBSDokuwikiImporter(options.lang, options.gitrepo, options.outdir, False) as importer:
        importer.run()

    print_ok('ALL FINISHED: ', 'Please check the output directory.')
Empty file added obs/importer/__init__.py
Empty file.
202 changes: 202 additions & 0 deletions obs/importer/from_dokuwiki.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
from __future__ import print_function, unicode_literals
import json
import os
import re
from general_tools.file_utils import write_file
from obs.obs_classes import OBS, OBSManifest, OBSManifestEncoder, OBSSourceTranslation
from general_tools.url_utils import get_languages, join_url_parts, get_url


class OBSDokuwikiImporter(object):
    """
    Downloads the OBS story files of one language from a Dokuwiki github repository, converts
    the Dokuwiki markup to markdown and writes the result, plus a manifest.json, to a directory.

    The class can be used as a context manager.
    """

    # regular expressions for replacing Dokuwiki formatting
    h1_re = re.compile(r'====== (.*?) ======', re.UNICODE)
    h2_re = re.compile(r'===== (.*?) =====', re.UNICODE)
    h3_re = re.compile(r'==== (.*?) ====', re.UNICODE)
    h4_re = re.compile(r'=== (.*?) ===', re.UNICODE)
    h5_re = re.compile(r'== (.*?) ==', re.UNICODE)
    # the look-behind skips the '//' of 'http://' without consuming (and so losing)
    # the character in front of the opening marker
    italic_re = re.compile(r'(?<!:)//(.*?)//', re.UNICODE)
    bold_re = re.compile(r'\*\*(.*?)\*\*', re.UNICODE)
    image_re = re.compile(r'\{\{(.*?)\}\}', re.UNICODE)
    link_re = re.compile(r'\[\[(http[s]*:[^:]*)\|(.*?)\]\]', re.UNICODE)
    li_re = re.compile(r'[ ]{1,3}(\*)', re.UNICODE)
    li_space_re = re.compile(r'^(\*.*\n)\n(?=\*)', re.UNICODE + re.MULTILINE)
    blank_lines_re = re.compile(r'\n{3,}', re.UNICODE)

    # regular expressions for removing text formatting
    html_tag_re = re.compile(r'<.*?>', re.UNICODE)
    link_tag_re = re.compile(r'\[\[.*?\]\]', re.UNICODE)

    # separator between names in the status fields: a comma or semicolon with optional whitespace
    list_sep_re = re.compile(r'\s*[;,]\s*')

    def __init__(self, lang_code, git_repo, out_dir, quiet):
        """
        :param unicode lang_code: Code of the language to import, matched against the 'lc' field
        :param unicode git_repo: URL of the github repository holding the Dokuwiki source
        :param unicode out_dir: Directory that receives the converted files
        :param bool quiet: When True, progress messages are not printed
        :raises Exception: if the repository is not on github or the language is unknown
        """
        self.git_repo = git_repo
        self.out_dir = out_dir
        self.quiet = quiet

        if 'github' not in git_repo:
            raise Exception('Currently only github repositories are supported.')

        # get the language data
        try:
            self.quiet_print('Downloading language data...', end=' ')
            langs = get_languages()
        finally:
            # terminate the progress line even when the download fails
            self.quiet_print('finished.')

        self.lang_data = next((lang for lang in langs if lang['lc'] == lang_code), None)

        if not self.lang_data:
            raise Exception('Information for language "{0}" was not found.'.format(lang_code))

    def __enter__(self):
        return self

    # noinspection PyUnusedLocal
    def __exit__(self, exc_type, exc_val, exc_tb):
        """Nothing to clean up; exceptions raised inside the with-block are not suppressed."""

    def run(self):
        """
        Downloads and converts the 50 stories plus front and back matter, then writes
        manifest.json built from the language's uwadmin status page.
        """
        lang_code = self.lang_data['lc']

        # pre-flight checklist: trailing slashes would end up doubled in the download URLs
        self.git_repo = self.git_repo.rstrip('/')

        # get the source files from the git repository
        base_url = self.git_repo.replace('github.com', 'raw.githubusercontent.com')

        # download OBS story files 01.txt through 50.txt
        content_dir = os.path.join(self.out_dir, 'content')
        for chapter in range(1, 51):
            self.download_obs_file(base_url, '{0:02d}.txt'.format(chapter), content_dir)

        # download front and back matter
        self.download_obs_file(base_url, 'front-matter.txt', os.path.join(content_dir, '_front'))
        self.download_obs_file(base_url, 'back-matter.txt', os.path.join(content_dir, '_back'))

        # get the status
        uwadmin_dir = 'https://raw.githubusercontent.com/Door43/d43-en/master/uwadmin'
        status = self.get_json_dict(join_url_parts(uwadmin_dir, lang_code, 'obs/status.txt'))

        manifest = OBSManifest()
        manifest.status['pub_date'] = status['publish_date']
        manifest.status['contributors'] = self.list_sep_re.split(status['contributors'])
        manifest.status['checking_level'] = status['checking_level']
        manifest.status['comments'] = status['comments']
        manifest.status['version'] = status['version']
        manifest.status['checking_entity'] = self.list_sep_re.split(status['checking_entity'])

        source_translation = OBSSourceTranslation()
        source_translation.language_slug = status['source_text']
        source_translation.resource_slug = 'obs'
        source_translation.version = status['source_text_version']

        manifest.status['source_translations'].append(source_translation)

        manifest.language['slug'] = lang_code
        manifest.language['name'] = self.lang_data['ang']
        manifest.language['dir'] = self.lang_data['ld']

        manifest_str = json.dumps(manifest, sort_keys=False, indent=2, cls=OBSManifestEncoder)
        write_file(os.path.join(self.out_dir, 'manifest.json'), manifest_str)

    def download_obs_file(self, base_url, file_to_download, out_dir):
        """
        Downloads one Dokuwiki file, converts it to markdown and saves it with a .md extension.
        :param unicode base_url: Raw-content URL of the repository
        :param unicode file_to_download: File name below master/obs, e.g. '01.txt'
        :param unicode out_dir: Directory the markdown file is written to
        """
        download_url = join_url_parts(base_url, 'master/obs', file_to_download)

        try:
            self.quiet_print('Downloading {0}...'.format(download_url), end=' ')
            dw_text = get_url(download_url)
        finally:
            # terminate the progress line even when the download fails
            self.quiet_print('finished.')

        self.quiet_print('Converting {0} to markdown...'.format(file_to_download), end=' ')
        md_text = self.replace_dokuwiki_text(dw_text)
        self.quiet_print('finished.')

        save_as = os.path.join(out_dir, file_to_download.replace('.txt', '.md'))

        self.quiet_print('Saving {0}...'.format(save_as), end=' ')
        write_file(save_as, md_text)
        self.quiet_print('finished.')

    def replace_dokuwiki_text(self, text):
        """
        Converts Dokuwiki markup (headings, italic, bold, images, external links and list
        items) to markdown and points the old API image URLs at the CDN.
        :param str text:
        :return: str
        """
        text = text.replace('\r', '')

        # collapse any run of 3 or more newlines to a single blank line
        text = self.blank_lines_re.sub('\n\n', text)

        # longest heading markers first, so '======' is not consumed by the pattern for '=='
        text = self.h1_re.sub(r'# \1', text)
        text = self.h2_re.sub(r'## \1', text)
        text = self.h3_re.sub(r'### \1', text)
        text = self.h4_re.sub(r'#### \1', text)
        text = self.h5_re.sub(r'##### \1', text)
        text = self.italic_re.sub(r'_\1_', text)
        text = self.bold_re.sub(r'__\1__', text)
        text = self.image_re.sub(r'![OBS Image](\1)', text)
        text = self.link_re.sub(r'[\2](\1)', text)

        # remove the indent in front of list bullets, then the blank line between adjacent items
        text = self.li_re.sub(r'\1', text)
        text = self.li_space_re.sub(r'\1', text)

        old_url = 'https://api.unfoldingword.org/obs/jpg/1/en/'
        cdn_url = 'https://cdn.door43.org/obs/jpg/'
        text = text.replace(old_url, cdn_url)

        return text

    def clean_text(self, text):
        """
        Cleans up text from possible DokuWiki and HTML tag pollution.
        :param str text:
        :return: str
        """
        text = self.html_tag_re.sub('', text)
        return self.link_tag_re.sub('', text)

    def get_json_dict(self, download_url):
        """
        Downloads a Dokuwiki status page and returns its 'Key: value' lines as a dictionary.
        :param unicode download_url:
        :return: dict
        """
        return self._parse_status_text(get_url(download_url))

    def _parse_status_text(self, status_text):
        """
        Parses the 'Key: value' lines of a status page. Keys are lower-cased with spaces
        replaced by underscores; headings ('#...') and '{{...}}' lines are ignored.
        :param str status_text:
        :return: dict
        """
        return_val = {}

        for line in status_text.replace('\r', '').split('\n'):

            if not line or line.startswith(('#', '{{')):
                continue

            # clean before looking for the separator: the only colon of a line may be inside
            # a tag or link that clean_text removes, e.g. '[[en:obs]]'
            cleaned = self.clean_text(line)
            if ':' not in cleaned:
                continue

            k, v = cleaned.split(':', 1)
            return_val[k.strip().lower().replace(' ', '_')] = v.strip()

        return return_val

    def quiet_print(self, message, end='\n'):
        """Prints the message unless the importer was created with quiet=True."""
        if not self.quiet:
            print(message, end=end)
14 changes: 2 additions & 12 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,15 @@
import os
from setuptools import setup


# Utility function to read the README file.
# Used for the long_description. It's nice, because now 1) we have a top level
# README file and 2) it's easier to type in the README file than to put a raw
# string in below ...
def read(f_name):
    """
    Returns the contents of a file located next to this script.
    :param str f_name: File name relative to the directory of this script
    :return: str
    """
    # the with-statement closes the handle; the bare open(...).read() left that to the garbage collector
    with open(os.path.join(os.path.dirname(__file__), f_name)) as in_file:
        return in_file.read()


setup(
    name="obs_tools",
    version="0.0.1",
    author="unfoldingWord",
    # each keyword may be passed only once; a repeated keyword argument is a SyntaxError
    author_email="phillip_hopper@wycliffeassociates.org",
    description="A collection of useful scripts",
    license="MIT",
    keywords="unfoldingWord obs tools",
    url="https://github.com/unfoldingWord-dev/obs_tools",
    # sub-packages must be listed explicitly, otherwise obs.importer is left out of the distribution
    packages=['obs', 'obs.importer'],
    long_description='A collection of Python scripts that have proven useful and have been reused.',
    classifiers=[]
)
Loading

0 comments on commit c5a4e74

Please sign in to comment.