Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Develop #6

Merged
merged 4 commits into from
Sep 24, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,6 @@
.idea/
*.iml
*.pyc
*.egg-info/
build/
dist/
15 changes: 0 additions & 15 deletions README.md

This file was deleted.

75 changes: 75 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
master:

.. image:: https://travis-ci.org/unfoldingWord-dev/obs_tools.svg?branch=master
:alt: Build Status
:target: https://travis-ci.org/unfoldingWord-dev/obs_tools

.. image:: https://coveralls.io/repos/github/unfoldingWord-dev/obs_tools/badge.svg?branch=master
:alt: Coverage Status
:target: https://coveralls.io/github/unfoldingWord-dev/obs_tools

develop:

.. image:: https://travis-ci.org/unfoldingWord-dev/obs_tools.svg?branch=develop
:alt: Build Status
:target: https://travis-ci.org/unfoldingWord-dev/obs_tools

.. image:: https://coveralls.io/repos/github/unfoldingWord-dev/obs_tools/badge.svg?branch=develop
:alt: Coverage Status
:target: https://coveralls.io/github/unfoldingWord-dev/obs_tools

unfoldingWord OBS Tools
=======================

A collection of Python scripts that have proven useful and have been reused.

All code should be compatible with Python 2.7 and 3.5.

**To use this library, install it in your Python environment like this:**

::

pip install obs-tools


**To install a particular version (tag, branch or commit) use this:**

::

pip install git+https://github.com/unfoldingWord-dev/obs_tools.git@Tag-Branch-or-Commit#egg=obs_tools


Submitting to pypi
******************

**Add the library to pypi if you haven't already.**

1. Run ``python setup.py sdist bdist_wheel --universal``.
2. Go to https://pypi.python.org/pypi?%3Aaction=submit_form
3. Click "Choose File" and pick ``obs_tools.egg-info/PKG-INFO``, then click "Add Package Info."

**Install twine**

::

sudo pip install twine

**Create settings file** ``~/.pypirc`` **with these contents:**

::

[distutils]
index-servers=pypi

[pypi]
repository = https://upload.pypi.org/legacy/
username = <USER-NAME>
password = <PASSWORD>

**Generate the packages and upload**

::

python setup.py sdist bdist_wheel --universal
twine upload dist/*

36 changes: 36 additions & 0 deletions cli/import_from_dw_to_git.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/usr/bin/env python2
# -*- coding: utf8 -*-
#
# Copyright (c) 2016 unfoldingWord
# http://creativecommons.org/licenses/MIT/
# See LICENSE file for details.
#
# Contributors:
# Phil Hopper <phillip_hopper@wycliffeassociates.org>
#

from __future__ import print_function, unicode_literals
import argparse
import sys
from general_tools.print_utils import print_ok
from obs.importer.from_dokuwiki import OBSDokuwikiImporter


if __name__ == '__main__':
print()
parser = argparse.ArgumentParser(description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-l', '--lang', dest='lang', default=False,
required=True, help='Language code of resource.')
parser.add_argument('-r', '--gitrepo', dest='gitrepo', default=False,
required=True, help='Git repository where the source can be found.')
parser.add_argument('-o', '--outdir', dest='outdir', default=False,
required=True, help='The output directory for markdown files.')

args = parser.parse_args(sys.argv[1:])

# do the import
with OBSDokuwikiImporter(args.lang, args.gitrepo, args.outdir, False) as importer:
importer.run()

print_ok('ALL FINISHED: ', 'Please check the output directory.')
Empty file added obs/importer/__init__.py
Empty file.
202 changes: 202 additions & 0 deletions obs/importer/from_dokuwiki.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
from __future__ import print_function, unicode_literals
import json
import os
import re
from general_tools.file_utils import write_file
from obs.obs_classes import OBS, OBSManifest, OBSManifestEncoder, OBSSourceTranslation
from general_tools.url_utils import get_languages, join_url_parts, get_url


class OBSDokuwikiImporter(object):
    """
    Downloads the OBS DokuWiki text files of one language from a GitHub repository, converts them to
    Markdown and writes them, together with a ``manifest.json``, below an output directory.

    The class can be used as a context manager; ``__exit__`` currently has nothing to clean up.
    """

    # regular expressions for replacing Dokuwiki formatting
    h1_re = re.compile(r'====== (.*?) ======', re.UNICODE)
    h2_re = re.compile(r'===== (.*?) =====', re.UNICODE)
    h3_re = re.compile(r'==== (.*?) ====', re.UNICODE)
    h4_re = re.compile(r'=== (.*?) ===', re.UNICODE)
    h5_re = re.compile(r'== (.*?) ==', re.UNICODE)
    # The look-behind skips the '//' of 'http://' without consuming the character in front of the
    # opening marker, so that character is kept and italics at the very start of the text match too.
    italic_re = re.compile(r'(?<!:)//(.*?)//', re.UNICODE)
    bold_re = re.compile(r'\*\*(.*?)\*\*', re.UNICODE)
    image_re = re.compile(r'\{\{(.*?)\}\}', re.UNICODE)
    link_re = re.compile(r'\[\[(http[s]*:[^:]*)\|(.*?)\]\]', re.UNICODE)
    li_re = re.compile(r'[ ]{1,3}(\*)', re.UNICODE)
    li_space_re = re.compile(r'^(\*.*\n)\n(?=\*)', re.UNICODE | re.MULTILINE)
    # three or more consecutive line breaks, i.e. more than one blank line
    blank_lines_re = re.compile(r'\n{3,}', re.UNICODE)

    # regular expressions for removing text formatting
    html_tag_re = re.compile(r'<.*?>', re.UNICODE)
    link_tag_re = re.compile(r'\[\[.*?\]\]', re.UNICODE)

    def __init__(self, lang_code, git_repo, out_dir, quiet):
        """
        Stores the settings and looks up the language record for ``lang_code``.

        :param unicode lang_code: language code, compared with the 'lc' value of each language record
        :param unicode git_repo: URL of the source repository; must contain 'github'
        :param unicode out_dir: directory that receives the generated files
        :param bool quiet: when True, progress messages are suppressed
        :raises Exception: if ``git_repo`` is not a github URL or no record matches ``lang_code``
        """
        self.git_repo = git_repo
        self.out_dir = out_dir
        self.quiet = quiet

        if 'github' not in git_repo:
            raise Exception('Currently only github repositories are supported.')

        # get the language data
        try:
            self.quiet_print('Downloading language data...', end=' ')
            langs = get_languages()
        finally:
            # always terminate the progress line, even when the download raises
            self.quiet_print('finished.')

        self.lang_data = next((lang for lang in langs if lang['lc'] == lang_code), None)

        if not self.lang_data:
            raise Exception('Information for language "{0}" was not found.'.format(lang_code))

    def __enter__(self):
        return self

    # noinspection PyUnusedLocal
    def __exit__(self, exc_type, exc_val, exc_tb):
        """
        Nothing to release at present; exceptions raised inside the ``with`` body propagate.
        """
        pass

    def run(self):
        """
        Downloads and converts the story files 01-50 plus front and back matter, then writes manifest.json.

        Story files go to ``<out_dir>/content``, front matter to ``<out_dir>/content/_front`` and back
        matter to ``<out_dir>/content/_back``. The manifest is built from the language record and from
        the ``obs/status.txt`` file of the language in the Door43 ``uwadmin`` directory.
        """
        lang_code = self.lang_data['lc']

        # pre-flight checklist
        if self.git_repo.endswith('/'):
            self.git_repo = self.git_repo[:-1]

        # get the source files from the git repository
        base_url = self.git_repo.replace('github.com', 'raw.githubusercontent.com')

        # initialize
        # NOTE(review): obs_obj is populated here but not used again in this method - confirm whether
        # it is still needed.
        obs_obj = OBS()
        obs_obj.direction = self.lang_data['ld']
        obs_obj.language = lang_code

        # download OBS story files (01.txt ... 50.txt)
        story_dir = os.path.join(self.out_dir, 'content')
        for story_number in range(1, 51):
            self.download_obs_file(base_url, str(story_number).zfill(2) + '.txt', story_dir)

        # download front and back matter
        self.download_obs_file(base_url, 'front-matter.txt', os.path.join(self.out_dir, 'content', '_front'))
        self.download_obs_file(base_url, 'back-matter.txt', os.path.join(self.out_dir, 'content', '_back'))

        # get the status
        uwadmin_dir = 'https://raw.githubusercontent.com/Door43/d43-en/master/uwadmin'
        status = self.get_json_dict(join_url_parts(uwadmin_dir, lang_code, 'obs/status.txt'))
        manifest = OBSManifest()
        manifest.status['pub_date'] = status['publish_date']
        # names are separated by semicolons or commas, with optional surrounding whitespace
        manifest.status['contributors'] = re.split(r'\s*;\s*|\s*,\s*', status['contributors'])
        manifest.status['checking_level'] = status['checking_level']
        manifest.status['comments'] = status['comments']
        manifest.status['version'] = status['version']
        manifest.status['checking_entity'] = re.split(r'\s*;\s*|\s*,\s*', status['checking_entity'])

        source_translation = OBSSourceTranslation()
        source_translation.language_slug = status['source_text']
        source_translation.resource_slug = 'obs'
        source_translation.version = status['source_text_version']

        manifest.status['source_translations'].append(source_translation)

        manifest.language['slug'] = lang_code
        manifest.language['name'] = self.lang_data['ang']
        manifest.language['dir'] = self.lang_data['ld']

        manifest_str = json.dumps(manifest, sort_keys=False, indent=2, cls=OBSManifestEncoder)
        write_file(os.path.join(self.out_dir, 'manifest.json'), manifest_str)

    def download_obs_file(self, base_url, file_to_download, out_dir):
        """
        Downloads one DokuWiki file from ``<base_url>/master/obs``, converts it and saves it as ``.md``.

        :param unicode base_url: raw-content base URL of the repository
        :param unicode file_to_download: file name, e.g. '01.txt'
        :param unicode out_dir: directory in which the converted file is saved
        """
        download_url = join_url_parts(base_url, 'master/obs', file_to_download)

        try:
            self.quiet_print('Downloading {0}...'.format(download_url), end=' ')
            dw_text = get_url(download_url)
        finally:
            # always terminate the progress line, even when the download raises
            self.quiet_print('finished.')

        self.quiet_print('Converting {0} to markdown...'.format(file_to_download), end=' ')
        md_text = self.replace_dokuwiki_text(dw_text)
        self.quiet_print('finished.')

        save_as = os.path.join(out_dir, file_to_download.replace('.txt', '.md'))

        self.quiet_print('Saving {0}...'.format(save_as), end=' ')
        write_file(save_as, md_text)
        self.quiet_print('finished.')

    def replace_dokuwiki_text(self, text):
        """
        Converts DokuWiki markup (headings, italic, bold, images, links, list items) to Markdown.

        Carriage returns are removed, runs of blank lines are collapsed to a single blank line and image
        URLs below ``https://api.unfoldingword.org/obs/jpg/1/en/`` are redirected to the door43 CDN.
        :param str text:
        :return: str
        """
        text = text.replace('\r', '')
        # collapse any run of 3 or more line breaks, however long, to exactly one blank line
        text = self.blank_lines_re.sub('\n\n', text)

        # headings: longest marker first, so a shorter pattern cannot match inside a longer one
        text = self.h1_re.sub(r'# \1', text)
        text = self.h2_re.sub(r'## \1', text)
        text = self.h3_re.sub(r'### \1', text)
        text = self.h4_re.sub(r'#### \1', text)
        text = self.h5_re.sub(r'##### \1', text)

        text = self.italic_re.sub(r'_\1_', text)
        text = self.bold_re.sub(r'__\1__', text)
        text = self.image_re.sub(r'![OBS Image](\1)', text)
        text = self.link_re.sub(r'[\2](\1)', text)

        # list items: drop the leading spaces, then the blank line between consecutive items
        text = self.li_re.sub(r'\1', text)
        text = self.li_space_re.sub(r'\1', text)

        old_url = 'https://api.unfoldingword.org/obs/jpg/1/en/'
        cdn_url = 'https://cdn.door43.org/obs/jpg/'
        text = text.replace(old_url, cdn_url)

        return text

    def clean_text(self, text):
        """
        Removes HTML tags (``<...>``) and DokuWiki links (``[[...]]``) from the text.
        :param str text:
        :return: str
        """
        text = self.html_tag_re.sub('', text)
        text = self.link_tag_re.sub('', text)
        return text

    def get_json_dict(self, download_url):
        """
        Downloads a ``key: value`` status file and returns it as a dictionary.

        :param unicode download_url: URL of the status file
        :return: dict that maps the normalized keys (lower case, spaces replaced by '_') to their values
        """
        return self._parse_status_text(get_url(download_url))

    def _parse_status_text(self, status_text):
        """
        Parses ``key: value`` lines into a dictionary, ignoring comments, ``{{...}}`` lines and blank lines.
        :param str status_text:
        :return: dict
        """
        return_val = {}

        for line in status_text.replace('\r', '').split('\n'):

            if not line or line.startswith('#') or line.startswith('{{'):
                continue

            # Clean before looking for the separator: the only colon of a line may sit inside a tag or
            # link that clean_text() removes, which would otherwise break the unpacking below.
            cleaned = self.clean_text(line)
            if ':' not in cleaned:
                continue

            k, v = cleaned.split(':', 1)
            return_val[k.strip().lower().replace(' ', '_')] = v.strip()

        return return_val

    def quiet_print(self, message, end='\n'):
        """
        Prints the message unless the importer was created with ``quiet=True``.
        :param str message:
        :param str end: text printed after the message
        """
        if not self.quiet:
            print(message, end=end)
16 changes: 3 additions & 13 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,15 @@
import os
from setuptools import setup


# Utility function to read a file that lives next to this script, such as the README.
# Intended for supplying setup()'s long_description: a top level README file is
# easier to maintain than a raw string passed to setup().
def read(f_name):
    """
    Return the content of f_name, resolved relative to the directory of this file.

    :param str f_name: file name or path relative to this file's directory
    :return: the complete text of the file
    """
    # the context manager closes the handle even when reading raises
    with open(os.path.join(os.path.dirname(__file__), f_name)) as readme_file:
        return readme_file.read()


# Package metadata. Each keyword is passed exactly once; a repeated keyword argument is a SyntaxError.
setup(
    name="obs_tools",
    version="0.0.2",
    author="unfoldingWord",
    author_email="phillip_hopper@wycliffeassociates.org",
    description="A collection of useful scripts",
    license="MIT",
    keywords="unfoldingWord obs tools",
    url="https://github.com/unfoldingWord-dev/obs_tools",
    # sub-packages are not discovered automatically, so obs.importer has to be listed explicitly
    packages=['obs', 'obs.importer'],
    long_description='A collection of Python scripts that have proven useful and have been reused.',
    classifiers=[]
)