Skip to content

Commit

Permalink
Merge d786954 into 11235f8
Browse files Browse the repository at this point in the history
  • Loading branch information
thePortus committed May 24, 2018
2 parents 11235f8 + d786954 commit b35297c
Show file tree
Hide file tree
Showing 6 changed files with 52 additions and 29 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ os: linux
dist: trusty
# set environment variables
env:
- PACKAGE_VERSION=0.0.4
- PACKAGE_VERSION=0.0.5
# install dependencies
install:
- pip install -r requirements/dev.txt
Expand Down
21 changes: 17 additions & 4 deletions dhelp/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,11 @@
]

NLTK_PACKAGES = {
'english': [
'all': [
('punkt', ['tokenizers', 'punkt.zip']),
('verbnet', ['corpora', 'verbnet.zip']),
('wordnet', ['corpora', 'wordnet.zip']),
('words', ['corpora', 'words.zip']),
('large_grammars', ['grammars', 'large_grammars.zip']),
('large_grammars', ['grammars', 'large_grammars.zip']),
(
'averaged_perceptron_tagger',
Expand All @@ -24,12 +25,24 @@
'maxent_treebank_pos_tagger',
['taggers', 'maxent_treebank_pos_tagger.zip']
),
('universal_tagset', ['taggers', 'universal_tagset.zip']),
('punkt', ['tokenizers', 'punkt.zip']),
('maxent_ne_chunker', ['chunkers', 'maxent_ne_chunker.zip']),
('universal_tagset', ['taggers', 'universal_tagset.zip']),
],
'english': [
('words', ['corpora', 'words.zip']),
('sample_grammars', ['grammars', 'sample_grammars.zip']),
('book_grammars', ['grammars', 'book_grammars.zip']),
('perluniprops', ['misc', 'perluniprops.zip'])
],
'spanish': [
('spanish_grammars', ['grammars', 'spanish_grammars.zip'])
],
'basque': [
('basque_grammars', ['grammars', 'basque_grammars.zip'])
]
}

# TODO: Change CLTK setup so it expects path segments like NLTK settings
CLTK_PACKAGES = {
'greek': [
('greek_software_tlgu', 'software/greek_software_tlgu'),
Expand Down
15 changes: 9 additions & 6 deletions dhelp/text/_bases.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,18 @@ class BaseText(UserString):
>>> print(text)
'Lorem ipsum dolor sit amet...'
""" # noqa
options = {
'encoding': 'utf-8',
'language': 'english'
}

def __init__(self, text, options={}):
def __init__(self, text, *args, **kwargs):
super().__init__(str)
if 'encoding' not in options:
options['encoding'] = 'utf-8'
if 'language' not in options:
options['language'] = 'english'
# update .options if options keyword arg passed
if 'options' in kwargs:
if type(kwargs['options']) == dict:
self.options.update(kwargs['options'])
self.data = text
self.options = options

def __enter__(self):
pass
Expand Down
16 changes: 8 additions & 8 deletions dhelp/text/cltk.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,10 +256,10 @@ class LatinText(CLTKMixin, BaseText):
>>> print(text.lemmatize())
gallia edo1 omne divido in pars tres
"""

def __init__(self, text, options={}):
options['language'] = 'latin'
super().__init__(text=text, options=options)
options = {
'encoding': 'utf-8',
'language': 'latin'
}

def macronize(self, mode='tag_ngram_123_backoff'):
"""Adds macrons (long vowel marks).
Expand Down Expand Up @@ -366,10 +366,10 @@ class AncientGreekText(CLTKMixin, BaseText):
>>> print(text.lemmatize())
εἰμί δὲ σύμπας οὗτος τὰ σύγγραμμα ἐκεῖνος μάλιστα οὐ ὠφέλιμος , ὅστις ὡς πρὸς οἶδα συγγράφω.
""" # noqa

def __init__(self, text, options={}):
options['language'] = 'greek'
super().__init__(text=text, options=options)
options = {
'encoding': 'utf-8',
'language': 'greek'
}

def normalize(self):
"""Fixes problems with differences in greek accent encoding.
Expand Down
25 changes: 16 additions & 9 deletions dhelp/text/nltk.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,20 +26,27 @@ class NLTKMixin:
"""

@classmethod
def setup(self):
def setup(cls):
"""Download NLTK packages and trainer corpora.
Launches the NLTK package download interface. Overridden by the CLTK
Launches the NLTK package download interface. Method is invoked by
child .setup() methods in NLTK classes. Method is overridden in CLTK
child classes to launch the automated CLTK downloader. Convenience
method if user has not already downloaded NLTK packages and trainer
sets.
Example:
>>> EnglishText.setup()
"""
for package, package_path_segments in settings.NLTK_PACKAGES[
'english'
]:
# start with common pkgs, a list of tuples each with...
# (1) pkg name (2) list of path segs where pkg data is stored locally
pkgs_and_path_segments = settings.NLTK_PACKAGES['all']
# join common list with language specific packages
for package_info in settings.NLTK_PACKAGES[cls.options['language']]:
pkgs_and_path_segments.append(package_info)
# loop through list of tuples, each with pkg name and path info
for package, package_path_segments in pkgs_and_path_segments:
# build the relative filepath to the data, specific to the os
package_path = os.sep.join(package_path_segments)
# will trigger error if no file, if file found, do nothing
try:
Expand Down Expand Up @@ -262,7 +269,7 @@ class EnglishText(NLTKMixin, BaseText):
>>> english_text.rm_lines().rm_nonchars().rm_spaces()
The quick brown fox jumped over the lazy dog
""" # noqa

def __init__(self, text, options={}):
options['language'] = 'english'
super().__init__(text=text, options=options)
options = {
'encoding': 'utf-8',
'language': 'english'
}
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
'author_email': 'dave.a.base@gmail.com',
'description': """DH Python tools for scraping web pages, pre-processing
data, and performing nlp analysis quickly.""",
'version': '0.0.4',
'version': '0.0.5',
'LICENSE': 'MIT',
'long_description': """Students often see great potential in Python for
historical analysis. But, before they see real payoff they often face too
Expand Down

0 comments on commit b35297c

Please sign in to comment.