Skip to content

Commit

Permalink
setup for nltk/cltk fixed
Browse files: browse the repository at this point in the history
  • Loading branch information
thePortus committed May 23, 2018
1 parent 4c163bc commit d786954
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 24 deletions.
9 changes: 4 additions & 5 deletions dhelp/text/_bases.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,13 @@ class BaseText(UserString):
>>> print(text)
'Lorem ipsum dolor sit amet...'
""" # noqa
options = {}
options = {
'encoding': 'utf-8',
'language': 'english'
}

def __init__(self, text, *args, **kwargs):
super().__init__(str)
self.options = {
'encoding': 'utf-8',
'language': 'english'
}
# update .options if options keyword arg passed
if 'options' in kwargs:
if type(kwargs['options']) == dict:
Expand Down
16 changes: 8 additions & 8 deletions dhelp/text/cltk.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,10 +256,10 @@ class LatinText(CLTKMixin, BaseText):
>>> print(text.lemmatize())
gallia edo1 omne divido in pars tres
"""

def __init__(self, text, options={}):
options['language'] = 'latin'
super().__init__(text=text, options=options)
options = {
'encoding': 'utf-8',
'language': 'latin'
}

def macronize(self, mode='tag_ngram_123_backoff'):
"""Adds macrons (long vowel marks).
Expand Down Expand Up @@ -366,10 +366,10 @@ class AncientGreekText(CLTKMixin, BaseText):
>>> print(text.lemmatize())
εἰμί δὲ σύμπας οὗτος τὰ σύγγραμμα ἐκεῖνος μάλιστα οὐ ὠφέλιμος , ὅστις ὡς πρὸς οἶδα συγγράφω.
""" # noqa

def __init__(self, text, options={}):
options['language'] = 'greek'
super().__init__(text=text, options=options)
options = {
'encoding': 'utf-8',
'language': 'greek'
}

def normalize(self):
"""Fixes problems with differences in greek accent encoding.
Expand Down
17 changes: 6 additions & 11 deletions dhelp/text/nltk.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ class NLTKMixin:
"""

@classmethod
def setup(self, lang_pkgs_info):
def setup(cls):
"""Download NLTK packages and trainer corpora.
Launches the NLTK package download interface. Method is invoked by
Expand All @@ -42,7 +42,7 @@ def setup(self, lang_pkgs_info):
# (1) pkg name (2) list of path segs where pkg data is stored locally
pkgs_and_path_segments = settings.NLTK_PACKAGES['all']
# join common list with language specific packages
for package_info in lang_pkgs_info:
for package_info in settings.NLTK_PACKAGES[cls.options['language']]:
pkgs_and_path_segments.append(package_info)
# loop through list of tuples, each with pkg name and path info
for package, package_path_segments in pkgs_and_path_segments:
Expand Down Expand Up @@ -269,12 +269,7 @@ class EnglishText(NLTKMixin, BaseText):
>>> english_text.rm_lines().rm_nonchars().rm_spaces()
The quick brown fox jumped over the lazy dog
""" # noqa

def __init__(self, text, options={}):
options['language'] = 'english'
super().__init__(text=text, options=options)

@classmethod
def setup(self):
# invoke parent setup method, sending it the pkg info for specific lang
super(self.__class__).setup(settings.NLTK_PACKAGES['english'])
options = {
'encoding': 'utf-8',
'language': 'english'
}

0 comments on commit d786954

Please sign in to comment.