Merge d786954 into 11235f8

thePortus · May 24, 2018 · b35297c · b35297c
2 parents 11235f8 + d786954
commit b35297c
Show file tree

Hide file tree

Showing 6 changed files with 52 additions and 29 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -12,7 +12,7 @@ os: linux
 dist: trusty
 # set environment variables
 env:
-  - PACKAGE_VERSION=0.0.4
+  - PACKAGE_VERSION=0.0.5
 # install dependencies
 install:
   - pip install -r requirements/dev.txt

diff --git a/dhelp/settings.py b/dhelp/settings.py
@@ -11,10 +11,11 @@
 ]
 
 NLTK_PACKAGES = {
-    'english': [
+    'all': [
+        ('punkt', ['tokenizers', 'punkt.zip']),
         ('verbnet', ['corpora', 'verbnet.zip']),
         ('wordnet', ['corpora', 'wordnet.zip']),
-        ('words', ['corpora', 'words.zip']),
+        ('large_grammars', ['grammars', 'large_grammars.zip']),
         ('large_grammars', ['grammars', 'large_grammars.zip']),
         (
             'averaged_perceptron_tagger',
@@ -24,12 +25,24 @@
             'maxent_treebank_pos_tagger',
             ['taggers', 'maxent_treebank_pos_tagger.zip']
         ),
-        ('universal_tagset', ['taggers', 'universal_tagset.zip']),
-        ('punkt', ['tokenizers', 'punkt.zip']),
         ('maxent_ne_chunker', ['chunkers', 'maxent_ne_chunker.zip']),
+        ('universal_tagset', ['taggers', 'universal_tagset.zip']),
+    ],
+    'english': [
+        ('words', ['corpora', 'words.zip']),
+        ('sample_grammars', ['grammars', 'sample_grammars.zip']),
+        ('book_grammars', ['grammars', 'book_grammars.zip']),
+        ('perluniprops', ['misc', 'perluniprops.zip'])
+    ],
+    'spanish': [
+        ('spanish_grammars', ['grammars', 'spanish_grammars.zip'])
+    ],
+    'basque': [
+        ('basque_grammars', ['grammars', 'basque_grammars.zip'])
     ]
 }
 
+# TODO: Change CLTK setup so it expects path segments like NLTK settings
 CLTK_PACKAGES = {
     'greek': [
         ('greek_software_tlgu', 'software/greek_software_tlgu'),

diff --git a/dhelp/text/_bases.py b/dhelp/text/_bases.py
@@ -21,15 +21,18 @@ class BaseText(UserString):
         >>> print(text)
         'Lorem ipsum dolor sit amet...'
     """ # noqa
+    options = {
+        'encoding': 'utf-8',
+        'language': 'english'
+    }
 
-    def __init__(self, text, options={}):
+    def __init__(self, text, *args, **kwargs):
         super().__init__(str)
-        if 'encoding' not in options:
-            options['encoding'] = 'utf-8'
-        if 'language' not in options:
-            options['language'] = 'english'
+        # update .options if options keyword arg passed
+        if 'options' in kwargs:
+            if type(kwargs['options']) == dict:
+                self.options.update(kwargs['options'])
         self.data = text
-        self.options = options
 
     def __enter__(self):
         pass

diff --git a/dhelp/text/cltk.py b/dhelp/text/cltk.py
@@ -256,10 +256,10 @@ class LatinText(CLTKMixin, BaseText):
         >>> print(text.lemmatize())
         gallia edo1 omne divido in pars tres
     """
-
-    def __init__(self, text, options={}):
-        options['language'] = 'latin'
-        super().__init__(text=text, options=options)
+    options = {
+        'encoding': 'utf-8',
+        'language': 'latin'
+    }
 
     def macronize(self, mode='tag_ngram_123_backoff'):
         """Adds macrons (long vowel marks).
@@ -366,10 +366,10 @@ class AncientGreekText(CLTKMixin, BaseText):
         >>> print(text.lemmatize())
         εἰμί δὲ σύμπας οὗτος τὰ σύγγραμμα ἐκεῖνος μάλιστα οὐ ὠφέλιμος , ὅστις ὡς πρὸς οἶδα συγγράφω.
     """ # noqa
-
-    def __init__(self, text, options={}):
-        options['language'] = 'greek'
-        super().__init__(text=text, options=options)
+    options = {
+        'encoding': 'utf-8',
+        'language': 'greek'
+    }
 
     def normalize(self):
         """Fixes problems with differences in greek accent encoding.

diff --git a/dhelp/text/nltk.py b/dhelp/text/nltk.py
@@ -26,20 +26,27 @@ class NLTKMixin:
     """
 
     @classmethod
-    def setup(self):
+    def setup(cls):
         """Download NLTK packages and trainer corpora.
 
-        Launches the NLTK package download interface. Overridden by the CLTK
+        Launches the NLTK package download interface. Method is invoked by
+        child .setup() methods in NLTK classes. Method is overridden in CLTK
         child classes to launch the automated CLTK downloader. Convenience
         method if user has not already downloaded NLTK packages and trainer
         sets.
 
         Example:
             >>> EnglishText.setup()
         """
-        for package, package_path_segments in settings.NLTK_PACKAGES[
-            'english'
-        ]:
+        # start with common pkgs, a list of tuples each with...
+        # (1) pkg name (2) list of path segs where pkg data is stored locally
+        pkgs_and_path_segments = settings.NLTK_PACKAGES['all']
+        # join common list with language specific packages
+        for package_info in settings.NLTK_PACKAGES[cls.options['language']]:
+            pkgs_and_path_segments.append(package_info)
+        # loop through list of tuples, each with pkg name and path info
+        for package, package_path_segments in pkgs_and_path_segments:
+            # build the relative filepath to the data, specific to the os
             package_path = os.sep.join(package_path_segments)
             # will trigger error if no file, if file found, do nothing
             try:
@@ -262,7 +269,7 @@ class EnglishText(NLTKMixin, BaseText):
         >>> english_text.rm_lines().rm_nonchars().rm_spaces()
         The quick brown fox jumped over the lazy dog
     """ # noqa
-
-    def __init__(self, text, options={}):
-        options['language'] = 'english'
-        super().__init__(text=text, options=options)
+    options = {
+        'encoding': 'utf-8',
+        'language': 'english'
+    }
diff --git a/setup.py b/setup.py
@@ -11,7 +11,7 @@
     'author_email': 'dave.a.base@gmail.com',
     'description': """DH Python tools for scraping web pages, pre-processing
     data, and performing nlp analysis quickly.""",
-    'version': '0.0.4',
+    'version': '0.0.5',
     'LICENSE': 'MIT',
     'long_description': """Students often see great potential in Python for
     historical analysis. But, before they see real payoff they often face too