Heuristic: do not treat a one-letter token as the end of a sentence

vecto-ai · Aug 16, 2021 · 17a422e
1 parent 771b878
commit 17a422e
Show file tree

Hide file tree

Showing 2 changed files with 15 additions and 5 deletions.
diff --git a/vecto/corpus/iterators.py b/vecto/corpus/iterators.py
@@ -35,11 +35,14 @@ def __init__(self, dirname, verbose=0):
         self.dirname = dirname
 
     def _generate_samples(self):
-        for root, _, files in os.walk(self.dirname, followlinks=True):
-            for good_fname in sorted(fnmatch.filter(files, "*")):
-                full_file_path = os.path.join(root, good_fname)
-                logger.info("processing " + full_file_path)
-                yield full_file_path
+        if os.path.isfile(self.dirname):
+            yield self.dirname
+        else:
+            for root, _, files in os.walk(self.dirname, followlinks=True):
+                for good_fname in sorted(fnmatch.filter(files, "*")):
+                    full_file_path = os.path.join(root, good_fname)
+                    logger.info("processing " + full_file_path)
+                    yield full_file_path
 
 
 class FileLineIterator(BaseIterator):

diff --git a/vecto/corpus/preprocess.py b/vecto/corpus/preprocess.py
@@ -30,6 +30,8 @@ def simple_char_iter(text):
 def is_abbreviation(token):
     if "." in token:
         return True
+    if len(token) == 1:
+        return True
     if token.lower() in known_abbreviations:
         return True
     return False
@@ -81,13 +83,18 @@ def main():
     #     tokenized = sentencize(s)
     #     print(tokenized)
     path = "./tests/data/corpora/sentencise"
+    path = "/mnt/storage/Data/NLP/corpora/wiki_clean.txt"
     corpus = Corpus(path)
     corpus.load_dir_strucute()
     char_iter = corpus.get_character_iterator()
     sent_iter = sentence_iter(char_iter)
+    cnt = 0
     for line in sent_iter:
         print(line)
         print()
+        if cnt > 100:
+            break
+        cnt += 1
 
 
 if __name__ == "__main__":