Skip to content

Commit

Permalink
not sentence end after one letter token heuristic
Browse files Browse the repository at this point in the history
  • Loading branch information
undertherain committed Aug 16, 2021
1 parent 771b878 commit 17a422e
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 5 deletions.
13 changes: 8 additions & 5 deletions vecto/corpus/iterators.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,14 @@ def __init__(self, dirname, verbose=0):
self.dirname = dirname

def _generate_samples(self):
    """Yield the path of every file that makes up the corpus.

    If ``self.dirname`` is a regular file, that single path is yielded.
    Otherwise the tree below ``self.dirname`` is walked (symbolic links
    are followed) and each file path is yielded once, with the files of
    every visited directory in sorted name order.

    Yields:
        str: path of one file.
    """
    if os.path.isfile(self.dirname):
        # The corpus location may be one plain file rather than a directory.
        yield self.dirname
        return
    for root, _, files in os.walk(self.dirname, followlinks=True):
        # The "*" pattern keeps every file name; sorting makes the order
        # within a directory deterministic.
        for good_fname in sorted(fnmatch.filter(files, "*")):
            full_file_path = os.path.join(root, good_fname)
            logger.info("processing %s", full_file_path)
            yield full_file_path


class FileLineIterator(BaseIterator):
Expand Down
7 changes: 7 additions & 0 deletions vecto/corpus/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ def simple_char_iter(text):
def is_abbreviation(token):
    """Return True if *token* should be treated as an abbreviation.

    A token counts as an abbreviation when any of these holds:

    * it contains a period;
    * it is exactly one character long;
    * its lowercase form is a member of ``known_abbreviations``.

    Args:
        token: the string to classify.

    Returns:
        bool: True when one of the rules above matches, False otherwise.
    """
    if "." in token:
        return True
    if len(token) == 1:
        # Heuristic: a one-character token (e.g. an initial) followed by a
        # period should not be taken as the end of a sentence.
        return True
    return token.lower() in known_abbreviations
Expand Down Expand Up @@ -81,13 +83,18 @@ def main():
# tokenized = sentencize(s)
# print(tokenized)
path = "./tests/data/corpora/sentencise"
path = "/mnt/storage/Data/NLP/corpora/wiki_clean.txt"
corpus = Corpus(path)
corpus.load_dir_strucute()
char_iter = corpus.get_character_iterator()
sent_iter = sentence_iter(char_iter)
cnt = 0
for line in sent_iter:
print(line)
print()
if cnt > 100:
break
cnt += 1


if __name__ == "__main__":
Expand Down

0 comments on commit 17a422e

Please sign in to comment.