concat to samples
undertherain committed Aug 20, 2021
1 parent 17a422e commit 7cb3790
Showing 1 changed file with 19 additions and 5 deletions.
vecto/corpus/preprocess.py
@@ -2,6 +2,7 @@
 # import numpy as np
 # from nltk.tokenize import sent_tokenize
 # import nltk
+from transformers import AutoTokenizer
 from vecto.corpus import Corpus


@@ -49,10 +50,10 @@ def sentence_iter(char_iter):
             # print(prev_token)
             if not is_abbreviation(prev_token[:-1]):
                 is_sentence_end = True
-        if c in other_delimiters:
+        if prev_char in other_delimiters and c != "\"":
             is_sentence_end = True
-        buffer[pos] = c
-        pos += 1
+        #buffer[pos] = c
+        #pos += 1
         if is_sentence_end:
             if pos > 0:
                 yield "".join(buffer[: pos]).strip()
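The new condition defers the boundary decision by one character: a sentence ends when the *previous* character was a delimiter and the current one is not a quote, so a closing quote right after a period stays attached to its sentence. A minimal, self-contained sketch of that rule (split_sentences and DELIMS are hypothetical names, and the extra delimiter-plus-quote branch is an inference; the vecto version additionally handles abbreviations and a preallocated buffer, both omitted here):

DELIMS = {".", "!", "?"}  # assumption: stand-in for other_delimiters

def split_sentences(text):
    # Decide each split one character *after* the delimiter, so a closing
    # quote immediately following '.' is kept inside the sentence it closes.
    buffer = []
    for c in text:
        ends = False
        if buffer and buffer[-1] in DELIMS and c != '"':
            ends = True
        elif len(buffer) >= 2 and buffer[-1] == '"' and buffer[-2] in DELIMS:
            # delimiter followed by closing quote: split before the next char
            ends = True
        if ends:
            sentence = "".join(buffer).strip()
            if sentence:
                yield sentence
            buffer = []
        buffer.append(c)
    tail = "".join(buffer).strip()
    if tail:
        yield tail

print(list(split_sentences('He said "Stop." Then he left.')))
# ['He said "Stop."', 'Then he left.']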
@@ -84,14 +85,27 @@ def main():
     # print(tokenized)
     path = "./tests/data/corpora/sentencise"
     path = "/mnt/storage/Data/NLP/corpora/wiki_clean.txt"
+    path = "/mnt/storage/Data/NLP/corpora/toronto_clean.txt"
+    path = "./quotes/13th_Reality-1.txt"
+    name_tokenizer = "roberta-base"
+    tokenizer = AutoTokenizer.from_pretrained(name_tokenizer)
     corpus = Corpus(path)
     corpus.load_dir_strucute()
     char_iter = corpus.get_character_iterator()
     sent_iter = sentence_iter(char_iter)
     cnt = 0
+    sample = []
+    max_length = 128
     for line in sent_iter:
-        print(line)
-        print()
+        tokens = tokenizer(line, return_attention_mask=False)["input_ids"]
+        if len(sample) + len(tokens) > max_length:
+            sample = sample[:max_length]
+            print(len(sample))
+            sample = []
+        sample += tokens
+        # print(tokenizer.convert_ids_to_tokens(tokens))
+        # print(line)
+        # print()
         if cnt > 100:
             break
         cnt += 1
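The new loop tokenizes each sentence and concatenates the ids into fixed-length samples, printing the length and discarding each sample once it fills up. A hedged sketch of the same packing logic as a reusable generator (samples_iter is a hypothetical name; tokenizer is any HuggingFace tokenizer; unlike the committed code, an empty sample is never emitted):

from transformers import AutoTokenizer

def samples_iter(sent_iter, tokenizer, max_length=128):
    # Pack tokenized sentences into samples of at most max_length ids;
    # a sentence that would overflow the current sample starts the next one.
    sample = []
    for line in sent_iter:
        tokens = tokenizer(line, return_attention_mask=False)["input_ids"]
        if sample and len(sample) + len(tokens) > max_length:
            yield sample[:max_length]
            sample = []
        sample += tokens
    if sample:
        yield sample[:max_length]

tokenizer = AutoTokenizer.from_pretrained("roberta-base")
sentences = ["First sentence.", "The second one is a little longer.", "Third."]
for sample in samples_iter(iter(sentences), tokenizer, max_length=16):
    print(len(sample), tokenizer.convert_ids_to_tokens(sample))

As in the commit, a single sentence longer than max_length is simply truncated by the [:max_length] slice. Note also that each tokenized sentence keeps its own special tokens (<s> and </s> for roberta-base), so samples carry sentence boundaries; whether to strip those is a design choice the commit leaves open.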
