Skip to content

Commit c40a519

Browse files
committed
Update tokenizing_test.py
1 parent d2a2dbb commit c40a519

File tree

1 file changed

+7
-2
lines changed

1 file changed

+7
-2
lines changed

tokenizing_test.py

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -24,12 +24,17 @@ def main(data_path: str):
24 | 24 |   | print(df.info())
25 | 25 |   | print("Converting data to list")
26 | 26 |   | text = df[0].apply(lambda x: x.strip()).tolist()
   | 27 | + |
   | 28 | + | for i, t in enumerate(text):
   | 29 | + | if i % 100_000 == 0:
   | 30 | + | print(f"At {i}, {len(text) - i} to go.")
   | 31 | + | tokenizer(t)
27 | 32 |   |
28 | 33 |   | print(sys.getsizeof(text)/8/1_000_000)
29 | 34 |   | print("encoding whole dataset")
30 |    | - | encoded = tokenizer(text=text, max_length=512, padding=True, truncation=True)
   | 35 | + | #encoded = tokenizer(text=text, max_length=512, padding=True, truncation=True)
31 | 36 |   | print("encoding done")
32 |    | - | print(f"{sys.getsizeof(encoded)}")
   | 37 | + | #print(f"{sys.getsizeof(encoded)}")
33 | 38 |   |
34 | 39 |   |
35 | 40 |   | def preprocess_function(examples):

0 commit comments

Comments
 (0)