
Commit b724441

Update tokenizing_test.py
1 parent 2766bd6 commit b724441

1 file changed: +31 -31 lines changed

tokenizing_test.py

Lines changed: 31 additions & 31 deletions
@@ -42,37 +42,37 @@ def preprocess_function(examples):
 
 if __name__ == "__main__":
     #data_path = "data/prot_total/prot_total.txt"
-    #path = "data/prot_minimal/prot_minimal.txt"
-    #main(data_path)
-
-
-
-    paths = ["/mnt/home/mheinzinger/deepppi1tb/ProSST5/martins_set/data_mixed/train.csv",
-             "/mnt/home/mheinzinger/deepppi1tb/ProSST5/martins_set/data_mixed/test.csv",
-             "/mnt/home/mheinzinger/deepppi1tb/ProSST5/martins_set/data_mixed/val.csv"]
-
-    print("saved paths")
-
-
-
-
-
-    print("load datasets as raw")
-    raw_datasets = datasets.load_dataset("csv", data_files=paths)
-
-    print("process datasets")
-    print(type(raw_datasets))
-    print(len(raw_datasets))
-    processed_datasets = raw_datasets.map(
-        preprocess_function,
-        batched=True,
-        num_proc=1,
-        remove_columns=[],
-        load_from_cache_file=True,
-        desc="Running tokenizer on dataset",
-    )
-
-    print(raw_datasets)
+    path = "data/prot_minimal/prot_minimal.txt"
+    main(data_path)
+
+
+
+    # paths = ["/mnt/home/mheinzinger/deepppi1tb/ProSST5/martins_set/data_mixed/train.csv",
+    #          "/mnt/home/mheinzinger/deepppi1tb/ProSST5/martins_set/data_mixed/test.csv",
+    #          "/mnt/home/mheinzinger/deepppi1tb/ProSST5/martins_set/data_mixed/val.csv"]
+    #
+    # print("saved paths")
+    #
+    #
+    #
+    #
+    #
+    # print("load datasets as raw")
+    # raw_datasets = datasets.load_dataset("csv", data_files=paths)
+    #
+    # print("process datasets")
+    # print(type(raw_datasets))
+    # print(len(raw_datasets))
+    # processed_datasets = raw_datasets.map(
+    #     preprocess_function,
+    #     batched=True,
+    #     num_proc=1,
+    #     remove_columns=[],
+    #     load_from_cache_file=True,
+    #     desc="Running tokenizer on dataset",
+    # )
+    #
+    # print(raw_datasets)
 
 
 
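As committed, the newly active lines assign path but then call main(data_path), a name that stays commented out two lines above, so running the file as-is would raise a NameError; presumably main(path) was intended. For reference, below is a self-contained sketch of the Hugging Face datasets pipeline this commit comments out. It is an illustration, not the author's exact code: the tokenizer checkpoint (Rostlab/prot_bert) and the CSV column name ("sequence") are assumptions standing in for the preprocess_function defined earlier in tokenizing_test.py.

import datasets
from transformers import AutoTokenizer

# Placeholder checkpoint; the real script's tokenizer may differ.
tokenizer = AutoTokenizer.from_pretrained("Rostlab/prot_bert")

def preprocess_function(examples):
    # Hypothetical stand-in for the file's real preprocess_function.
    # ProtBert-style tokenizers expect residues separated by spaces,
    # and the "sequence" column name is assumed for illustration.
    spaced = [" ".join(seq) for seq in examples["sequence"]]
    return tokenizer(spaced, truncation=True)

paths = ["/mnt/home/mheinzinger/deepppi1tb/ProSST5/martins_set/data_mixed/train.csv",
         "/mnt/home/mheinzinger/deepppi1tb/ProSST5/martins_set/data_mixed/test.csv",
         "/mnt/home/mheinzinger/deepppi1tb/ProSST5/martins_set/data_mixed/val.csv"]

# Passing a list of files merges everything into a single "train" split;
# a dict like {"train": ..., "test": ..., "validation": ...} would keep
# the three splits separate.
raw_datasets = datasets.load_dataset("csv", data_files=paths)

processed_datasets = raw_datasets.map(
    preprocess_function,
    batched=True,               # hand preprocess_function batches of rows
    num_proc=1,                 # single process; raise for parallel tokenization
    remove_columns=[],          # keep the original CSV columns alongside token ids
    load_from_cache_file=True,  # reuse the Arrow cache on reruns
    desc="Running tokenizer on dataset",
)
print(processed_datasets)

Printing processed_datasets (rather than raw_datasets, as the original block did) makes the columns added by the tokenizer, such as input_ids, visible in the output.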
