Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

fix: Unable to download some files from AliyunOSS #287

Merged
merged 1 commit into from
Mar 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 17 additions & 67 deletions tests/test_data_source.py
Original file line number Diff line number Diff line change
@@ -1,78 +1,28 @@
import logging
import pathlib
import pytest
from vectordb_bench.backend.data_source import AliyunOSSReader, AwsS3Reader
from vectordb_bench.backend.dataset import Dataset, DatasetManager
from vectordb_bench.backend.data_source import DatasetSource
from vectordb_bench.backend.cases import type2case

log = logging.getLogger(__name__)
log = logging.getLogger("vectordb_bench")

class TestReader:
@pytest.mark.parametrize("size", [
100_000,
1_000_000,
10_000_000,
@pytest.mark.parametrize("type_case", [
(k, v) for k, v in type2case.items()
])
def test_cohere(self, size):
cohere = Dataset.COHERE.manager(size)
self.per_dataset_test(cohere)
def test_type_cases(self, type_case):
self.per_case_test(type_case)

@pytest.mark.parametrize("size", [
100_000,
1_000_000,
])
def test_gist(self, size):
gist = Dataset.GIST.manager(size)
self.per_dataset_test(gist)

@pytest.mark.parametrize("size", [
1_000_000,
])
def test_glove(self, size):
glove = Dataset.GLOVE.manager(size)
self.per_dataset_test(glove)

@pytest.mark.parametrize("size", [
500_000,
5_000_000,
# 50_000_000,
])
def test_sift(self, size):
sift = Dataset.SIFT.manager(size)
self.per_dataset_test(sift)

@pytest.mark.parametrize("size", [
50_000,
500_000,
5_000_000,
])
def test_openai(self, size):
openai = Dataset.OPENAI.manager(size)
self.per_dataset_test(openai)


def per_dataset_test(self, dataset: DatasetManager):
s3_reader = AwsS3Reader()
all_files = s3_reader.ls_all(dataset.data.dir_name)


remote_f_names = []
for file in all_files:
remote_f = pathlib.Path(file).name
if dataset.data.use_shuffled and remote_f.startswith("train"):
continue

elif (not dataset.data.use_shuffled) and remote_f.startswith("shuffle"):
continue

remote_f_names.append(remote_f)

def per_case_test(self, type_case):
t, ca_cls = type_case
ca = ca_cls()
log.info(f"test case: {t.name}, {ca.name}")

assert set(dataset.data.files) == set(remote_f_names)
filters = ca.filter_rate
ca.dataset.prepare(source=DatasetSource.AliyunOSS, check=False, filters=filters)
ali_trains = ca.dataset.train_files

aliyun_reader = AliyunOSSReader()
for fname in dataset.data.files:
p = pathlib.Path("benchmark", dataset.data.dir_name, fname)
assert aliyun_reader.bucket.object_exists(p.as_posix())
ca.dataset.prepare(check=False, filters=filters)
s3_trains = ca.dataset.train_files

log.info(f"downloading to {dataset.data_dir}")
aliyun_reader.read(dataset.data.dir_name.lower(), dataset.data.files, dataset.data_dir)
assert ali_trains == s3_trains
54 changes: 26 additions & 28 deletions tests/test_dataset.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from vectordb_bench.backend.dataset import Dataset, get_files
from vectordb_bench.backend.dataset import Dataset
import logging
import pytest
from pydantic import ValidationError
from vectordb_bench.backend.data_source import DatasetSource


log = logging.getLogger("vectordb_bench")
Expand Down Expand Up @@ -35,34 +36,31 @@ def test_iter_cohere(self):
dur_iter = time.time() - before
log.warning(f"iter through cohere_10m cost={dur_iter/60}min")

# pytest -sv tests/test_dataset.py::TestDataSet::test_iter_laion
def test_iter_laion(self):
laion_100m = Dataset.LAION.manager(100_000_000)
from vectordb_bench.backend.data_source import DatasetSource
laion_100m.prepare(source=DatasetSource.AliyunOSS, check=False)

class TestGetFiles:
@pytest.mark.parametrize("train_count", [
1,
10,
50,
100,
])
@pytest.mark.parametrize("with_gt", [True, False])
def test_train_count(self, train_count, with_gt):
files = get_files(train_count, True, with_gt)
log.info(files)
import time
before = time.time()
for i in laion_100m:
log.debug(i.head(1))

if with_gt:
assert len(files) - 4 == train_count
else:
assert len(files) - 1 == train_count
dur_iter = time.time() - before
log.warning(f"iter through laion_100m cost={dur_iter/60}min")

@pytest.mark.parametrize("use_shuffled", [True, False])
def test_use_shuffled(self, use_shuffled):
files = get_files(1, use_shuffled, True)
log.info(files)
# https://github.com/zilliztech/VectorDBBench/issues/285
# TODO: ok
def test_iter_openai(self):

openai_500k = Dataset.OPENAI.manager(500_000)
openai_500k.prepare(source=DatasetSource.AliyunOSS, check=False)

trains = [f for f in files if "train" in f]
if use_shuffled:
for t in trains:
assert "shuffle_train" in t
else:
for t in trains:
assert "shuffle" not in t
assert "train" in t
import time
before = time.time()
for i in openai_500k:
log.debug(i.head(1))

dur_iter = time.time() - before
log.warning(f"iter through openai 500K cost={dur_iter/60}min, source=AliyunOSS")
27 changes: 27 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,30 @@ def test_recall(self, got_expected):
log.info(f"recall: {res}, expected: {expected}")
assert res == expected


class TestGetFiles:
    """Tests for utils.compose_train_files: composing train-split file names."""

    @pytest.mark.parametrize("train_count", [
        1,
        10,
        50,
        100,
    ])
    def test_train_count(self, train_count):
        # Exactly one file name is expected per requested train file.
        names = utils.compose_train_files(train_count, True)
        log.info(names)

        assert len(names) == train_count

    @pytest.mark.parametrize("use_shuffled", [True, False])
    def test_use_shuffled(self, use_shuffled):
        # With a single train file, verify the shuffled/unshuffled naming scheme.
        names = utils.compose_train_files(1, use_shuffled)
        log.info(names)

        train_names = [name for name in names if "train" in name]
        if use_shuffled:
            assert all("shuffle_train" in name for name in train_names)
        else:
            assert all("shuffle" not in name for name in train_names)
            assert all("train" in name for name in train_names)
Loading