Skip to content

Commit

Permalink
fix: Unable to download some files from AliyunOSS
Browse files Browse the repository at this point in the history
Fixes: #267, #275, #285

Signed-off-by: yangxuan <xuan.yang@zilliz.com>
  • Loading branch information
XuanYang-cn committed Mar 4, 2024
1 parent 9325a5a commit 88b6989
Show file tree
Hide file tree
Showing 6 changed files with 150 additions and 164 deletions.
84 changes: 17 additions & 67 deletions tests/test_data_source.py
Original file line number Diff line number Diff line change
@@ -1,78 +1,28 @@
import logging
import pathlib
import pytest
from vectordb_bench.backend.data_source import AliyunOSSReader, AwsS3Reader
from vectordb_bench.backend.dataset import Dataset, DatasetManager
from vectordb_bench.backend.data_source import DatasetSource
from vectordb_bench.backend.cases import type2case

log = logging.getLogger(__name__)
log = logging.getLogger("vectordb_bench")

class TestReader:
@pytest.mark.parametrize("size", [
100_000,
1_000_000,
10_000_000,
@pytest.mark.parametrize("type_case", [
(k, v) for k, v in type2case.items()
])
def test_cohere(self, size):
cohere = Dataset.COHERE.manager(size)
self.per_dataset_test(cohere)
def test_type_cases(self, type_case):
self.per_case_test(type_case)

@pytest.mark.parametrize("size", [
100_000,
1_000_000,
])
def test_gist(self, size):
gist = Dataset.GIST.manager(size)
self.per_dataset_test(gist)

@pytest.mark.parametrize("size", [
1_000_000,
])
def test_glove(self, size):
glove = Dataset.GLOVE.manager(size)
self.per_dataset_test(glove)

@pytest.mark.parametrize("size", [
500_000,
5_000_000,
# 50_000_000,
])
def test_sift(self, size):
sift = Dataset.SIFT.manager(size)
self.per_dataset_test(sift)

@pytest.mark.parametrize("size", [
50_000,
500_000,
5_000_000,
])
def test_openai(self, size):
openai = Dataset.OPENAI.manager(size)
self.per_dataset_test(openai)


def per_dataset_test(self, dataset: DatasetManager):
s3_reader = AwsS3Reader()
all_files = s3_reader.ls_all(dataset.data.dir_name)


remote_f_names = []
for file in all_files:
remote_f = pathlib.Path(file).name
if dataset.data.use_shuffled and remote_f.startswith("train"):
continue

elif (not dataset.data.use_shuffled) and remote_f.startswith("shuffle"):
continue

remote_f_names.append(remote_f)

def per_case_test(self, type_case):
t, ca_cls = type_case
ca = ca_cls()
log.info(f"test case: {t.name}, {ca.name}")

assert set(dataset.data.files) == set(remote_f_names)
filters = ca.filter_rate
ca.dataset.prepare(source=DatasetSource.AliyunOSS, check=False, filters=filters)
ali_trains = ca.dataset.train_files

aliyun_reader = AliyunOSSReader()
for fname in dataset.data.files:
p = pathlib.Path("benchmark", dataset.data.dir_name, fname)
assert aliyun_reader.bucket.object_exists(p.as_posix())
ca.dataset.prepare(check=False, filters=filters)
s3_trains = ca.dataset.train_files

log.info(f"downloading to {dataset.data_dir}")
aliyun_reader.read(dataset.data.dir_name.lower(), dataset.data.files, dataset.data_dir)
assert ali_trains == s3_trains
54 changes: 26 additions & 28 deletions tests/test_dataset.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from vectordb_bench.backend.dataset import Dataset, get_files
from vectordb_bench.backend.dataset import Dataset
import logging
import pytest
from pydantic import ValidationError
from vectordb_bench.backend.data_source import DatasetSource


log = logging.getLogger("vectordb_bench")
Expand Down Expand Up @@ -35,34 +36,31 @@ def test_iter_cohere(self):
dur_iter = time.time() - before
log.warning(f"iter through cohere_10m cost={dur_iter/60}min")

# pytest -sv tests/test_dataset.py::TestDataSet::test_iter_laion
def test_iter_laion(self):
laion_100m = Dataset.LAION.manager(100_000_000)
from vectordb_bench.backend.data_source import DatasetSource
laion_100m.prepare(source=DatasetSource.AliyunOSS, check=False)

class TestGetFiles:
@pytest.mark.parametrize("train_count", [
1,
10,
50,
100,
])
@pytest.mark.parametrize("with_gt", [True, False])
def test_train_count(self, train_count, with_gt):
files = get_files(train_count, True, with_gt)
log.info(files)
import time
before = time.time()
for i in laion_100m:
log.debug(i.head(1))

if with_gt:
assert len(files) - 4 == train_count
else:
assert len(files) - 1 == train_count
dur_iter = time.time() - before
log.warning(f"iter through laion_100m cost={dur_iter/60}min")

@pytest.mark.parametrize("use_shuffled", [True, False])
def test_use_shuffled(self, use_shuffled):
files = get_files(1, use_shuffled, True)
log.info(files)
# https://github.com/zilliztech/VectorDBBench/issues/285
# TODO: ok
def test_iter_openai(self):

openai_500k = Dataset.OPENAI.manager(500_000)
openai_500k.prepare(source=DatasetSource.AliyunOSS, check=False)

trains = [f for f in files if "train" in f]
if use_shuffled:
for t in trains:
assert "shuffle_train" in t
else:
for t in trains:
assert "shuffle" not in t
assert "train" in t
import time
before = time.time()
for i in openai_500k:
log.debug(i.head(1))

dur_iter = time.time() - before
log.warning(f"iter through openai 500K cost={dur_iter/60}min, source=AliyunOSS")
27 changes: 27 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,30 @@ def test_recall(self, got_expected):
log.info(f"recall: {res}, expected: {expected}")
assert res == expected


class TestGetFiles:
@pytest.mark.parametrize("train_count", [
1,
10,
50,
100,
])
def test_train_count(self, train_count):
files = utils.compose_train_files(train_count, True)
log.info(files)

assert len(files) == train_count

@pytest.mark.parametrize("use_shuffled", [True, False])
def test_use_shuffled(self, use_shuffled):
files = utils.compose_train_files(1, use_shuffled)
log.info(files)

trains = [f for f in files if "train" in f]
if use_shuffled:
for t in trains:
assert "shuffle_train" in t
else:
for t in trains:
assert "shuffle" not in t
assert "train" in t

0 comments on commit 88b6989

Please sign in to comment.