fix: Unable to download some files from AliyunOSS

Fixes: #267, #275, #285 Signed-off-by: yangxuan <xuan.yang@zilliz.com>
zilliztech · Mar 4, 2024 · 88b6989 · 88b6989
1 parent 9325a5a
commit 88b6989
Show file tree

Hide file tree

Showing 6 changed files with 150 additions and 164 deletions.
diff --git a/tests/test_data_source.py b/tests/test_data_source.py
@@ -1,78 +1,28 @@
 import logging
-import pathlib
 import pytest
-from vectordb_bench.backend.data_source import AliyunOSSReader, AwsS3Reader
-from vectordb_bench.backend.dataset import Dataset, DatasetManager
+from vectordb_bench.backend.data_source import DatasetSource
+from vectordb_bench.backend.cases import type2case
 
-log = logging.getLogger(__name__)
+log = logging.getLogger("vectordb_bench")
 
 class TestReader:
-    @pytest.mark.parametrize("size", [
-        100_000,
-        1_000_000,
-        10_000_000,
+    @pytest.mark.parametrize("type_case", [
+        (k, v) for k, v in type2case.items()
     ])
-    def test_cohere(self, size):
-        cohere = Dataset.COHERE.manager(size)
-        self.per_dataset_test(cohere)
+    def test_type_cases(self, type_case):
+        self.per_case_test(type_case)
 
-    @pytest.mark.parametrize("size", [
-        100_000,
-        1_000_000,
-    ])
-    def test_gist(self, size):
-        gist = Dataset.GIST.manager(size)
-        self.per_dataset_test(gist)
-
-    @pytest.mark.parametrize("size", [
-        1_000_000,
-    ])
-    def test_glove(self, size):
-        glove = Dataset.GLOVE.manager(size)
-        self.per_dataset_test(glove)
-
-    @pytest.mark.parametrize("size", [
-        500_000,
-        5_000_000,
-        #  50_000_000,
-    ])
-    def test_sift(self, size):
-        sift = Dataset.SIFT.manager(size)
-        self.per_dataset_test(sift)
-
-    @pytest.mark.parametrize("size", [
-        50_000,
-        500_000,
-        5_000_000,
-    ])
-    def test_openai(self, size):
-        openai = Dataset.OPENAI.manager(size)
-        self.per_dataset_test(openai)
-
-
-    def per_dataset_test(self, dataset: DatasetManager):
-        s3_reader = AwsS3Reader()
-        all_files = s3_reader.ls_all(dataset.data.dir_name)
-
-
-        remote_f_names = []
-        for file in all_files:
-            remote_f = pathlib.Path(file).name
-            if dataset.data.use_shuffled and remote_f.startswith("train"):
-                continue
-
-            elif (not dataset.data.use_shuffled) and remote_f.startswith("shuffle"):
-                continue
-
-            remote_f_names.append(remote_f)
 
+    def per_case_test(self, type_case):
+        t, ca_cls = type_case
+        ca = ca_cls()
+        log.info(f"test case: {t.name}, {ca.name}")
 
-        assert set(dataset.data.files) == set(remote_f_names)
+        filters = ca.filter_rate
+        ca.dataset.prepare(source=DatasetSource.AliyunOSS, check=False, filters=filters)
+        ali_trains = ca.dataset.train_files
 
-        aliyun_reader = AliyunOSSReader()
-        for fname in dataset.data.files:
-            p = pathlib.Path("benchmark", dataset.data.dir_name, fname)
-            assert aliyun_reader.bucket.object_exists(p.as_posix())
+        ca.dataset.prepare(check=False, filters=filters)
+        s3_trains = ca.dataset.train_files
 
-        log.info(f"downloading to {dataset.data_dir}")
-        aliyun_reader.read(dataset.data.dir_name.lower(), dataset.data.files, dataset.data_dir)
+        assert ali_trains == s3_trains
diff --git a/tests/test_dataset.py b/tests/test_dataset.py
@@ -1,7 +1,8 @@
-from vectordb_bench.backend.dataset import Dataset, get_files
+from vectordb_bench.backend.dataset import Dataset
 import logging
 import pytest
 from pydantic import ValidationError
+from vectordb_bench.backend.data_source import DatasetSource
 
 
 log = logging.getLogger("vectordb_bench")
@@ -35,34 +36,31 @@ def test_iter_cohere(self):
         dur_iter = time.time() - before
         log.warning(f"iter through cohere_10m cost={dur_iter/60}min")
 
+    # pytest -sv tests/test_dataset.py::TestDataSet::test_iter_laion 
+    def test_iter_laion(self):
+        laion_100m = Dataset.LAION.manager(100_000_000)
+        from vectordb_bench.backend.data_source import DatasetSource
+        laion_100m.prepare(source=DatasetSource.AliyunOSS, check=False)
 
-class TestGetFiles:
-    @pytest.mark.parametrize("train_count", [
-        1,
-        10,
-        50,
-        100,
-    ])
-    @pytest.mark.parametrize("with_gt", [True, False])
-    def test_train_count(self, train_count, with_gt):
-        files = get_files(train_count, True, with_gt)
-        log.info(files)
+        import time
+        before = time.time()
+        for i in laion_100m:
+            log.debug(i.head(1))
 
-        if with_gt:
-            assert len(files) - 4 == train_count
-        else:
-            assert len(files) - 1 == train_count
+        dur_iter = time.time() - before
+        log.warning(f"iter through laion_100m cost={dur_iter/60}min")
 
-    @pytest.mark.parametrize("use_shuffled", [True, False])
-    def test_use_shuffled(self, use_shuffled):
-        files = get_files(1, use_shuffled, True)
-        log.info(files)
+    # https://github.com/zilliztech/VectorDBBench/issues/285
+    # TODO: ok
+    def test_iter_openai(self):
+
+        openai_500k = Dataset.OPENAI.manager(500_000)
+        openai_500k.prepare(source=DatasetSource.AliyunOSS, check=False)
 
-        trains = [f for f in files if "train" in f]
-        if use_shuffled:
-            for t in trains:
-                assert "shuffle_train" in t
-        else:
-            for t in trains:
-                assert "shuffle" not in t
-                assert "train" in t
+        import time
+        before = time.time()
+        for i in openai_500k:
+            log.debug(i.head(1))
+
+        dur_iter = time.time() - before
+        log.warning(f"iter through openai 500K cost={dur_iter/60}min, source=AliyunOSS")
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -37,3 +37,30 @@ def test_recall(self, got_expected):
         log.info(f"recall: {res}, expected: {expected}")
         assert res == expected
 
+
+class TestGetFiles:
+    @pytest.mark.parametrize("train_count", [
+        1,
+        10,
+        50,
+        100,
+    ])
+    def test_train_count(self, train_count):
+        files = utils.compose_train_files(train_count, True)
+        log.info(files)
+
+        assert len(files) == train_count
+
+    @pytest.mark.parametrize("use_shuffled", [True, False])
+    def test_use_shuffled(self, use_shuffled):
+        files = utils.compose_train_files(1, use_shuffled)
+        log.info(files)
+
+        trains = [f for f in files if "train" in f]
+        if use_shuffled:
+            for t in trains:
+                assert "shuffle_train" in t
+        else:
+            for t in trains:
+                assert "shuffle" not in t
+                assert "train" in t