diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml
new file mode 100644
index 000000000..7c133cd5f
--- /dev/null
+++ b/.github/workflows/pull_request.yml
@@ -0,0 +1,36 @@
+name: Test on pull request
+
+on:
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  build:
+    name: Run Python Tests
+    strategy:
+      matrix:
+        python-version: [3.11, 3.12]
+        os: [ubuntu-latest, windows-latest]
+    runs-on: ${{ matrix.os }}
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+      - name: Setup Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Fetch tags
+        run: |
+          git fetch --prune --unshallow --tags
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -e ".[test]"
+
+      - name: Test with pytest
+        run: |
+          make unittest
diff --git a/Makefile b/Makefile
new file mode 100644
index 000000000..562615f6d
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,2 @@
+unittest:
+	PYTHONPATH=`pwd` python3 -m pytest tests/test_dataset.py::TestDataSet::test_download_small -svv
diff --git a/tests/test_dataset.py b/tests/test_dataset.py
index 60a219e25..c7678c206 100644
--- a/tests/test_dataset.py
+++ b/tests/test_dataset.py
@@ -36,7 +36,7 @@ def test_iter_cohere(self):
         dur_iter = time.time() - before
         log.warning(f"iter through cohere_10m cost={dur_iter/60}min")
 
-    # pytest -sv tests/test_dataset.py::TestDataSet::test_iter_laion
+    # pytest -sv tests/test_dataset.py::TestDataSet::test_iter_laion
     def test_iter_laion(self):
         laion_100m = Dataset.LAION.manager(100_000_000)
         from vectordb_bench.backend.data_source import DatasetSource
@@ -50,17 +50,30 @@ def test_iter_laion(self):
         dur_iter = time.time() - before
         log.warning(f"iter through laion_100m cost={dur_iter/60}min")
 
-    # https://github.com/zilliztech/VectorDBBench/issues/285
-    # TODO: ok
-    def test_iter_openai(self):
-
-        openai_500k = Dataset.OPENAI.manager(500_000)
-        openai_500k.prepare(source=DatasetSource.AliyunOSS, check=False)
+    def test_download_small(self):
+        openai_50k = Dataset.OPENAI.manager(50_000)
+        files = [
+            "test.parquet",
+            "neighbors.parquet",
+            "neighbors_head_1p.parquet",
+            "neighbors_tail_1p.parquet",
+        ]
 
-        import time
-        before = time.time()
-        for i in openai_500k:
-            log.debug(i.head(1))
+        file_path = openai_50k.data_dir.joinpath("test.parquet")
+        import os
+
+        DatasetSource.S3.reader().read(
+            openai_50k.data.dir_name.lower(),
+            files=files,
+            local_ds_root=openai_50k.data_dir,
+            check_etag=False,
+        )
+
+        os.remove(file_path)
+        DatasetSource.AliyunOSS.reader().read(
+            openai_50k.data.dir_name.lower(),
+            files=files,
+            local_ds_root=openai_50k.data_dir,
+            check_etag=False,
+        )
 
-        dur_iter = time.time() - before
-        log.warning(f"iter through openai 500K cost={dur_iter/60}min, source=AliyunOSS")
diff --git a/vectordb_bench/backend/data_source.py b/vectordb_bench/backend/data_source.py
index 65926ff6b..28e3c3636 100644
--- a/vectordb_bench/backend/data_source.py
+++ b/vectordb_bench/backend/data_source.py
@@ -76,11 +76,11 @@ def read(self, dataset: str, files: list[str], local_ds_root: pathlib.Path, chec
         if not local_ds_root.exists():
             log.info(f"local dataset root path not exist, creating it: {local_ds_root}")
             local_ds_root.mkdir(parents=True)
-            downloads = [(pathlib.Path("benchmark", dataset, f), local_ds_root.joinpath(f)) for f in files]
+            downloads = [(pathlib.PurePosixPath("benchmark", dataset, f), local_ds_root.joinpath(f)) for f in files]
         else:
             for file in files:
-                remote_file = pathlib.Path("benchmark", dataset, file)
+                remote_file = pathlib.PurePosixPath("benchmark", dataset, file)
                 local_file = local_ds_root.joinpath(file)
 
                 # Don't check etags for Dataset from Aliyun OSS
@@ -93,8 +93,8 @@ def read(self, dataset: str, files: list[str], local_ds_root: pathlib.Path, chec
 
         log.info(f"Start to downloading files, total count: {len(downloads)}")
         for remote_file, local_file in tqdm(downloads):
-            log.debug(f"downloading file {remote_file} to {local_ds_root}")
-            self.bucket.get_object_to_file(remote_file.as_posix(), local_file.as_posix())
+            log.debug(f"downloading file {remote_file} to {local_file}")
+            self.bucket.get_object_to_file(remote_file.as_posix(), local_file.absolute())
 
         log.info(f"Succeed to download all files, downloaded file count = {len(downloads)}")
 
@@ -125,11 +125,11 @@ def read(self, dataset: str, files: list[str], local_ds_root: pathlib.Path, chec
         if not local_ds_root.exists():
             log.info(f"local dataset root path not exist, creating it: {local_ds_root}")
             local_ds_root.mkdir(parents=True)
-            downloads = [pathlib.Path(self.remote_root, dataset, f) for f in files]
+            downloads = [pathlib.PurePosixPath(self.remote_root, dataset, f) for f in files]
         else:
             for file in files:
-                remote_file = pathlib.Path(self.remote_root, dataset, file)
+                remote_file = pathlib.PurePosixPath(self.remote_root, dataset, file)
                 local_file = local_ds_root.joinpath(file)
 
                 if (not local_file.exists()) or (not self.validate_file(remote_file, local_file, check_etag)):
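
Note on the core fix above: on Windows, pathlib.Path resolves to WindowsPath and joins
segments with backslashes, which silently corrupts S3/OSS object keys (the matrix now
includes windows-latest, which is what exposes this). pathlib.PurePosixPath is a pure
path class that always joins with forward slashes, regardless of host OS. A minimal
illustrative sketch, not part of the diff; the "openai" key segment is just an example:

import pathlib

# pathlib.Path("benchmark", "openai", "test.parquet") becomes
# WindowsPath("benchmark\\openai\\test.parquet") on a Windows runner,
# i.e. an invalid remote object key. PurePosixPath never touches the
# filesystem and always joins with "/", so the key is portable.
key = pathlib.PurePosixPath("benchmark", "openai", "test.parquet")
print(str(key))        # benchmark/openai/test.parquet on every OS
print(key.as_posix())  # identical; as_posix() is a no-op for PurePosixPath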