enhance: Support for windows #290

Merged
36 changes: 36 additions & 0 deletions .github/workflows/pull_request.yml
@@ -0,0 +1,36 @@
+name: Test on pull request
+
+on:
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  build:
+    name: Run Python Tests
+    strategy:
+      matrix:
+        python-version: [3.11, 3.12]
+        os: [ubuntu-latest, windows-latest]
+    runs-on: ${{ matrix.os }}
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+      - name: Setup Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Fetch tags
+        run: |
+          git fetch --prune --unshallow --tags
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -e ".[test]"
+
+      - name: Test with pytest
+        run: |
+          make unittest
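
Note on the workflow above: the strategy.matrix block fans the single build job out into four independent runs, one per Python/OS pair, which is what adds the Windows coverage. A quick illustrative sketch of the expansion (not part of the PR):

# Sketch: how the job matrix above expands (2 Python versions x 2 runners).
from itertools import product

for python, runner in product(["3.11", "3.12"], ["ubuntu-latest", "windows-latest"]):
    print(f"Run Python Tests (python-version={python}, os={runner})")

The explicit "Fetch tags" step compensates for the shallow clone that actions/checkout@v4 performs by default, which omits tags; presumably the package derives its version from git tags during the editable install.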
2 changes: 2 additions & 0 deletions Makefile
@@ -0,0 +1,2 @@
+unittest:
+	PYTHONPATH=`pwd` python3 -m pytest tests/test_dataset.py::TestDataSet::test_download_small -svv
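
The unittest recipe depends on a POSIX shell for the backtick pwd substitution, so it assumes make runs under such a shell even on the Windows runners. For environments without one, a minimal cross-platform equivalent in Python (a hypothetical helper assumed to sit at the repository root, next to the Makefile; not part of the PR):

import os
import pathlib
import subprocess
import sys

# Resolve the repository root from this script's location (assumption:
# the helper lives at the repo root).
repo_root = pathlib.Path(__file__).resolve().parent

# Same invocation as the Makefile target, with PYTHONPATH set portably.
subprocess.run(
    [
        sys.executable, "-m", "pytest",
        "tests/test_dataset.py::TestDataSet::test_download_small", "-svv",
    ],
    cwd=repo_root,
    env={**os.environ, "PYTHONPATH": str(repo_root)},
    check=True,
)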
39 changes: 26 additions & 13 deletions tests/test_dataset.py
@@ -36,7 +36,7 @@ def test_iter_cohere(self):
         dur_iter = time.time() - before
         log.warning(f"iter through cohere_10m cost={dur_iter/60}min")

-    # pytest -sv tests/test_dataset.py::TestDataSet::test_iter_laion
+    # pytest -sv tests/test_dataset.py::TestDataSet::test_iter_laion
     def test_iter_laion(self):
         laion_100m = Dataset.LAION.manager(100_000_000)
         from vectordb_bench.backend.data_source import DatasetSource
@@ -50,17 +50,30 @@ def test_iter_laion(self):
         dur_iter = time.time() - before
         log.warning(f"iter through laion_100m cost={dur_iter/60}min")

-    # https://github.com/zilliztech/VectorDBBench/issues/285
-    # TODO: ok
-    def test_iter_openai(self):
-
-        openai_500k = Dataset.OPENAI.manager(500_000)
-        openai_500k.prepare(source=DatasetSource.AliyunOSS, check=False)
-
-        import time
-        before = time.time()
-        for i in openai_500k:
-            log.debug(i.head(1))
-
-        dur_iter = time.time() - before
-        log.warning(f"iter through openai 500K cost={dur_iter/60}min, source=AliyunOSS")
+    def test_download_small(self):
+        openai_50k = Dataset.OPENAI.manager(50_000)
+        files = [
+            "test.parquet",
+            "neighbors.parquet",
+            "neighbors_head_1p.parquet",
+            "neighbors_tail_1p.parquet",
+        ]
+
+        file_path = openai_50k.data_dir.joinpath("test.parquet")
+        import os
+
+        DatasetSource.S3.reader().read(
+            openai_50k.data.dir_name.lower(),
+            files=files,
+            local_ds_root=openai_50k.data_dir,
+            check_etag=False,
+        )
+
+        os.remove(file_path)
+        DatasetSource.AliyunOSS.reader().read(
+            openai_50k.data.dir_name.lower(),
+            files=files,
+            local_ds_root=openai_50k.data_dir,
+            check_etag=False,
+        )
+
12 changes: 6 additions & 6 deletions vectordb_bench/backend/data_source.py
@@ -76,11 +76,11 @@ def read(self, dataset: str, files: list[str], local_ds_root: pathlib.Path, chec
         if not local_ds_root.exists():
             log.info(f"local dataset root path not exist, creating it: {local_ds_root}")
             local_ds_root.mkdir(parents=True)
-            downloads = [(pathlib.Path("benchmark", dataset, f), local_ds_root.joinpath(f)) for f in files]
+            downloads = [(pathlib.PurePosixPath("benchmark", dataset, f), local_ds_root.joinpath(f)) for f in files]

         else:
             for file in files:
-                remote_file = pathlib.Path("benchmark", dataset, file)
+                remote_file = pathlib.PurePosixPath("benchmark", dataset, file)
                 local_file = local_ds_root.joinpath(file)

                 # Don't check etags for Dataset from Aliyun OSS
@@ -93,8 +93,8 @@ def read(self, dataset: str, files: list[str], local_ds_root: pathlib.Path, chec

         log.info(f"Start to downloading files, total count: {len(downloads)}")
         for remote_file, local_file in tqdm(downloads):
-            log.debug(f"downloading file {remote_file} to {local_ds_root}")
-            self.bucket.get_object_to_file(remote_file.as_posix(), local_file.as_posix())
+            log.debug(f"downloading file {remote_file} to {local_file}")
+            self.bucket.get_object_to_file(remote_file.as_posix(), local_file.absolute())

         log.info(f"Succeed to download all files, downloaded file count = {len(downloads)}")
@@ -125,11 +125,11 @@ def read(self, dataset: str, files: list[str], local_ds_root: pathlib.Path, chec
         if not local_ds_root.exists():
             log.info(f"local dataset root path not exist, creating it: {local_ds_root}")
             local_ds_root.mkdir(parents=True)
-            downloads = [pathlib.Path(self.remote_root, dataset, f) for f in files]
+            downloads = [pathlib.PurePosixPath(self.remote_root, dataset, f) for f in files]

         else:
             for file in files:
-                remote_file = pathlib.Path(self.remote_root, dataset, file)
+                remote_file = pathlib.PurePosixPath(self.remote_root, dataset, file)
                 local_file = local_ds_root.joinpath(file)

                 if (not local_file.exists()) or (not self.validate_file(remote_file, local_file, check_etag)):
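
The substance of the Windows fix is this switch from pathlib.Path to pathlib.PurePosixPath for remote object keys. On Windows, pathlib.Path instantiates WindowsPath, whose string form uses backslashes, so a key built with it can reach S3/OSS as benchmark\dataset\file and miss the object; PurePosixPath keeps forward slashes on every platform, while local destinations remain ordinary Path objects. A minimal sketch of the distinction (the dataset name is illustrative):

import pathlib

# Remote object keys must stay POSIX-style regardless of the host OS.
remote = pathlib.PurePosixPath("benchmark", "openai_small_50k", "test.parquet")
print(str(remote))        # benchmark/openai_small_50k/test.parquet, even on Windows
print(remote.as_posix())  # identical: PurePosixPath is POSIX on every platform

# By contrast, building the key with pathlib.Path on Windows yields a
# WindowsPath whose str() form uses backslashes -- wrong as an object key:
#   benchmark\openai_small_50k\test.parquet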