add lazy imports to dataset_utils.py #9052
name: Unittests

on:
  workflow_dispatch:
  pull_request:
    branches:
      - master
    # Do not trigger tests for documentation or markdown docs.
    paths-ignore:
      - 'docs/**'
      - '*.md'
  push:
    branches:
      - master
    # Do not trigger tests for documentation or markdown docs.
    paths-ignore:
      - 'docs/**'
      - '*.md'
  schedule:
    # Trigger tests every day at 02:00 UTC to refresh cache.
    - cron: '0 2 * * *'
# Cancel in-progress runs for the current workflow if not on the main branch
# (as it would mark the unittests as failed).
# The concurrency conditionals are based on the solution proposed in this link:
# https://github.community/t/concurrency-cancel-in-progress-but-not-when-ref-is-master/194707
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref != 'refs/heads/master' || github.run_number }}
  # Cancel only PR intermediate builds
  cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
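  # For illustration (values are hypothetical): on a pull request the group resolves to
  # something like "Unittests-refs/pull/<PR number>/merge-true", so a newer push cancels the
  # in-progress run, while on master it resolves to "Unittests-refs/heads/master-<run_number>",
  # which is unique per run and therefore never cancelled.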

env:
  PYTEST_NUM_SHARDS: 4  # Controls tests sharding enabled by `pytest-shard`

jobs:
  activate-tests:
    name: Check if tests should be run
    runs-on: ubuntu-latest
    steps:
      - name: Check
        id: check
        # For merged PRs, activate testing only on the master branch, based on:
        # https://github.community/t/trigger-workflow-only-on-pull-request-merge/17359
        run: |
          echo "status=${{ github.ref == 'refs/heads/master' || (
            github.event.action != 'closed'
            && github.event.pull_request.merged == false
          ) }}" >> $GITHUB_OUTPUT
    outputs:
      status: ${{ steps.check.outputs.status }}
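    # Downstream jobs consume this output via `needs.activate-tests.outputs.status` in their `if:` clauses.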

  shards-job:
    needs: activate-tests
    if: ${{ needs.activate-tests.outputs.status }}
    name: Generate shards
    runs-on: ubuntu-latest
    steps:
      - name: Create variables
        id: create-vars
        run: |
          echo "num-shards=$(jq -n -c '[${{ env.PYTEST_NUM_SHARDS }}]')" >> $GITHUB_OUTPUT
          echo "shard-ids=$(jq -n -c '[range(1;${{ env.PYTEST_NUM_SHARDS }}+1)]')" >> $GITHUB_OUTPUT
    outputs:
      num-shards: ${{ steps.create-vars.outputs.num-shards }}
      shard-ids: ${{ steps.create-vars.outputs.shard-ids }}

  pytest-job:
    needs: shards-job
    name: '[${{ matrix.os-version }}][${{ matrix.tf-version }}][Python ${{ matrix.python-version }}][${{ matrix.shard-id }}/${{ matrix.num-shards }}] Core TFDS tests'
    runs-on: ${{ matrix.os-version }}
    timeout-minutes: 30
    strategy:
      # Do not cancel in-progress jobs if any matrix job fails.
      fail-fast: false
      matrix:
        tf-version: ['tensorflow']
        # Can't reference env variables in matrix.
        num-shards: ${{ fromJson(needs.shards-job.outputs.num-shards) }}
        shard-id: ${{ fromJson(needs.shards-job.outputs.shard-ids) }}
        # TF supported versions: https://www.tensorflow.org/install/pip#software_requirements
        python-version: ['3.10', '3.11', '3.12']
        os-version: [ubuntu-latest]
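        # With the values above, the matrix expands to 12 jobs (3 Python versions x 4 shards).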
    steps:
      - uses: actions/checkout@v3
      - uses: ./.github/actions/setup
        with:
          tf-version: ${{ matrix.tf-version }}
          python-version: ${{ matrix.python-version }}
      # Run tests
      # Ignores:
      # * Nsynth is run in isolation due to a dependency conflict (crepe).
      # * Lsun tests are disabled because the tensorflow_io used in open-source
      #   is linked to static libraries compiled against a specific TF version, which
      #   makes the test fail with a linking error (libtensorflow_io_golang.so).
      # * imagenet2012_corrupted requires the imagemagick binary.
      # * import_without_tf_test.py, because the test relies on TensorFlow not being imported.
      # * github_api is run separately to not overuse the API quota.
      # * wmt is run separately to avoid worker hanging.
      # * Huggingface requires the `datasets` library.
      - name: Run core tests
        run: |
          pytest --durations=100 -vv -n auto --shard-id=$((${{ matrix.shard-id }} - 1)) --num-shards=${{ env.PYTEST_NUM_SHARDS }} \
            --ignore="tensorflow_datasets/datasets/nsynth/nsynth_dataset_builder_test.py" \
            --ignore="tensorflow_datasets/image/lsun_test.py" \
            --ignore="tensorflow_datasets/datasets/imagenet2012_corrupted/imagenet2012_corrupted_dataset_builder_test.py" \
            --ignore="tensorflow_datasets/scripts/documentation/build_api_docs_test.py" \
            --ignore="tensorflow_datasets/import_without_tf_test.py" \
            --ignore="tensorflow_datasets/core/github_api/github_path_test.py" \
            --ignore="tensorflow_datasets/translate/wmt19_test.py" \
            --ignore="tensorflow_datasets/core/dataset_builders/huggingface_dataset_builder_test.py" \
            --ignore="tensorflow_datasets/core/utils/huggingface_utils_test.py"
      # Run tests without any pytest plugins. The tests should be triggered for a single shard only.
      - name: Run leftover tests
        if: ${{ matrix.shard-id == 1 }}
        uses: nick-fields/retry@v2
        with:
          timeout_minutes: 1
          max_attempts: 2
          retry_on: timeout
          command: |
            pytest -vv -o faulthandler_timeout=10 tensorflow_datasets/translate/wmt19_test.py
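      # The retry wrapper above re-runs the wmt19 test on timeout because its worker can hang
      # (see the "wmt is run separately" note in the core-test ignores).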

  huggingface-pytest-job:
    needs: activate-tests
    if: ${{ needs.activate-tests.outputs.status }}
    # HuggingFace tests need to be run separately because they are skipped
    # unless the `datasets` library is installed.
    name: 'HuggingFace Python 3.10 tests'
    runs-on: ubuntu-latest
    timeout-minutes: 30
    steps:
      - uses: actions/checkout@v3
      - uses: ./.github/actions/setup
        with:
          tf-version: tensorflow
          python-version: '3.10'
          extras: huggingface
      - name: Run HuggingFace tests
        run: |
          pytest -vv -n auto \
            tensorflow_datasets/core/dataset_builders/huggingface_dataset_builder_test.py \
            tensorflow_datasets/core/utils/huggingface_utils_test.py

  githubapi-pytest-job:
    needs: activate-tests
    if: ${{ needs.activate-tests.outputs.status }}
    name: 'Github API tests'
    runs-on: ubuntu-latest
    timeout-minutes: 30
    steps:
      - uses: actions/checkout@v3
      - uses: ./.github/actions/setup
        with:
          tf-version: tensorflow
      - name: Run Github API tests
        run: pytest --durations=100 -vv -n auto tensorflow_datasets/core/github_api/github_path_test.py
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

  notebook-test-job:
    needs: activate-tests
    if: ${{ needs.activate-tests.outputs.status }}
    name: 'Notebook tests'
    runs-on: ubuntu-latest
    timeout-minutes: 30
    steps:
      - uses: actions/checkout@v3
      - uses: ./.github/actions/setup
        with:
          tf-version: tensorflow
          use-cache: false
      # Test each notebook sequentially.
      - name: Run notebook
        run: |
          ipython kernel install --user --name tfds-notebook
          for notebook in docs/*ipynb
          do
            # These notebooks time out because they rely on loading huge datasets.
            if [[ "$notebook" != "docs/determinism.ipynb" ]] && \
               [[ "$notebook" != "docs/dataset_collections.ipynb" ]]
            then
              jupyter nbconvert \
                --ExecutePreprocessor.timeout=600 \
                --ExecutePreprocessor.kernel_name=tfds-notebook \
                --to notebook \
                --execute $notebook && \
                pip install tensorflow  # reinstall tensorflow if it was uninstalled
            fi
          done