add lazy imports to dataset_utils.py #9052
name: Unittests

on:
  workflow_dispatch:
  pull_request:
    branches:
      - master
    # Do not trigger tests for documentation or markdown docs.
    paths-ignore:
      - 'docs/**'
      - '*.md'
  push:
    branches:
      - master
    # Do not trigger tests for documentation or markdown docs.
    paths-ignore:
      - 'docs/**'
      - '*.md'
  schedule:
    # Trigger tests every day at 02:00 UTC to refresh cache.
    - cron: '0 2 * * *'
# Cancel in-progress runs for the current workflow if not on the main branch
# (as it would mark the unittests as failed).
# The concurrency conditionals are based on the solution proposed in this link:
# https://github.community/t/concurrency-cancel-in-progress-but-not-when-ref-is-master/194707
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref != 'refs/heads/master' || github.run_number }}
  # Cancel only PR intermediate builds
  cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
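  # For illustration (values are hypothetical): on a pull request the group resolves to
  # something like "Unittests-refs/pull/<PR number>/merge-true", so a newer push cancels the
  # in-progress run, while on master it resolves to "Unittests-refs/heads/master-<run_number>",
  # which is unique per run and therefore never cancelled.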

env:
  PYTEST_NUM_SHARDS: 4  # Controls tests sharding enabled by `pytest-shard`

jobs:
  activate-tests:
    name: Check if tests should be run
    runs-on: ubuntu-latest
    steps:
      - name: Check
        id: check
        # For merged PRs, activate testing only on the master branch, based on:
        # https://github.community/t/trigger-workflow-only-on-pull-request-merge/17359
        run: |
          echo "status=${{ github.ref == 'refs/heads/master' || (
            github.event.action != 'closed'
            && github.event.pull_request.merged == false
          ) }}" >> $GITHUB_OUTPUT
    outputs:
      status: ${{ steps.check.outputs.status }}
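    # Downstream jobs consume this output via `needs.activate-tests.outputs.status` in their `if:` clauses.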

  shards-job:
    needs: activate-tests
    if: ${{ needs.activate-tests.outputs.status }}
    name: Generate shards
    runs-on: ubuntu-latest
    steps:
      - name: Create variables
        id: create-vars
        run: |
          echo "num-shards=$(jq -n -c '[${{ env.PYTEST_NUM_SHARDS }}]')" >> $GITHUB_OUTPUT
          echo "shard-ids=$(jq -n -c '[range(1;${{ env.PYTEST_NUM_SHARDS }}+1)]')" >> $GITHUB_OUTPUT
    outputs:
      num-shards: ${{ steps.create-vars.outputs.num-shards }}
      shard-ids: ${{ steps.create-vars.outputs.shard-ids }}

  pytest-job:
    needs: shards-job
    name: '[${{ matrix.os-version }}][${{ matrix.tf-version }}][Python ${{ matrix.python-version }}][${{ matrix.shard-id }}/${{ matrix.num-shards }}] Core TFDS tests'
    runs-on: ${{ matrix.os-version }}
    timeout-minutes: 30
    strategy:
      # Do not cancel in-progress jobs if any matrix job fails.
      fail-fast: false
      matrix:
        tf-version: ['tensorflow']
        # Can't reference env variables in matrix.
        num-shards: ${{ fromJson(needs.shards-job.outputs.num-shards) }}
        shard-id: ${{ fromJson(needs.shards-job.outputs.shard-ids) }}
        # TF supported versions: https://www.tensorflow.org/install/pip#software_requirements
        python-version: ['3.10', '3.11', '3.12']
        os-version: [ubuntu-latest]
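        # With the values above, the matrix expands to 12 jobs (3 Python versions x 4 shards).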
    steps:
      - uses: actions/checkout@v3
      - uses: ./.github/actions/setup
        with:
          tf-version: ${{ matrix.tf-version }}
          python-version: ${{ matrix.python-version }}
      # Run tests
      # Ignores:
      # * Nsynth is run in isolation due to a dependency conflict (crepe).
      # * Lsun tests are disabled because the tensorflow_io used in open-source
      #   is linked to static libraries compiled against a specific TF version, which
      #   makes the test fail with a linking error (libtensorflow_io_golang.so).
      # * imagenet2012_corrupted requires the imagemagick binary.
      # * import_without_tf_test.py, because the test relies on TensorFlow not being imported.
      # * github_api is run separately to not overuse the API quota.
      # * wmt is run separately to avoid worker hanging.
      # * Huggingface requires the `datasets` library.
      - name: Run core tests
        run: |
          pytest --durations=100 -vv -n auto --shard-id=$((${{ matrix.shard-id }} - 1)) --num-shards=${{ env.PYTEST_NUM_SHARDS }} \
            --ignore="tensorflow_datasets/datasets/nsynth/nsynth_dataset_builder_test.py" \
            --ignore="tensorflow_datasets/image/lsun_test.py" \
            --ignore="tensorflow_datasets/datasets/imagenet2012_corrupted/imagenet2012_corrupted_dataset_builder_test.py" \
            --ignore="tensorflow_datasets/scripts/documentation/build_api_docs_test.py" \
            --ignore="tensorflow_datasets/import_without_tf_test.py" \
            --ignore="tensorflow_datasets/core/github_api/github_path_test.py" \
            --ignore="tensorflow_datasets/translate/wmt19_test.py" \
            --ignore="tensorflow_datasets/core/dataset_builders/huggingface_dataset_builder_test.py" \
            --ignore="tensorflow_datasets/core/utils/huggingface_utils_test.py"
      # Run tests without any pytest plugins. The tests should be triggered for a single shard only.
      - name: Run leftover tests
        if: ${{ matrix.shard-id == 1 }}
        uses: nick-fields/retry@v2
        with:
          timeout_minutes: 1
          max_attempts: 2
          retry_on: timeout
          command: |
            pytest -vv -o faulthandler_timeout=10 tensorflow_datasets/translate/wmt19_test.py
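      # The retry wrapper above re-runs the wmt19 test on timeout because its worker can hang
      # (see the "wmt is run separately" note in the core-test ignores).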

  huggingface-pytest-job:
    needs: activate-tests
    if: ${{ needs.activate-tests.outputs.status }}
    # HuggingFace tests need to be run separately because they are skipped
    # unless the `datasets` library is installed.
    name: 'HuggingFace Python 3.10 tests'
    runs-on: ubuntu-latest
    timeout-minutes: 30
    steps:
      - uses: actions/checkout@v3
      - uses: ./.github/actions/setup
        with:
          tf-version: tensorflow
          python-version: '3.10'
          extras: huggingface
      - name: Run HuggingFace tests
        run: |
          pytest -vv -n auto \
            tensorflow_datasets/core/dataset_builders/huggingface_dataset_builder_test.py \
            tensorflow_datasets/core/utils/huggingface_utils_test.py

  githubapi-pytest-job:
    needs: activate-tests
    if: ${{ needs.activate-tests.outputs.status }}
    name: 'Github API tests'
    runs-on: ubuntu-latest
    timeout-minutes: 30
    steps:
      - uses: actions/checkout@v3
      - uses: ./.github/actions/setup
        with:
          tf-version: tensorflow
      - name: Run Github API tests
        run: pytest --durations=100 -vv -n auto tensorflow_datasets/core/github_api/github_path_test.py
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

  notebook-test-job:
    needs: activate-tests
    if: ${{ needs.activate-tests.outputs.status }}
    name: 'Notebook tests'
    runs-on: ubuntu-latest
    timeout-minutes: 30
    steps:
      - uses: actions/checkout@v3
      - uses: ./.github/actions/setup
        with:
          tf-version: tensorflow
          use-cache: false
      # Test each notebook sequentially.
      - name: Run notebook
        run: |
          ipython kernel install --user --name tfds-notebook
          for notebook in docs/*ipynb
          do
            # These notebooks time out because they rely on loading huge datasets.
            if [[ "$notebook" != "docs/determinism.ipynb" ]] && \
               [[ "$notebook" != "docs/dataset_collections.ipynb" ]]
            then
              jupyter nbconvert \
                --ExecutePreprocessor.timeout=600 \
                --ExecutePreprocessor.kernel_name=tfds-notebook \
                --to notebook \
                --execute $notebook && \
                pip install tensorflow  # reinstall tensorflow if it was uninstalled
            fi
          done